b7fc5b87beced4598f3bafd2cfa6e0cfb4daa42a
[htsworkflow.git] / experiments / models.py
1 from __future__ import absolute_import, print_function, unicode_literals
2
3 import datetime
4 import glob
5 import logging
6 import os
7 import re
8 import uuid
9
10 from django.conf import settings
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import pre_save
15
16 from samples.models import Library, HTSUser
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
19
20 import pytz
21
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM; may be overridden by settings.DEFAULT_PM.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (ValueError, TypeError, AttributeError):
    # AttributeError: the setting is absent.
    # ValueError/TypeError: the setting cannot be converted to an int.
    LOGGER.error("Missing or invalid settings.DEFAULT_PM, "
                 "defaulting to %s", default_pM)

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, TypeError, AttributeError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s", RESCAN_DELAY)
36
# (status code, label) pairs; used as the choices of SequencingRun.run_status.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# Inverse lookup: label -> status code, e.g. RUN_STATUS_REVERSE_MAP['DONE'].
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
49
50
class ClusterStation(models.Model):
    """A named cluster station, optionally flagged as the default one."""
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __str__(self):
        return str(self.name)

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler that clears the default flag on other stations.

        Nothing happens unless ``instance`` is itself flagged as default;
        in that case every other station currently flagged is unflagged
        and saved.
        """
        if not instance.isdefault:
            return

        flagged = ClusterStation.objects.filter(isdefault=True)
        for station in flagged.all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()
71
def cluster_station_default():
    """Pick the cluster station to use as a default.

    Prefers a station flagged ``isdefault``; otherwise falls back to the
    station with the highest id.  Returns None when no stations exist.
    """
    # Querysets are lazy, so building both up front costs no extra query.
    candidate_querysets = (
        ClusterStation.objects.filter(isdefault=True),
        ClusterStation.objects.order_by('-id'),
    )
    for queryset in candidate_querysets:
        stations = queryset.all()
        if len(stations) > 0:
            return stations[0]
    return None
80
# Call ClusterStation.update_isdefault on the pre_save signal sent by
# ClusterStation, so saving a default station clears the flag on the others.
pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
82
class Sequencer(models.Model):
    """A sequencing instrument record, with active and default flags."""
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __str__(self):
        # "<name>" followed by "(<instrument_name>)" when one is set.
        pieces = [str(self.name)]
        instrument = self.instrument_name
        if instrument is not None:
            pieces.append("(%s)" % (str(instrument),))
        return " ".join(pieces)

    def get_absolute_url(self):
        """Return the URL of the 'sequencer' view for this record."""
        url_kwargs = {'sequencer_id': self.id}
        return urlresolvers.reverse('sequencer', kwargs=url_kwargs)

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler that clears the default flag on other sequencers.

        Nothing happens unless ``instance`` is itself flagged as default;
        in that case every other sequencer currently flagged is unflagged
        and saved.
        """
        if not instance.isdefault:
            return

        flagged = Sequencer.objects.filter(isdefault=True)
        for sequencer in flagged.all():
            if sequencer.id == instance.id:
                continue
            sequencer.isdefault = False
            sequencer.save()
116
# Call Sequencer.update_isdefault on the pre_save signal sent by Sequencer,
# so saving a default sequencer clears the flag on the others.
pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
118
def sequencer_default():
    """Pick the sequencer to use as a default.

    Prefers a sequencer flagged ``isdefault``.  Otherwise falls back to
    active sequencers first and, within each group, the highest id.
    Returns None when no sequencers exist.
    """
    d = Sequencer.objects.filter(isdefault=True).all()
    if len(d) > 0:
        return d[0]
    # Descending on the boolean puts active (True) rows ahead of inactive
    # ones, matching the '-active' ordering declared in Sequencer.Meta.
    # Ascending order would hand back an inactive sequencer first.
    d = Sequencer.objects.order_by('-active', '-id').all()
    if len(d) > 0:
        return d[0]
    return None
127
128
class FlowCell(models.Model):
    """A flowcell with its run parameters and associated hardware.

    Lanes are reached through ``lane_set``; result directories found on
    disk are recorded as SequencingRun rows by import_sequencing_run().
    """
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation,
                                        default=cluster_station_default)
    sequencer = models.ForeignKey(Sequencer, default=sequencer_default)

    notes = models.TextField(blank=True)

    def __str__(self):
        return str(self.flowcell_id)

    def Lanes(self):
        """Return an HTML table with one row per lane.

        Each row shows the lane number, a link to the library's admin
        page and the cluster estimate in thousands ('None' when unset).
        """
        element = '<tr><td>%d</td>'\
                  '<td><a href="%s">%s</a></td><td>%s</td></tr>'
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # Floor division keeps whole thousands; plain "/" would
                # produce a float under Python 3.
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            html.append(element % (lane.lane_number,
                                   library.get_admin_url(),
                                   library,
                                   cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return "Paired"
        else:
            return "Single"

    def get_absolute_url(self):
        """Return the 'flowcell_detail' URL for the parsed flowcell id."""
        flowcell_id, _status = parse_flowcell_id(self.flowcell_id)
        return urlresolvers.reverse('flowcell_detail', args=[str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, _status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_sequencing_runs(self):
        """Walk the raw data directory and import every run found.

        A directory counts as a run directory when it contains a file
        matching ``run*.xml``; each such file is handed to
        import_sequencing_run().
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s", result_root)
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_sequencing_run(relative_pathname, filename)

    def import_sequencing_run(self, relative_pathname, run_xml_name, force=False):
        """Create or refresh the SequencingRun for a result directory.

        :param relative_pathname: result directory, relative to
            settings.RESULT_HOME_DIR
        :param run_xml_name: name of the run xml file in that directory
        :param force: re-import even if the run was updated recently
        :raises RuntimeError: if more than one SequencingRun already
            exists for ``relative_pathname``

        An existing run is re-imported only when ``force`` is set or its
        last update is more than RESCAN_DELAY days old.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = SequencingRun.objects.filter(result_dir=relative_pathname)
        if len(runs) == 0:
            run = SequencingRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        # The age check must stay last: a freshly created run has no
        # last_update_time to subtract from.
        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s", relative_pathname)
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # The model field is named run_status; assigning to
            # "status" only set a throwaway attribute that was never saved.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # Rebuild the analysis date as a naive midnight datetime, then
            # attach the site's configured time zone.
            naive_run_start_time = datetime.datetime.fromordinal(
                run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(
                naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
258
259
260 # FIXME: should we automatically update sequencing run?
261 #        Or should we expect someone to call update_sequencing_runs?
262 #def update_flowcell_sequencingruns(sender, instance, *args, **kwargs):
263 #    """Update our sequencing rungs
264 #    """
265 #    if not os.path.exists(settings.RESULT_HOME_DIR):
266 #       return
267 #
268 #    instance.update_sequencing_runs()
269 #post_init.connect(update_flowcell_sequencingruns, sender=FlowCell)
270
# (status code, label) pairs; used as the choices of Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
    (100, 'Not run'),
]
# code -> label lookup, with None standing in for a lane that has no status.
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
LANE_STATUS_MAP[None] = "Unknown"
277
278
def is_valid_lane(value):
    """Return True when value lies in the inclusive range 1..8."""
    return bool(value >= 1 and value <= 8)
284
285
class Lane(models.Model):
    """One numbered lane of a flowcell and the library assigned to it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    # Falls back to the module-level default_pM when not given.
    pM = models.DecimalField(
        max_digits=5,
        decimal_places=2,
        blank=False,
        null=False,
        default=default_pM,
    )
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(
        choices=LANE_STATUS_CODES,
        null=True,
        blank=True,
    )
    comment = models.TextField(null=True, blank=True)

    def get_absolute_url(self):
        """Return the 'flowcell_lane_detail' URL for this lane."""
        url_kwargs = {'lane_pk': str(self.id)}
        return urlresolvers.reverse('flowcell_lane_detail', kwargs=url_kwargs)

    def __str__(self):
        # "<flowcell id>:<lane number>"
        flowcell_id = self.flowcell.flowcell_id
        return flowcell_id + ':' + str(self.lane_number)
307
308
class SequencingRun(models.Model):
    """One imported result directory belonging to a flowcell.

    Records the software versions read from the run xml file and owns
    the DataFile rows discovered under ``result_dir``.
    """
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Record a DataFile for every recognised file under result_dir.

        Files that already have a DataFile for this run are skipped, as
        are files whose name matches no FileType.  When the matched
        FileType yields a lane number, the DataFile is linked to that
        lane's library.

        Sets ``last_update_time`` on the instance but does not save it.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                already_known = self.datafile_set.filter(
                    sequencing_run=self,
                    relative_pathname=relative_pathname).exists()
                if already_known:
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is None:
                    continue

                metadata['filename'] = filename
                newfile = DataFile()
                newfile.sequencing_run = self
                newfile.file_type = metadata['file_type']
                newfile.relative_pathname = relative_pathname

                lane_number = metadata.get('lane', None)
                if lane_number is not None:
                    lane = self.flowcell.lane_set.get(
                        lane_number=lane_number)
                    newfile.library = lane.library

                newfile.save()
                self.datafile_set.add(newfile)

        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dict mapping lane number to a dict of
        ``{normalized file type name: DataFile}``.  Files whose name
        yields no lane attribute are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files of this run keyed by file type name.

        The result maps each IVC file type name to a
        ``(rel_filename, metadata)`` tuple.  ``lane`` is accepted but not
        used for filtering.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        # NOTE(review): get_result_files() is not defined on this class in
        # this module; confirm where it is expected to come from.
        for rel_filename, metadata in self.get_result_files():
            if metadata.file_type.name in ivc_name:
                plots[metadata.file_type.name] = (rel_filename, metadata)
        # Hand the collected plots back; without this return the method
        # always yielded None.
        return plots
380
381
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of ``pathname`` is matched.

        Returns None if not, or dictionary of match variables if we do.
        The 'lane' and 'end' groups, when present, are converted to int.
        """
        filename = os.path.basename(pathname)
        # regex is nullable, so treat both None and '' as "no pattern";
        # len(None) would raise TypeError.
        if not self.regex:
            return None

        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __str__(self):
        return self.name
422
423
def str_uuid():
    """Return a new uuid1 value as a string (default for DataFile.random_key)."""
    fresh_uuid = uuid.uuid1()
    return str(fresh_uuid)
427
428
class DataFile(models.Model):
    """Store map from random ID to filename"""
    random_key = models.CharField(max_length=64,
                                  db_index=True,
                                  default=str_uuid)
    sequencing_run = models.ForeignKey(SequencingRun, db_index=True, null=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(max_length=255, db_index=True)

    def _get_attributes(self):
        """Return the attributes file_type parses from the pathname."""
        return self.file_type.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        """Return the absolute path for relative_pathname."""
        return get_absolute_pathname(self.relative_pathname)
    pathname = property(_get_pathname)

    def get_absolute_url(self):
        """Return the 'read_result_file' URL keyed by random_key."""
        # reverse()'s second positional parameter is urlconf, not args, so
        # the URL parameters must be passed by keyword.
        return urlresolvers.reverse('read_result_file',
                                    kwargs={'key': self.random_key})
449
450
def find_file_type_metadata_from_filename(pathname):
    """Return the parsed attributes of the first FileType matching pathname.

    Only the final path component is tested.  The returned dictionary is
    the FileType's parse result with the FileType itself added under
    'file_type'.  Returns None when no FileType matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
461
462
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR (with its trailing
    separator) is removed.  A path that does not start with it is
    returned unchanged.
    """
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # Slice the prefix off instead of using str.replace(), which would also
    # delete any later occurrence of the same text inside the path.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
469
470
def get_absolute_pathname(relative_pathname):
    """Join relative_pathname onto settings.RESULT_HOME_DIR."""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)