efba13a6edf43272be3a6d774a3b34760647dcf0
[htsworkflow.git] / experiments / models.py
1 from __future__ import absolute_import, print_function, unicode_literals
2
3 import datetime
4 import glob
5 import logging
6 import os
7 import re
8 import uuid
9
10 from django.conf import settings
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import pre_save
15
16 from samples.models import Library, HTSUser
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
19
20 import pytz
21
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM. settings.DEFAULT_PM overrides it when the
# setting exists and converts cleanly to an int; a missing or malformed
# setting is logged and the built-in default is kept instead of aborting
# the import of this module.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (AttributeError, TypeError, ValueError):
    LOGGER.error("invalid value for default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (AttributeError, TypeError, ValueError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s", RESCAN_DELAY)
36
# (code, label) pairs used as the choices of SequencingRun.run_status.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)

# Lookup from the human readable label back to its numeric code.
RUN_STATUS_REVERSE_MAP = {}
for _status_code, _status_label in RUN_STATUS_CHOICES:
    RUN_STATUS_REVERSE_MAP[_status_label] = _status_code
del _status_code, _status_label
49
50
class ClusterStation(models.Model):
    """A cluster station, one of which may be flagged as the default."""
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __str__(self):
        return str(self.name)

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler: keep at most one station marked as default.

        When the instance being saved is flagged as default, every other
        station currently flagged as default is unflagged and saved.
        """
        if not instance.isdefault:
            return

        for station in ClusterStation.objects.filter(isdefault=True):
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()
71
def cluster_station_default():
    """Pick the cluster station to use as a default.

    Returns the first station flagged as default; failing that, the
    station with the highest id; or None when no stations exist.
    """
    candidates = (
        ClusterStation.objects.filter(isdefault=True),
        ClusterStation.objects.order_by('-id'),
    )
    for stations in candidates:
        # querysets are lazy, so the fallback is only evaluated when needed
        if len(stations) > 0:
            return stations[0]
    return None


pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
82
class Sequencer(models.Model):
    """A sequencing instrument, past or present."""
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __str__(self):
        """Return the name, followed by the instrument name in parentheses."""
        parts = [str(self.name)]
        if self.instrument_name is not None:
            parts.append("(%s)" % (str(self.instrument_name),))
        return " ".join(parts)

    @models.permalink
    def get_absolute_url(self):
        return ('experiments.views.sequencer', [self.id])

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler: keep at most one sequencer marked as default.

        When the instance being saved is flagged as default, every other
        sequencer currently flagged as default is unflagged and saved.
        """
        if not instance.isdefault:
            return

        for sequencer in Sequencer.objects.filter(isdefault=True):
            if sequencer.id == instance.id:
                continue
            sequencer.isdefault = False
            sequencer.save()
117
pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)


def sequencer_default():
    """Pick the sequencer to use as a default.

    Returns the first sequencer flagged as default. If none is flagged,
    falls back to the most recently added sequencer, preferring active
    instruments over inactive ones. Returns None when no sequencers exist.
    """
    flagged = Sequencer.objects.filter(isdefault=True)
    if len(flagged) > 0:
        return flagged[0]

    # '-active' sorts True before False, so active instruments come first;
    # an ascending sort on 'active' would hand back a retired sequencer.
    fallback = Sequencer.objects.order_by('-active', '-id')
    if len(fallback) > 0:
        return fallback[0]
    return None
128
129
class FlowCell(models.Model):
    """A flowcell: its id, run date, read configuration and the cluster
    station and sequencer it was processed on.
    """
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation,
                                        default=cluster_station_default)
    sequencer = models.ForeignKey(Sequencer, default=sequencer_default)

    notes = models.TextField(blank=True)

    def __str__(self):
        return str(self.flowcell_id)

    def Lanes(self):
        """Render this flowcell's lanes as an HTML table.

        One row per lane, ordered by lane number: the lane number, a link
        to the library's admin page, and the cluster estimate in thousands
        (or 'None' when no estimate is recorded).
        """
        # imported locally so this method does not depend on a new
        # module-level import
        from django.utils.html import escape

        row_template = ('<tr><td>%d</td>'
                        '<td><a href="%s">%s</a></td><td>%s</td></tr>')
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            if lane.cluster_estimate is None:
                cluster_estimate = 'None'
            else:
                # floor division keeps the output an integer count of
                # thousands on both Python 2 and Python 3
                cluster_estimate = "%s k" % (
                    int(lane.cluster_estimate) // 1000,)
            library = lane.library
            html.append(row_template % (lane.lane_number,
                                        library.get_admin_url(),
                                        # the markup is emitted unescaped
                                        # (allow_tags), so escape the name
                                        escape(library),
                                        cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return "Paired"
        return "Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_sequencing_runs(self):
        """Walk the raw data directory and import every run found.

        Any directory containing a file matching 'run*.xml' is treated as
        a run directory and handed to import_sequencing_run.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s", result_root)
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_sequencing_run(relative_pathname, filename)

    def import_sequencing_run(self, relative_pathname, run_xml_name,
                              force=False):
        """Create or refresh the SequencingRun for a result directory.

        :param relative_pathname: run directory, relative to
            settings.RESULT_HOME_DIR
        :param run_xml_name: name of the run xml file in that directory
        :param force: re-import even if the run was updated recently
        :raises RuntimeError: if more than one SequencingRun already
            points at relative_pathname

        An existing run is only re-imported when force is set or its last
        update is more than RESCAN_DELAY days old.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = SequencingRun.objects.filter(result_dir=relative_pathname)
        if len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))

        created = len(runs) == 0
        run = SequencingRun() if created else runs[0]

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s", relative_pathname)
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            image_analysis = run_xml_data.image_analysis

            run.flowcell = self
            # the model field is run_status; assigning to 'status' only
            # set a throwaway attribute that was never stored
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = image_analysis.start
            run.cycle_stop = image_analysis.stop
            # the run xml only provides a date; use midnight of that day
            # in the configured time zone
            naive_run_start_time = datetime.datetime.fromordinal(
                image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(
                naive_run_start_time)
            run.image_software = image_analysis.software
            run.image_version = image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
263
264
265 # FIXME: should we automatically update sequencing run?
266 #        Or should we expect someone to call update_sequencing_runs?
267 #def update_flowcell_sequencingruns(sender, instance, *args, **kwargs):
268 #    """Update our sequencing runs
269 #    """
270 #    if not os.path.exists(settings.RESULT_HOME_DIR):
271 #       return
272 #
273 #    instance.update_sequencing_runs()
274 #post_init.connect(update_flowcell_sequencingruns, sender=FlowCell)
275
# (code, label) pairs used as the choices of Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
    (100, 'Not run'),
]

# Lookup from status code to label; a missing status (None) reads "Unknown".
LANE_STATUS_MAP = {}
for _lane_code, _lane_label in LANE_STATUS_CODES:
    LANE_STATUS_MAP[int(_lane_code)] = _lane_label
del _lane_code, _lane_label
LANE_STATUS_MAP[None] = "Unknown"
282
283
def is_valid_lane(value):
    """Return True when value lies in the inclusive range 1 to 8."""
    return 1 <= value <= 8
289
290
class Lane(models.Model):
    """One lane of a flowcell and the library loaded into it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    pM = models.DecimalField(max_digits=5,
                             decimal_places=2,
                             blank=False,
                             null=False,
                             default=default_pM)
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(choices=LANE_STATUS_CODES,
                                 null=True,
                                 blank=True)
    comment = models.TextField(null=True, blank=True)

    @models.permalink
    def get_absolute_url(self):
        return ('experiments.views.flowcell_lane_detail', [str(self.id)])

    def __str__(self):
        """Return '<flowcell id>:<lane number>'."""
        return ':'.join((self.flowcell.flowcell_id, str(self.lane_number)))
313
314
class SequencingRun(models.Model):
    """One imported analysis run of a flowcell and the software versions
    recorded for it.
    """
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Register a DataFile for every recognised file under result_dir.

        Files already attached to this run are skipped, as are files whose
        name matches no FileType. When the matching FileType yields a lane
        number the DataFile is linked to that lane's library; the lane
        lookup raises if the flowcell has no unique lane with that number.

        last_update_time is refreshed on the instance but not saved here.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                # datafile_set is already restricted to this run
                if self.datafile_set.filter(
                        relative_pathname=relative_pathname).exists():
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is None:
                    continue

                newfile = DataFile()
                newfile.sequencing_run = self
                newfile.file_type = metadata['file_type']
                newfile.relative_pathname = relative_pathname

                lane_number = metadata.get('lane', None)
                if lane_number is not None:
                    lane = self.flowcell.lane_set.get(
                        lane_number=lane_number)
                    newfile.library = lane.library

                # Save directly: the foreign key is already set, and
                # RelatedManager.add() rejects unsaved objects on newer
                # Django releases.
                newfile.save()

        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dict mapping lane number to a dict of
        {normalized file type name: DataFile}. Files whose name carries
        no lane information are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is None:
                continue
            lane = metadata.get('lane', None)
            if lane is None:
                continue
            lane_file_set = lanes.setdefault(lane, {})
            lane_file_set[datafile.file_type.normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files of this run for one lane.

        :param lane: lane number; compared against the integer lane parsed
            from each file name, so callers should pass an int
        :returns: dict mapping file type name to a
            (relative pathname, DataFile) tuple

        Files whose name yields no lane number are included for any lane.
        NOTE(review): the original implementation iterated a
        get_result_files() method that is not defined on this class and
        never returned its result; the tuple layout is kept from it.
        Confirm against callers.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name not in ivc_name:
                continue
            attributes = datafile.attributes
            file_lane = None
            if attributes is not None:
                file_lane = attributes.get('lane', None)
            if file_lane is not None and file_lane != lane:
                continue
            plots[type_name] = (datafile.relative_pathname, datafile)
        return plots
385
386
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of pathname is matched.

        Returns None if not (or if this type has no regex), or dictionary
        of match variables if we do. The 'lane' and 'end' groups, when
        present and matched, are converted to int.
        """
        # regex is nullable, so guard against None as well as ''
        if not self.regex:
            return None

        filename = os.path.basename(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __str__(self):
        return self.name
427
428
def str_uuid():
    """Return a freshly generated version 1 UUID as a string.

    Used as the default for DataFile.random_key.
    """
    new_key = uuid.uuid1()
    return str(new_key)
432
433
class DataFile(models.Model):
    """Map a random key to a result file of a sequencing run."""
    random_key = models.CharField(max_length=64,
                                  db_index=True,
                                  default=str_uuid)
    sequencing_run = models.ForeignKey(SequencingRun, db_index=True, null=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(max_length=255, db_index=True)

    @property
    def attributes(self):
        """Values the file type's pattern extracts from the pathname."""
        file_type = self.file_type
        return file_type.parse_filename(self.relative_pathname)

    @property
    def pathname(self):
        """Location of the file under the results home directory."""
        return get_absolute_pathname(self.relative_pathname)

    @models.permalink
    def get_absolute_url(self):
        url_kwargs = {'key': self.random_key}
        return ('experiments.views.read_result_file', (), url_kwargs)
456
457
def find_file_type_metadata_from_filename(pathname):
    """Find the first FileType whose pattern matches a file name.

    Only the final path component of pathname is tested. Returns the
    match dictionary from FileType.parse_filename with the matching type
    stored under 'file_type', or None when no type matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
468
469
def get_relative_pathname(abspath, result_home_dir=None):
    """Strip off the result home directory from a path

    :param abspath: path to convert
    :param result_home_dir: directory to strip; defaults to
        settings.RESULT_HOME_DIR
    :returns: abspath without the leading home directory, or abspath
        unchanged when it does not start with that directory

    Only a leading occurrence is removed, so a path that happens to
    contain the home directory again further down is left intact.
    """
    if result_home_dir is None:
        result_home_dir = settings.RESULT_HOME_DIR
    # joining with '' guarantees exactly one trailing separator
    prefix = os.path.join(result_home_dir, '')
    if abspath.startswith(prefix):
        return abspath[len(prefix):]
    return abspath
476
477
def get_absolute_pathname(relative_pathname):
    """Join a relative path onto settings.RESULT_HOME_DIR."""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)