508abc71d770d3dfd53d9d0350cd09262e8b3e13
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
1 import datetime
2 import glob
3 import logging
4 import os
5 import re
6 import types
7 import uuid
8
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import post_init, pre_save
15
16 from htsworkflow.frontend.samples.models import Library
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
19
20 import pytz
21
LOGGER = logging.getLogger(__name__)

# Fallback used as the default of Lane.pM when settings.DEFAULT_PM is
# missing or cannot be converted to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (AttributeError, TypeError, ValueError):
    # A missing or malformed setting must not abort the module import;
    # keep the built-in default, mirroring how RESCAN_DELAY is handled.
    LOGGER.error("invalid value for frontend.default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, AttributeError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s" % (RESCAN_DELAY,))
36
# (status code, label) pairs; used as the choices of DataRun.run_status
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# reverse lookup: label -> status code
RUN_STATUS_REVERSE_MAP = dict(
    (label, code) for code, label in RUN_STATUS_CHOICES)
49
50
class ClusterStation(models.Model):
    """List of cluster stations

    One station can be flagged ``isdefault``; the pre_save handler
    below clears the flag on the others when a new default is saved.
    """
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the preferred cluster station, or None if there is none.

        A station flagged ``isdefault`` wins; otherwise the station with
        the highest id is returned.
        """
        # Querysets are lazy, so nothing is fetched until len() is called.
        candidates = (
            cls.objects.filter(isdefault=True).all(),
            cls.objects.order_by('-id').all(),
        )
        for stations in candidates:
            if len(stations) > 0:
                return stations[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unset
        the flag on every other station currently marked default.
        """
        if not instance.isdefault:
            return
        for station in ClusterStation.objects.filter(isdefault=True).all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()


pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
83
class Sequencer(models.Model):
    """Sequencers we've owned
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        """Return "<name> (<instrument_name>)"."""
        name = [unicode(self.name)]
        if self.instrument_name is not None:
            name.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(name)

    @models.permalink
    def get_absolute_url(self):
        return ('htsworkflow.frontend.experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the preferred sequencer, or None if there are none.

        A sequencer flagged ``isdefault`` wins.  Otherwise fall back to
        the sequencer with the highest id, preferring active ones.
        """
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            return d[0]
        # '-active' sorts active (True) sequencers first, matching
        # Meta.ordering; ascending 'active' returned inactive ones first.
        d = cls.objects.order_by('-active', '-id').all()
        if len(d) > 0:
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unset
        the flag on every other sequencer currently marked default.
        """
        if instance.isdefault:
            for s in Sequencer.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()

pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
130
class FlowCellModel(models.Model):
    """A kind of flowcell, with a fixed and a per-cycle time in seconds."""
    name = models.TextField()
    fixed_time = models.IntegerField(default=0, help_text='(seconds)')
    per_cycle_time = models.IntegerField(default=0, help_text='(seconds)')
    isdefault = models.BooleanField(default=False)

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the flowcell model flagged as default, or None."""
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            # QuerySets do not support negative indexing, so take the
            # first match like the other default() helpers in this file.
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unset
        the flag on every other flowcell model currently marked default.
        """
        if instance.isdefault:
            # This used to reference an undefined FlowCellType class.
            for s in FlowCellModel.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()

pre_save.connect(FlowCellModel.update_isdefault, sender=FlowCellModel)
158
class FlowCell(models.Model):
    """A flowcell, its run settings and helpers to import its data runs."""
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)
    flowcell_model = models.ForeignKey(FlowCellModel, default=FlowCellModel.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Return an HTML table with one row per lane.

        Each row shows the lane number, a link to the library's admin
        page and the cluster estimate in thousands.
        """
        # NOTE(review): the library text is interpolated unescaped into
        # HTML that is marked allow_tags -- confirm it is trusted.
        element = '<tr><td>%d</td>'\
                  '<td><a href="%s">%s</a></td><td>%s</td></tr>'
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # explicit floor division: whole thousands of clusters
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            html.append(element % (lane.lane_number,
                                   library.get_admin_url(),
                                   library,
                                   cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every run directory found below the raw data directory.

        A directory counts as a run directory when it contains a file
        matching ``run*.xml``; each match is passed to import_data_run.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Given a result directory import files

        :param relative_pathname: run directory, relative to
            settings.RESULT_HOME_DIR
        :param run_xml_name: name of the run xml file in that directory
        :param force: re-import even if the run was updated recently
        :raises RuntimeError: if more than one DataRun already uses
            ``relative_pathname`` as its result_dir
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir=relative_pathname)
        if len(runs) == 0:
            run = DataRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        # existing runs are only rescanned after RESCAN_DELAY days
        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # the model field is DataRun.run_status; assigning to
            # run.status only set an attribute that was never saved
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # midnight of the analysis date, made aware in TIME_ZONE
            naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            # save before attaching data files to the run
            run.save()

            run.update_result_files()
292
293
294 # FIXME: should we automatically update dataruns?
295 #        Or should we expect someone to call update_data_runs?
296 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
297 #    """Update our dataruns
298 #    """
299 #    if not os.path.exists(settings.RESULT_HOME_DIR):
300 #       return
301 #
302 #    instance.update_data_runs()
303 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
304
305
# (code, label) pairs; used as the choices of Lane.status
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]
# code -> label lookup, with None standing for a lane without a status
LANE_STATUS_MAP = dict(
    (int(code), label) for code, label in LANE_STATUS_CODES)
LANE_STATUS_MAP.setdefault(None, "Unknown")
311
312
def is_valid_lane(value):
    """Return True when value lies in the lane range 1..8 (inclusive)."""
    return bool(value >= 1 and value <= 8)
318
319
class Lane(models.Model):
    """Ties a library to a numbered lane of a flowcell."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    # defaults to the module level default_pM
    pM = models.DecimalField(default=default_pM,
                             max_digits=5,
                             decimal_places=2,
                             null=False,
                             blank=False)
    cluster_estimate = models.IntegerField(null=True, blank=True)
    status = models.IntegerField(null=True,
                                 blank=True,
                                 choices=LANE_STATUS_CODES)
    comment = models.TextField(blank=True, null=True)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.flowcell_lane_detail'
        return (view_name, [str(self.id)])

    def __unicode__(self):
        """Return "<flowcell id>:<lane number>"."""
        lane_label = unicode(self.lane_number)
        return self.flowcell.flowcell_id + ':' + lane_label
342
343
class DataRun(models.Model):
    """One analysis run of a flowcell and the result files it produced."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Register the files below result_dir as DataFile records.

        Files already attached to this run are skipped, as are files
        whose name matches no FileType.  When the file name carries a
        lane number the DataFile is linked to that lane's library.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                datafiles = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname)
                if len(datafiles) > 0:
                    # already registered
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is not None:
                    metadata['filename'] = filename
                    newfile = DataFile()
                    newfile.data_run = self
                    newfile.file_type = metadata['file_type']
                    newfile.relative_pathname = relative_pathname

                    lane_number = metadata.get('lane', None)
                    if lane_number is not None:
                        lane = self.flowcell.lane_set.get(
                            lane_number=lane_number)
                        newfile.library = lane.library

                    self.datafile_set.add(newfile)

        # NOTE(review): only the in-memory value changes here; nothing in
        # this method saves the run afterwards -- confirm that is intended.
        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dictionary mapping lane number to a dictionary of
        {normalized file type name: DataFile}.  Files whose name carries
        no lane number are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files as {file type name: (filename, metadata)}.
        """
        # NOTE(review): get_result_files() is not defined in the part of
        # this class shown here, and ``lane`` is not used to filter the
        # result -- confirm both against the rest of the project.
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        for rel_filename, metadata in self.get_result_files():
            if metadata.file_type.name in ivc_name:
                plots[metadata.file_type.name] = (rel_filename, metadata)
        # the collected plots used to be discarded (implicit None)
        return plots
414
415
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of ``pathname`` is matched.

        Returns None if not, or dictionary of match variables if we do.
        The 'lane' and 'end' groups, when present, are converted to int.
        """
        # regex is nullable, so guard against None as well as ''
        if not self.regex:
            return None

        filename = os.path.basename(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ['lane', 'end']:
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name
455         #return u"<FileType: %s>" % (self.name,)
456         return self.name
457
458
def str_uuid():
    """Return a fresh uuid1 as a string; default for DataFile.random_key"""
    new_key = uuid.uuid1()
    return str(new_key)
462
463
class DataFile(models.Model):
    """Store map from random ID to filename"""
    random_key = models.CharField(default=str_uuid,
                                  max_length=64,
                                  db_index=True)
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, null=True, blank=True, db_index=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(db_index=True, max_length=255)

    def _get_attributes(self):
        """Values parsed from our pathname by our file type's pattern"""
        file_type = self.file_type
        return file_type.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        """Our pathname below the result home directory"""
        relative = self.relative_pathname
        return get_absolute_pathname(relative)
    pathname = property(_get_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.read_result_file'
        url_kwargs = {'key': self.random_key}
        return (view_name, (), url_kwargs)
486
487
def find_file_type_metadata_from_filename(pathname):
    """Return the parsed metadata of the first FileType matching pathname

    Only the final path component is matched.  The matching FileType is
    stored in the result under 'file_type'.  Returns None if no FileType
    matches.
    """
    filename = os.path.basename(pathname)
    for file_type in FileType.objects.all():
        metadata = file_type.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = file_type
        return metadata

    return None
498
499
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR prefix is removed; a path
    that does not start with it is returned unchanged.
    """
    # join with '' to make sure the prefix ends with a path separator
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    if abspath.startswith(result_home_dir):
        # str.replace() would also drop later occurrences of the prefix
        return abspath[len(result_home_dir):]
    return abspath
506
507
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto the result home directory"""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)