Add flowcell model type to the flowcell model
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
1 import datetime
2 import glob
3 import logging
4 import os
5 import re
6 import types
7 import uuid
8
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import post_init, pre_save
15
16 from htsworkflow.frontend.samples.models import Library
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
19
20 import pytz
21
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM.  settings.DEFAULT_PM overrides it when that
# setting exists and can be converted to an int.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (AttributeError, TypeError, ValueError):
    # AttributeError: the setting is absent.  TypeError / ValueError: the
    # setting is present but int() cannot convert it.  In every case the
    # built-in default above stays in effect instead of failing at import.
    LOGGER.error("invalid value for frontend.default_pm")
28
# Number of days that must pass since a data run's last update before
# FlowCell.import_data_run will re-import its runfolder.
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (AttributeError, ValueError):
    # setting absent or not an integer: fall back to one day
    RESCAN_DELAY = 1
    LOGGER.error(
        "Missing or invalid settings.RESCAN_DELAY, "
        "defaulting to %s" % (RESCAN_DELAY,))
36
# (code, label) pairs used as the choices of DataRun.run_status.
RUN_STATUS_CHOICES = (
    # 0 was formerly labelled 'Solexa Data Pipeline Not Yet Started'
    (0, 'Sequencer running'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)

# Inverse lookup: status label -> numeric code.
RUN_STATUS_REVERSE_MAP = dict(
    (label, code) for code, label in RUN_STATUS_CHOICES)
49
50
class ClusterStation(models.Model):
    """Named cluster station, referenced by FlowCell.cluster_station.

    The ``isdefault`` flag marks the station returned by ``default()``;
    the pre_save handler below clears the flag on every other row whenever
    a station is saved with it set.
    """
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the preferred station, or None when no station exists.

        A station flagged ``isdefault`` is preferred; failing that, the
        station with the highest id is returned.
        """
        # Querysets are lazy, so the fallback query only runs when the
        # first one turns out to be empty.
        candidates = (
            cls.objects.filter(isdefault=True).all(),
            cls.objects.order_by('-id').all(),
        )
        for stations in candidates:
            if len(stations) > 0:
                return stations[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """pre_save handler: unflag all other defaults when instance is one.
        """
        if not instance.isdefault:
            return
        for station in ClusterStation.objects.filter(isdefault=True).all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()


pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
83
class Sequencer(models.Model):
    """Sequencers we've owned

    ``isdefault`` marks the sequencer returned by ``default()``; the
    pre_save handler below clears the flag on every other row whenever a
    sequencer is saved with it set.
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        """Return "<name> (<instrument_name>)"."""
        name = [unicode(self.name)]
        if self.instrument_name is not None:
            name.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(name)

    @models.permalink
    def get_absolute_url(self):
        return ('htsworkflow.frontend.experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the preferred sequencer, or None when none exist.

        A sequencer flagged ``isdefault`` is preferred.  Otherwise the
        fallback picks active sequencers before inactive ones and, within
        each group, the highest id first.
        """
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            return d[0]
        # '-active' sorts True ahead of False, matching Meta.ordering; an
        # ascending sort here would hand out inactive sequencers first.
        d = cls.objects.order_by('-active', '-id').all()
        if len(d) > 0:
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """pre_save handler: clear the default flag on all other sequencers
        when the instance being saved is flagged as the default.
        """
        if instance.isdefault:
            for s in Sequencer.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()


pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
130
class FlowCellModel(models.Model):
    """Flowcell model type, referenced by FlowCell.flowcell_model.

    ``isdefault`` marks the model returned by ``default()``; the pre_save
    handler below clears the flag on every other row whenever a model is
    saved with it set.
    """
    name = models.TextField()
    # both durations are expressed in seconds (see help_text)
    fixed_time = models.IntegerField(default=0, help_text='(seconds)')
    per_cycle_time = models.IntegerField(default=0, help_text='(seconds)')
    isdefault = models.BooleanField(default=False)

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the model flagged as default, or None if there is none.

        Should several rows carry the flag, the one with the highest id
        is returned.
        """
        # Django querysets do not support negative indexing, so "the last
        # one" is expressed as a descending sort followed by [0].
        d = cls.objects.filter(isdefault=True).order_by('-id').all()
        if len(d) > 0:
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """pre_save handler: clear the default flag on all other flowcell
        models when the instance being saved is flagged as the default.
        """
        if instance.isdefault:
            for s in FlowCellModel.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()


pre_save.connect(FlowCellModel.update_isdefault, sender=FlowCellModel)
155
class FlowCell(models.Model):
    """A flowcell and the run parameters recorded for it.

    Lanes point back at a flowcell through ``Lane.flowcell`` and imported
    pipeline results through ``DataRun.flowcell``.
    """
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)
    flowcell_model = models.ForeignKey(FlowCellModel, default=FlowCellModel.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Render this flowcell's lanes as an HTML table.

        One row per lane, ordered by lane number, holding the lane number,
        a link to the library's admin page and the cluster estimate in
        thousands ('None' when no estimate is recorded).
        """
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # explicit floor division: the whole-thousands result that
                # "/" gives for ints on Python 2
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            # NOTE(review): the library text is interpolated without HTML
            # escaping while allow_tags is set below.
            element = '<tr><td>%d</td>'\
                      '<td><a href="%s">%s</a></td><td>%s</td></tr>'
            html.append(element % (lane.lane_number,
                                   library.get_admin_url(),
                                   library,
                                   cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        # only the id part returned by parse_flowcell_id goes into the URL
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every run directory found below the raw data directory.

        The tree is walked with os.walk; each file matching ``run*.xml``
        marks its directory as a run directory, which is handed to
        ``import_data_run``.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Given a result directory import files

        relative_pathname -- run directory, relative to the results home
        run_xml_name -- name of the run xml file inside that directory
        force -- re-import even if the run was updated recently

        The DataRun whose result_dir equals relative_pathname is created
        if missing.  It is (re)filled from the run xml file when it is
        new, when force is set, or when its last update is more than
        RESCAN_DELAY days old.

        Raises RuntimeError when several DataRuns share the result_dir.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir = relative_pathname)
        if len(runs) == 0:
            run = DataRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # the model field is run_status; a plain "status" attribute
            # would never reach the database
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # rebuild the analysis date as a naive datetime at midnight,
            # then attach the configured time zone
            naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
287
288             run.update_result_files()
289
290
291 # FIXME: should we automatically update dataruns?
292 #        Or should we expect someone to call update_data_runs?
293 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
294 #    """Update our dataruns
295 #    """
296 #    if not os.path.exists(settings.RESULT_HOME_DIR):
297 #       return
298 #
299 #    instance.update_data_runs()
300 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
301
302
# (code, label) pairs used as the choices of Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]

# Lookup from status code to label; a missing status (None) reads "Unknown".
LANE_STATUS_MAP = dict(
    (int(code), label) for code, label in LANE_STATUS_CODES)
LANE_STATUS_MAP.update({None: "Unknown"})
308
309
def is_valid_lane(value):
    """Return True when value lies in the inclusive range 1 to 8."""
    return bool(1 <= value <= 8)
315
316
class Lane(models.Model):
    """One lane of a flowcell together with the library assigned to it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    # concentration; defaults to the module-level default_pM
    pM = models.DecimalField(
        max_digits=5,
        decimal_places=2,
        blank=False,
        null=False,
        default=default_pM)
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(
        choices=LANE_STATUS_CODES,
        null=True,
        blank=True)
    comment = models.TextField(null=True, blank=True)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.flowcell_lane_detail'
        return (view_name, [str(self.id)])

    def __unicode__(self):
        """Return "<flowcell id>:<lane number>"."""
        parts = [self.flowcell.flowcell_id, unicode(self.lane_number)]
        return ':'.join(parts)
339
340
class DataRun(models.Model):
    """Record of one pipeline run imported for a flowcell.

    The files found in the run's result directory are attached as
    DataFile rows (``datafile_set``).
    """
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    # stored relative to settings.RESULT_HOME_DIR
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Attach a DataFile for each recognised file in the result dir.

        The result directory is walked recursively.  Files already
        attached under the same relative pathname are skipped, as are
        files that match no FileType.  When the file name carries a lane
        number, the DataFile is linked to that lane's library.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                already_known = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname).exists()
                if already_known:
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is None:
                    continue

                metadata['filename'] = filename
                newfile = DataFile()
                newfile.data_run = self
                newfile.file_type = metadata['file_type']
                newfile.relative_pathname = relative_pathname

                lane_number = metadata.get('lane', None)
                if lane_number is not None:
                    lane = self.flowcell.lane_set.get(
                        lane_number=lane_number)
                    newfile.library = lane.library

                self.datafile_set.add(newfile)

        # only the in-memory value changes here; this method does not
        # call save()
        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group the attached data files by lane.

        Returns a dict mapping lane number to a dict of
        {normalized file type name: DataFile}.  Files whose name yields
        no 'lane' attribute are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files attached to this run for one lane.

        lane -- lane number, compared with == against the integer 'lane'
                value parsed from each file name

        Returns a dict mapping file type name to a
        (relative pathname, DataFile) tuple.

        NOTE(review): the previous body called self.get_result_files(),
        which this class does not define, ignored ``lane`` and returned
        nothing.  Reading from datafile_set and filtering by lane is the
        presumed intent - confirm against the callers.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name not in ivc_name:
                continue
            metadata = datafile.attributes
            if metadata is None or metadata.get('lane', None) != lane:
                continue
            plots[type_name] = (datafile.relative_pathname, datafile)
        return plots
411
412
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of pathname is matched.

        Returns None if not, or dictionary of match variables if we do.
        A file type without a regex (None or empty) never matches.
        """
        filename = os.path.basename(pathname)
        # regex is nullable, so test truthiness rather than len()
        if not self.regex:
            return None

        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name
454
455
def str_uuid():
    """Return a freshly generated version-1 UUID as a string.

    Used as the default value of DataFile.random_key.
    """
    fresh_id = uuid.uuid1()
    return str(fresh_id)
459
460
class DataFile(models.Model):
    """Store map from random ID to filename"""
    random_key = models.CharField(
        max_length=64,
        db_index=True,
        default=str_uuid)
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    # stored relative to settings.RESULT_HOME_DIR
    relative_pathname = models.CharField(max_length=255, db_index=True)

    @property
    def attributes(self):
        """Values parsed out of the file name by this file's FileType."""
        return self.file_type.parse_filename(self.relative_pathname)

    @property
    def pathname(self):
        """Absolute location of the file below the results home."""
        return get_absolute_pathname(self.relative_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.read_result_file'
        return (view_name, (), {'key': self.random_key})
483
484
def find_file_type_metadata_from_filename(pathname):
    """Return the parsed attributes of the first FileType matching pathname.

    Only the final path component is matched.  The returned dictionary is
    the result of FileType.parse_filename with the matching FileType added
    under the 'file_type' key.  Returns None when no FileType matches.
    """
    basename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        attributes = candidate.parse_filename(basename)
        if attributes is None:
            continue
        attributes['file_type'] = candidate
        return attributes

    return None
495
496
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading occurrence of settings.RESULT_HOME_DIR (with its
    trailing separator) is removed.  A path that does not start with it
    is returned unchanged.
    """
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # A plain str.replace would also delete the same text wherever it
    # reappears deeper in the path, so strip the prefix explicitly.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
503
504
def get_absolute_pathname(relative_pathname):
    """Join relative_pathname onto settings.RESULT_HOME_DIR."""
    result_home = settings.RESULT_HOME_DIR
    return os.path.join(result_home, relative_pathname)