Flatten project hierarchy, moving django applications out of htsworkflow.frontend...
[htsworkflow.git] / experiments / models.py
1 from __future__ import absolute_import, print_function
2
3 import datetime
4 import glob
5 import logging
6 import os
7 import re
8 import types
9 import uuid
10
11 from django.conf import settings
12 from django.core.exceptions import ObjectDoesNotExist
13 from django.core import urlresolvers
14 from django.utils import timezone
15 from django.db import models
16 from django.db.models.signals import post_init, pre_save
17
18 from samples.models import Library
19 from htsworkflow.util.conversion import parse_flowcell_id
20 from htsworkflow.pipelines import runfolder
21
22 import pytz
23
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM; settings.DEFAULT_PM overrides it when that
# setting exists and can be converted to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (AttributeError, TypeError, ValueError):
    # AttributeError: the setting is absent.
    # TypeError / ValueError: the setting cannot be converted by int().
    LOGGER.error("Missing or invalid settings.DEFAULT_PM, "
                 "defaulting to %s" % (default_pM,))
30
# Number of days that have to pass before a runfolder is imported again.
# settings.RESCAN_DELAY replaces the built-in value when it is present and
# can be parsed as an integer.
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, AttributeError):
    LOGGER.error(
        "Missing or invalid settings.RESCAN_DELAY, "
        "defaulting to %s" % RESCAN_DELAY)
38
# (code, label) pairs describing how far a data run has progressed; used as
# the ``choices`` of DataRun.run_status.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# Lookup from a status label back to its numeric code.
RUN_STATUS_REVERSE_MAP = dict(
    (label, code) for code, label in RUN_STATUS_CHOICES)
51
52
class ClusterStation(models.Model):
    """A cluster station that can be assigned to a flowcell.

    One station may be flagged with ``isdefault``; ``update_isdefault``
    (connected to ``pre_save`` below) clears the flag on the other rows
    whenever a flagged station is saved.
    """
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the station to use when none was chosen explicitly.

        Stations flagged ``isdefault`` are tried first, then the station
        with the highest id.  Returns None when the table is empty.
        """
        lookups = (
            cls.objects.filter(isdefault=True),
            cls.objects.order_by('-id'),
        )
        for lookup in lookups:
            matches = lookup.all()
            if len(matches) != 0:
                return matches[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """pre_save handler: unset ``isdefault`` on every other station.

        Does nothing unless the instance being saved is itself flagged.
        """
        if not instance.isdefault:
            return
        for station in ClusterStation.objects.filter(isdefault=True).all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()

pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
85
class Sequencer(models.Model):
    """Sequencers we've owned

    One sequencer may be flagged with ``isdefault``; ``update_isdefault``
    (connected to ``pre_save`` below) clears the flag on the other rows
    whenever a flagged sequencer is saved.
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        name = [unicode(self.name)]
        # Skip the parenthesised suffix when there is no instrument name
        # (None or empty string) rather than rendering "name ()".
        if self.instrument_name:
            name.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(name)

    @models.permalink
    def get_absolute_url(self):
        return ('experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the sequencer to use when none was chosen explicitly.

        A sequencer flagged ``isdefault`` wins.  Otherwise active
        sequencers are preferred over inactive ones, and within each group
        the one with the highest id is taken.  Returns None when the table
        is empty.
        """
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            return d[0]
        # '-active' sorts True before False, matching Meta.ordering; the
        # ascending form picked an inactive sequencer whenever one existed.
        d = cls.objects.order_by('-active', '-id').all()
        if len(d) > 0:
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """pre_save handler: clear ``isdefault`` on every other sequencer.

        Does nothing unless the instance being saved is itself flagged.
        """
        if instance.isdefault:
            for s in Sequencer.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()

pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
132
133
class FlowCell(models.Model):
    """A flowcell, its run parameters and the data runs imported for it."""
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Return an HTML table summarising the lanes of this flowcell.

        Each row holds the lane number, a link to the library's admin page
        labelled with the library, and the cluster estimate in thousands
        ('None' when no estimate is recorded).
        """
        # Local import: the label is escaped because ``allow_tags`` below
        # asks for this markup to be rendered as-is.
        from django.utils.html import escape

        row = ('<tr><td>%d</td>'
               '<td><a href="%s">%s</a></td><td>%s</td></tr>')
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # Floor division keeps the whole-thousands display
                # regardless of the interpreter's division semantics.
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            html.append(row % (lane.lane_number,
                               library.get_admin_url(),
                               escape(library),
                               cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        # Only the id part returned by parse_flowcell_id goes into the URL.
        flowcell_id, _status = parse_flowcell_id(self.flowcell_id)
        return ('experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, _status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every run found below this flowcell's data directory.

        The directory tree is walked and ``import_data_run`` is called for
        each file whose name matches ``run*.xml``.
        """
        # Imported directly instead of being reached through ``glob``.
        import fnmatch

        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
        if result_root is None:
            return

        run_xml_re = re.compile(fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Create or refresh the DataRun stored for a result directory.

        :param relative_pathname: result directory, relative to
            settings.RESULT_HOME_DIR
        :param run_xml_name: name of the run xml file in that directory
        :param force: re-import even if the run was updated recently
        :raises RuntimeError: if more than one DataRun already exists for
            ``relative_pathname``

        An existing run is only re-read when ``force`` is set or its last
        update is more than RESCAN_DELAY days old.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir=relative_pathname)
        if len(runs) == 0:
            run = DataRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # The model field is ``run_status``; assigning to ``status``
            # only set a plain attribute that save() never wrote out.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # Rebuild a midnight datetime from the analysis date, then
            # attach the configured time zone to it.
            naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
266
267
268 # FIXME: should we automatically update dataruns?
269 #        Or should we expect someone to call update_data_runs?
270 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
271 #    """Update our dataruns
272 #    """
273 #    if not os.path.exists(settings.RESULT_HOME_DIR):
274 #       return
275 #
276 #    instance.update_data_runs()
277 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
278
279
# (code, label) pairs used as the ``choices`` of Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]
# Code -> label lookup; None (no status recorded) maps to "Unknown".
LANE_STATUS_MAP = dict(
    (int(code), label) for code, label in LANE_STATUS_CODES)
LANE_STATUS_MAP.update({None: "Unknown"})
285
286
def is_valid_lane(value):
    """Return True when ``value`` is within the inclusive range 1 to 8."""
    within_range = 1 <= value <= 8
    return bool(within_range)
292
293
class Lane(models.Model):
    """A single lane of a flowcell together with the library loaded on it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    pM = models.DecimalField(
        max_digits=5,
        decimal_places=2,
        blank=False,
        null=False,
        default=default_pM,
    )
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(
        choices=LANE_STATUS_CODES,
        null=True,
        blank=True,
    )
    comment = models.TextField(null=True, blank=True)

    @models.permalink
    def get_absolute_url(self):
        view_args = [str(self.id)]
        return ('experiments.views.flowcell_lane_detail', view_args)

    def __unicode__(self):
        # Rendered as "<flowcell id>:<lane number>".
        flowcell_label = self.flowcell.flowcell_id
        lane_label = unicode(self.lane_number)
        return ':'.join((flowcell_label, lane_label))
316
317
class DataRun(models.Model):
    """One result directory (analysis run) recorded for a flowcell."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Register files below the result directory as DataFile records.

        The result directory is walked; a file is skipped when this run
        already has a DataFile with the same relative path or when no
        FileType pattern matches its name.  When the matched pattern
        captured a lane number, the new DataFile is linked to the library
        of that lane of the flowcell.

        ``last_update_time`` is refreshed on the instance only; this
        method does not call ``save()``.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                # The related manager already restricts to this run, and
                # exists() avoids loading rows just to count them.
                already_known = self.datafile_set.filter(
                    relative_pathname=relative_pathname).exists()
                if already_known:
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is None:
                    continue

                newfile = DataFile()
                newfile.data_run = self
                newfile.file_type = metadata['file_type']
                newfile.relative_pathname = relative_pathname

                lane_number = metadata.get('lane', None)
                if lane_number is not None:
                    lane = self.flowcell.lane_set.get(
                        lane_number=lane_number)
                    newfile.library = lane.library

                self.datafile_set.add(newfile)

        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dict keyed by lane number; each value is a dict mapping
        the file type's normalized name to the DataFile.  Files whose
        name yields no lane are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files recorded for this run.

        The result maps a file type name (one of the IVC names listed
        below) to a ``(relative_pathname, datafile)`` tuple.

        NOTE(review): ``lane`` is accepted but, as before, not used to
        narrow the result; confirm whether per-lane filtering is wanted.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        # Earlier code iterated self.get_result_files(), which this model
        # does not define; the run's DataFile rows carry the same
        # information (relative path and file type).
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name in ivc_name:
                plots[type_name] = (datafile.relative_pathname, datafile)
        # The dict used to be built and then dropped without a return.
        return plots
388
389
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of ``pathname`` is matched.

        Returns None if not, or dictionary of match variables if we do.
        None is also returned when no pattern is set (``regex`` is None
        or empty).
        """
        filename = os.path.basename(pathname)
        # ``regex`` is nullable, so test truthiness rather than len(),
        # which raised TypeError for None.
        if not self.regex:
            return None

        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name
431
432
def str_uuid():
    """Return a newly generated version 1 UUID as a string.

    Serves as the default for DataFile.random_key.
    """
    generated = uuid.uuid1()
    return str(generated)
436
437
class DataFile(models.Model):
    """A result file of a data run, addressable through a random key."""
    random_key = models.CharField(
        max_length=64,
        db_index=True,
        default=str_uuid,
    )
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(max_length=255, db_index=True)

    def _get_attributes(self):
        # Delegates to the file type, which parses the file name.
        file_type = self.file_type
        return file_type.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        # Path with the results home directory prepended.
        relative = self.relative_pathname
        return get_absolute_pathname(relative)
    pathname = property(_get_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_kwargs = {'key': self.random_key}
        return ('experiments.views.read_result_file', (), view_kwargs)
460
461
def find_file_type_metadata_from_filename(pathname):
    """Return the parse result of the first FileType matching ``pathname``.

    Only the final path component is tested.  The returned dictionary is
    what ``FileType.parse_filename`` produced, extended with a
    ``'file_type'`` entry holding the matching FileType.  Returns None
    when no FileType matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
472
473
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading ``settings.RESULT_HOME_DIR`` (with trailing separator)
    is removed; a path that does not start with it is returned unchanged.
    """
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # str.replace() would also delete the directory name wherever else it
    # occurs inside the path, so cut the prefix explicitly.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
480
481
def get_absolute_pathname(relative_pathname):
    """Return ``relative_pathname`` joined onto settings.RESULT_HOME_DIR."""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)