7a41d3d5902aebe32a56595db04dd859aa6e016d
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
1 import datetime
2 import glob
3 import logging
4 import os
5 import re
6 import types
7 import uuid
8
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist
11 from django.core import urlresolvers
12 from django.db import models
13 from django.db.models.signals import post_init, pre_save
14
15 from htsworkflow.frontend.samples.models import Library
16 from htsworkflow.util.conversion import parse_flowcell_id
17 from htsworkflow.pipelines import runfolder
18
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM, used when settings.DEFAULT_PM is missing or
# cannot be converted to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (ValueError, AttributeError):
    # AttributeError: the setting is not defined at all.
    # ValueError: the setting is defined but is not a number.
    # Either way keep the built-in default instead of failing at import,
    # which is also how RESCAN_DELAY is handled below.
    LOGGER.error("invalid value for frontend.default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, AttributeError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s", RESCAN_DELAY)
33
# (code, label) pairs describing how far processing of a data run has
# progressed; used as the ``choices`` of an integer model field.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# Inverse lookup: status label -> numeric status code.
RUN_STATUS_REVERSE_MAP = {
    label: code for code, label in RUN_STATUS_CHOICES
}
46
47
class ClusterStation(models.Model):
    """A cluster station that flowcells can be associated with.

    Saving a station whose ``isdefault`` flag is set clears that flag on
    every other station (see update_isdefault).
    """
    name = models.CharField(unique=True, max_length=50)
    isdefault = models.BooleanField(null=False, default=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the station to use when none is specified.

        The first station flagged as default wins; otherwise the station
        with the highest id is returned.  Returns None when the table is
        empty.
        """
        # Querysets are lazy, so building both up front runs no query
        # until len() is called on each one in turn.
        candidates = (
            cls.objects.filter(isdefault=True).all(),
            cls.objects.order_by('-id').all(),
        )
        for stations in candidates:
            if len(stations) > 0:
                return stations[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler: unset isdefault on all other stations.

        Does nothing unless the instance being saved is itself flagged
        as the default.
        """
        if not instance.isdefault:
            return

        for station in ClusterStation.objects.filter(isdefault=True).all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()

pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
80
class Sequencer(models.Model):
    """Sequencers we've owned

    Saving a sequencer whose ``isdefault`` flag is set clears that flag on
    every other sequencer (see update_isdefault).
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        """Return "name (instrument_name)", or just the name."""
        name = [unicode(self.name)]
        # instrument_name is a non-null CharField, so "not set" usually
        # shows up as an empty string rather than None; skip both so we
        # never render a dangling "()".
        if self.instrument_name:
            name.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(name)

    @models.permalink
    def get_absolute_url(self):
        return ('htsworkflow.frontend.experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the sequencer to use when none is specified.

        The first sequencer flagged as default wins.  Otherwise active
        sequencers are preferred over inactive ones, and among those the
        one with the highest id is returned.  Returns None when the
        table is empty.
        """
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            return d[0]
        # '-active' sorts True before False, matching Meta.ordering; an
        # ascending sort would hand out retired instruments first.
        d = cls.objects.order_by('-active', '-id').all()
        if len(d) > 0:
            return d[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler: unset isdefault on all other sequencers.

        Does nothing unless the instance being saved is itself flagged
        as the default.
        """
        if instance.isdefault:
            for s in Sequencer.objects.filter(isdefault=True).all():
                if s.id != instance.id:
                    s.isdefault = False
                    s.save()

pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
127
128
class FlowCell(models.Model):
    """A flowcell, its lanes and the data runs imported for it."""
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Return an HTML table summarizing the lanes of this flowcell.

        There is one row per lane, ordered by lane number, holding the
        lane number, a link to the library's admin page and the cluster
        estimate in thousands ('None' when no estimate is recorded).
        """
        # Imported locally so this method does not depend on a change to
        # the module level imports.
        from django.utils.html import escape

        row = ('<tr><td>%d</td>'
               '<td><a href="%s">%s</a></td><td>%s</td></tr>')
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # floor division keeps the value a whole number of
                # thousands regardless of the division mode in effect
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            # The result is rendered unescaped (allow_tags), so the
            # library label has to be escaped here.
            html.append(row % (lane.lane_number,
                               library.get_admin_url(),
                               escape(library),
                               cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every data run found below the raw data directory.

        A directory is treated as a run directory when it contains a
        file matching ``run*.xml``; each such file is handed to
        import_data_run together with the directory's path relative to
        the result home directory.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        # os.walk yields nothing for a directory that does not exist, so
        # a flowcell without results is simply a no-op.
        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Create or refresh the DataRun for one result directory.

        relative_pathname -- run directory, relative to the result home
        run_xml_name -- name of the run xml file inside that directory
        force -- re-import even if the run was updated recently

        An existing run is only re-imported when force is set or when
        its last update is more than RESCAN_DELAY days old.

        Raises RuntimeError if more than one DataRun is recorded for
        relative_pathname.
        """
        now = datetime.datetime.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir=relative_pathname)
        if len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        created = len(runs) == 0
        if created:
            run = DataRun()
        else:
            run = runs[0]

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # The model field is named run_status; assigning to a plain
            # ``status`` attribute would never reach the database.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            run.run_start_time = run_xml_data.image_analysis.date
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            run.alignment_software = run_xml_data.gerald.software
            run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = datetime.datetime.now()
            # the run must be saved before result files can be attached
            run.save()

            run.update_result_files()
258
259
260 # FIXME: should we automatically update dataruns?
261 #        Or should we expect someone to call update_data_runs?
262 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
263 #    """Update our dataruns
264 #    """
265 #    if not os.path.exists(settings.RESULT_HOME_DIR):
266 #       return
267 #
268 #    instance.update_data_runs()
269 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
270
271
# (code, label) pairs describing the quality assessment of a lane.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]
# Label lookup by status code; a lane without a status is "Unknown".
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
LANE_STATUS_MAP[None] = "Unknown"
277
278
def is_valid_lane(value):
    """Return True when value lies in the range 1 to 8 inclusive."""
    return 1 <= value <= 8
284
285
class Lane(models.Model):
    """One lane of a flowcell and the library loaded onto it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    # falls back to the module level default_pM when not given
    pM = models.DecimalField(default=default_pM,
                             max_digits=5,
                             decimal_places=2,
                             null=False,
                             blank=False)
    cluster_estimate = models.IntegerField(null=True, blank=True)
    status = models.IntegerField(null=True,
                                 blank=True,
                                 choices=LANE_STATUS_CODES)
    comment = models.TextField(blank=True, null=True)

    @models.permalink
    def get_absolute_url(self):
        view = 'htsworkflow.frontend.experiments.views.flowcell_lane_detail'
        return (view, [str(self.id)])

    def __unicode__(self):
        """Return "<flowcell id>:<lane number>"."""
        parts = (self.flowcell.flowcell_id, unicode(self.lane_number))
        return ':'.join(parts)
308
309
class DataRun(models.Model):
    """One result directory of a flowcell and the files found in it."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Record a DataFile for every recognized file in result_dir.

        Walks the result directory, skips files that already have a
        DataFile attached to this run, and creates one for each file
        whose name matches a known FileType.  When the file name carries
        a lane number the DataFile is linked to that lane's library.
        Finally last_update_time is refreshed and saved.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                datafiles = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname)
                if len(datafiles) > 0:
                    # already recorded by an earlier scan
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is not None:
                    metadata['filename'] = filename
                    newfile = DataFile()
                    newfile.data_run = self
                    newfile.file_type = metadata['file_type']
                    newfile.relative_pathname = relative_pathname

                    lane_number = metadata.get('lane', None)
                    if lane_number is not None:
                        lane = self.flowcell.lane_set.get(
                            lane_number=lane_number)
                        newfile.library = lane.library

                    self.datafile_set.add(newfile)

        self.last_update_time = datetime.datetime.now()
        # Persist the new timestamp; without the save it is lost as soon
        # as this instance goes out of scope.
        self.save()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dictionary keyed by lane number whose values map the
        normalized file type name to the matching DataFile.  Files whose
        name carries no lane number are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files recorded for one lane.

        lane -- lane number, compared against the integer lane that
                FileType.parse_filename extracts from the file name

        Returns a dictionary mapping the file type name (for example
        'IVC All') to a (relative_pathname, DataFile) tuple.

        NOTE(review): this used to iterate over self.get_result_files(),
        which this class does not define, ignored ``lane`` and returned
        nothing.  It is now based on datafile_set, mirroring lane_files;
        confirm this matches what callers expect.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name not in ivc_name:
                continue
            metadata = datafile.attributes
            if metadata is None or metadata.get('lane', None) != lane:
                continue
            plots[type_name] = (datafile.relative_pathname, datafile)
        return plots
380
381
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.CharField(max_length=50, null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of pathname is matched.

        Returns None if not, or dictionary of match variables if we do.
        The 'lane' and 'end' variables, when present, are converted to
        integers.
        """
        # regex is nullable, so it may be None as well as empty; a file
        # type without a pattern never matches anything.
        if not self.regex:
            return None

        path, filename = os.path.split(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ['lane', 'end']:
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name
423
424
def str_uuid():
    """Return a freshly generated version 1 UUID as a string.

    Used as the default for DataFile.random_key.
    """
    new_id = uuid.uuid1()
    return str(new_id)
428
429
class DataFile(models.Model):
    """A result file of a data run, addressable by a random key."""
    random_key = models.CharField(default=str_uuid,
                                  max_length=64,
                                  db_index=True)
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, null=True, blank=True, db_index=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(db_index=True, max_length=255)

    @property
    def attributes(self):
        """Variables the file type's pattern extracts from the path."""
        return self.file_type.parse_filename(self.relative_pathname)

    @property
    def pathname(self):
        """Location of the file below the result home directory."""
        return get_absolute_pathname(self.relative_pathname)

    @models.permalink
    def get_absolute_url(self):
        url_kwargs = {'key': self.random_key}
        return ('htsworkflow.frontend.experiments.views.read_result_file',
                (), url_kwargs)
452
453
def find_file_type_metadata_from_filename(pathname):
    """Find the first FileType whose pattern matches a file name.

    Only the final component of pathname is considered.  Returns the
    dictionary produced by FileType.parse_filename with the matching
    type added under the 'file_type' key, or None if no type matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
464
465
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR is removed.  A path that
    does not start with the result home directory is returned unchanged.
    """
    # joining with '' guarantees a trailing separator, so a sibling
    # directory that merely shares the prefix is not mistaken for a match
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    if abspath.startswith(result_home_dir):
        # slice instead of str.replace: replace would also remove later
        # occurrences of the same text inside the path
        return abspath[len(result_home_dir):]
    return abspath
472
473
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto the result home directory."""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)