Change experiments.FileType.regex to a text field so it can be arbitrarily long.
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
1 import datetime
2 import glob
3 import logging
4 import os
5 import re
6 import sre_constants
7 import types
8 import uuid
9
10 from django.conf import settings
11 from django.core.exceptions import ObjectDoesNotExist
12 from django.core import urlresolvers
13 from django.utils import timezone
14 from django.db import models
15 from django.db.models.signals import post_init, pre_save
16
17 from htsworkflow.frontend.samples.models import Library
18 from htsworkflow.util.conversion import parse_flowcell_id
19 from htsworkflow.pipelines import runfolder
20
21 import pytz
22
LOGGER = logging.getLogger(__name__)

# Default pM value used by Lane.pM; overridden by settings.DEFAULT_PM when
# that setting exists and converts cleanly to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (ValueError, TypeError, AttributeError):
    # AttributeError: setting is missing; ValueError/TypeError: the value
    # cannot be converted by int(). Either way keep the built-in default
    # instead of failing the module import.
    LOGGER.error("invalid value for frontend.default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, TypeError, AttributeError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s" % (RESCAN_DELAY,))
37
# Processing states of a data run as (database code, display label) pairs.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)

# Reverse lookup: display label -> database code.
RUN_STATUS_REVERSE_MAP = dict(
    (label, code) for code, label in RUN_STATUS_CHOICES)
50
51
class ClusterStation(models.Model):
    """List of cluster stations

    A station can be flagged ``isdefault``; the ``update_isdefault``
    pre_save handler clears that flag on every other station whenever a
    station flagged as default is saved.
    """
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the station to use when none was chosen explicitly.

        The first station flagged ``isdefault`` wins; otherwise the station
        with the highest id is used. Returns None when there are no stations.
        """
        # Querysets are lazy, so building the fallback up front costs nothing
        # unless the first candidate turns out to be empty.
        candidates = (cls.objects.filter(isdefault=True).all(),
                      cls.objects.order_by('-id').all())
        for stations in candidates:
            if len(stations) > 0:
                return stations[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unflag
        and save every other station that is currently flagged.
        """
        if not instance.isdefault:
            return
        for station in ClusterStation.objects.filter(isdefault=True).all():
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()

pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
84
class Sequencer(models.Model):
    """Sequencers we've owned

    A sequencer can be flagged ``isdefault``; the ``update_isdefault``
    pre_save handler clears that flag on every other sequencer whenever a
    sequencer flagged as default is saved.
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        """Return "name (instrument_name)", or just the name when no
        instrument name is set."""
        parts = [unicode(self.name)]
        # instrument_name is a non-null CharField, so "unset" shows up as an
        # empty string rather than None; skip it to avoid rendering "name ()".
        if self.instrument_name:
            parts.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(parts)

    @models.permalink
    def get_absolute_url(self):
        return ('htsworkflow.frontend.experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the sequencer to use when none was chosen explicitly.

        The first sequencer flagged ``isdefault`` wins; otherwise active
        sequencers are preferred over inactive ones, newest (highest id)
        first. Returns None when there are no sequencers.
        """
        flagged = cls.objects.filter(isdefault=True).all()
        if len(flagged) > 0:
            return flagged[0]
        # '-active' sorts True before False, matching Meta.ordering; plain
        # 'active' would hand back a retired instrument first.
        fallback = cls.objects.order_by('-active', '-id').all()
        if len(fallback) > 0:
            return fallback[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unflag
        and save every other sequencer that is currently flagged.
        """
        if instance.isdefault:
            for other in Sequencer.objects.filter(isdefault=True).all():
                if other.id != instance.id:
                    other.isdefault = False
                    other.save()

pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
131
class FlowCellModel(models.Model):
    """A flowcell model, with a fixed time and a per-cycle time in seconds.

    A model can be flagged ``isdefault``; the ``update_isdefault`` pre_save
    handler clears that flag on every other model whenever a model flagged
    as default is saved.
    """
    name = models.TextField()
    fixed_time = models.IntegerField(default=0, help_text='(seconds)')
    per_cycle_time = models.IntegerField(default=0, help_text='(seconds)')
    isdefault = models.BooleanField(default=False)

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the model flagged ``isdefault``, or None if there is none."""
        flagged = cls.objects.filter(isdefault=True).all()
        if len(flagged) > 0:
            # QuerySets reject negative indexes, so take the first match the
            # same way ClusterStation.default and Sequencer.default do.
            return flagged[0]
        return None

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Clear default if needed

        pre_save handler: when ``instance`` is flagged as default, unflag
        and save every other flowcell model that is currently flagged.
        """
        if instance.isdefault:
            for other in FlowCellModel.objects.filter(isdefault=True).all():
                if other.id != instance.id:
                    other.isdefault = False
                    other.save()

pre_save.connect(FlowCellModel.update_isdefault, sender=FlowCellModel)
159
class FlowCell(models.Model):
    """A flowcell, its run settings, and the import of its data runs.

    Besides the stored fields this class knows how to locate the flowcell's
    result directory under settings.RESULT_HOME_DIR and how to create or
    refresh DataRun records from the run*.xml files found there.
    """
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)
    flowcell_model = models.ForeignKey(FlowCellModel, default=FlowCellModel.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Render this flowcell's lanes as an HTML table.

        Each row holds the lane number, a link to the library's admin page
        and the cluster estimate in whole thousands ('None' when the lane
        has no estimate). Rows are ordered by lane number.
        """
        # NOTE(review): values are interpolated without HTML escaping and the
        # result is marked allow_tags below.
        row = ('<tr><td>%d</td>'
               '<td><a href="%s">%s</a></td><td>%s</td></tr>')
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            if lane.cluster_estimate is None:
                cluster_estimate = 'None'
            else:
                # floor division keeps whole thousands on Python 2 and 3
                cluster_estimate = "%s k" % (
                    (int(lane.cluster_estimate) // 1000), )
            library = lane.library
            html.append(row % (lane.lane_number,
                               library.get_admin_url(),
                               library,
                               cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every data run found below this flowcell's result directory.

        Walks the raw data directory and calls import_data_run for each file
        whose name matches run*.xml.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Given a result directory import files

        Creates the DataRun for ``relative_pathname`` if none exists, or
        refreshes the existing one when ``force`` is set or its last update
        is more than RESCAN_DELAY days old. Otherwise nothing is done.

        :param relative_pathname: run directory relative to
            settings.RESULT_HOME_DIR.
        :param run_xml_name: name of the run xml file inside that directory.
        :param force: re-import even if the run was updated recently.
        :raises RuntimeError: if more than one DataRun exists for the
            directory.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir=relative_pathname)
        if len(runs) == 0:
            run = DataRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # The DataRun field is named run_status; assigning to a plain
            # "status" attribute would never reach the database.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # Rebuild the analysis date as a midnight datetime, then attach
            # the configured time zone so the stored value is timezone-aware.
            naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
291
292             run.update_result_files()
293
294
295 # FIXME: should we automatically update dataruns?
296 #        Or should we expect someone to call update_data_runs?
297 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
298 #    """Update our dataruns
299 #    """
300 #    if not os.path.exists(settings.RESULT_HOME_DIR):
301 #       return
302 #
303 #    instance.update_data_runs()
304 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
305
306
# Lane status as (database code, display label) pairs.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]

# Display label per status code; a lane without a status (None) is "Unknown".
LANE_STATUS_MAP = {}
for _code, _label in LANE_STATUS_CODES:
    LANE_STATUS_MAP[int(_code)] = _label
LANE_STATUS_MAP[None] = "Unknown"
del _code, _label
312
313
def is_valid_lane(value):
    """Return True when value lies in the lane range 1 through 8 inclusive."""
    return bool(1 <= value <= 8)
319
320
class Lane(models.Model):
    """One lane of a flowcell and the library loaded into it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    # pM defaults to the module-level default_pM
    pM = models.DecimalField(max_digits=5,
                             decimal_places=2,
                             blank=False,
                             null=False,
                             default=default_pM)
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(choices=LANE_STATUS_CODES,
                                 null=True,
                                 blank=True)
    comment = models.TextField(null=True, blank=True)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.flowcell_lane_detail'
        return (view_name, [str(self.id)])

    def __unicode__(self):
        # "<flowcell id>:<lane number>"
        pieces = [self.flowcell.flowcell_id, unicode(self.lane_number)]
        return ':'.join(pieces)
343
344
class DataRun(models.Model):
    """One analysis run of a flowcell and the result files it produced."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Register result files found under this run's result directory.

        Walks the directory tree and, for every file that is not already
        attached to this run and whose name matches a known FileType, adds a
        DataFile. When the file name carries a lane number the DataFile is
        linked to the library loaded in that lane. Finally sets
        last_update_time on this object; it does not call save().
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                datafiles = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname)
                if len(datafiles) > 0:
                    # already registered
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is not None:
                    metadata['filename'] = filename
                    newfile = DataFile()
                    newfile.data_run = self
                    newfile.file_type = metadata['file_type']
                    newfile.relative_pathname = relative_pathname

                    lane_number = metadata.get('lane', None)
                    if lane_number is not None:
                        lane = self.flowcell.lane_set.get(
                            lane_number=lane_number)
                        newfile.library = lane.library

                    self.datafile_set.add(newfile)

        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dictionary mapping lane number to a dictionary of
        {normalized file type name: DataFile}. Files whose name yields no
        lane attribute are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files recorded for one lane.

        :param lane: lane number to select; compared against the integer
            'lane' attribute parsed from each file name
            (NOTE(review): assumes callers pass an int - confirm).
        :returns: dictionary mapping file type name to a
            (relative_pathname, DataFile) tuple.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        # This class defines no get_result_files(), so read the files
        # straight from the related DataFile set.
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name not in ivc_name:
                continue
            attributes = datafile.attributes
            if attributes is None or attributes.get('lane', None) != lane:
                continue
            plots[type_name] = (datafile.relative_pathname, datafile)
        return plots
415
416
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of ``pathname`` is matched.

        Returns None if not, or dictionary of match variables if we do.
        The 'lane' and 'end' groups, when present, are converted to int.
        A FileType whose regex is unset (None or empty) never matches.
        """
        # regex is nullable, so guard against None as well as ''
        if not self.regex:
            return None

        filename = os.path.basename(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name

    def regex_is_valid(self):
        """Return True if regex compiles as a regular expression.

        An unset regex (None or empty) counts as valid, matching how the
        empty pattern has always been treated; parse_filename simply never
        matches with it.
        """
        if not self.regex:
            return True
        try:
            re.compile(self.regex)
        except re.error:
            # re.error is the same exception class as sre_constants.error
            return False
        return True
463             return False
464         return re.compile(self.regex) is not None
465
def str_uuid():
    """Helper function to set default UUID in DataFile

    Returns the string form of a freshly generated version 1 UUID.
    """
    fresh_id = uuid.uuid1()
    return str(fresh_id)
469
470
class DataFile(models.Model):
    """Store map from random ID to filename"""
    # key used in place of the path by get_absolute_url; defaults to str_uuid()
    random_key = models.CharField(max_length=64,
                                  db_index=True,
                                  default=str_uuid)
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(max_length=255, db_index=True)

    def _get_attributes(self):
        """Parse relative_pathname with this file's FileType pattern."""
        file_type = self.file_type
        return file_type.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        """Return relative_pathname joined onto the results home directory."""
        stored_path = self.relative_pathname
        return get_absolute_pathname(stored_path)
    pathname = property(_get_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.read_result_file'
        url_kwargs = {'key': self.random_key}
        return (view_name, (), url_kwargs)
493
494
def find_file_type_metadata_from_filename(pathname):
    """Find the first FileType whose pattern matches a file name.

    Only the final path component of ``pathname`` is tested. Returns the
    match dictionary from FileType.parse_filename with the matching
    FileType stored under 'file_type', or None if no FileType matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
505
506
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR (with its trailing separator)
    is removed; a path that does not start with it is returned unchanged.
    """
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # Slice off the prefix instead of using str.replace, which would also
    # delete any later occurrence of the same text inside the path.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
513
514
def get_absolute_pathname(relative_pathname):
    """Join a relative path onto settings.RESULT_HOME_DIR."""
    results_home = settings.RESULT_HOME_DIR
    return os.path.join(results_home, relative_pathname)