flake8 whitespace cleanup
[htsworkflow.git] / experiments / models.py
1 from __future__ import absolute_import, print_function, unicode_literals
2
3 import datetime
4 import glob
5 import logging
6 import os
7 import re
8 import types
9 import uuid
10
11 from django.conf import settings
12 from django.core.exceptions import ObjectDoesNotExist
13 from django.core import urlresolvers
14 from django.utils import timezone
15 from django.db import models
16 from django.db.models.signals import post_init, pre_save
17
18 from samples.models import Library
19 from htsworkflow.util.conversion import parse_flowcell_id
20 from htsworkflow.pipelines import runfolder
21
22 import pytz
23
24 LOGGER = logging.getLogger(__name__)
# Fallback used as the default for Lane.pM when settings.DEFAULT_PM is
# missing or cannot be converted to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (AttributeError, TypeError, ValueError):
    # A missing setting raises AttributeError; a value int() cannot convert
    # raises TypeError/ValueError.  Keep the fallback instead of failing
    # while the module is being imported.
    LOGGER.error("invalid value for default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (AttributeError, TypeError, ValueError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "
                 "defaulting to %s" % (RESCAN_DELAY,))
38
# (code, label) pairs describing how far a data run has progressed.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# Look up a status code from its label, e.g. RUN_STATUS_REVERSE_MAP['DONE'].
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
51
52
class ClusterStation(models.Model):
    """A cluster station; one of them may be flagged as the default."""
    name = models.CharField(max_length=50, unique=True)
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __str__(self):
        return str(self.name)

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler that clears isdefault on the other stations.

        Does nothing unless the instance being saved is itself flagged
        as the default.
        """
        if not instance.isdefault:
            return

        current_defaults = ClusterStation.objects.filter(isdefault=True).all()
        for station in current_defaults:
            if station.id == instance.id:
                continue
            station.isdefault = False
            station.save()
73
def cluster_station_default():
    """Choose the ClusterStation to use as a default value.

    The first station flagged isdefault wins; otherwise the station with
    the highest id is used.  Returns None when there are no stations.
    """
    # Querysets are lazy, so building both here does not query the database
    # until each one is measured in the loop below.
    candidates = (
        ClusterStation.objects.filter(isdefault=True).all(),
        ClusterStation.objects.order_by('-id').all(),
    )
    for stations in candidates:
        if len(stations) > 0:
            return stations[0]
    return None
82
# Run ClusterStation.update_isdefault before each ClusterStation save so that
# saving a station flagged as default clears the flag on the other stations.
pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
84
class Sequencer(models.Model):
    """Sequencers we've owned
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __str__(self):
        """Return the name, followed by the instrument name in parentheses."""
        label = str(self.name)
        if self.instrument_name is None:
            return label
        return " ".join([label, "(%s)" % (str(self.instrument_name),)])

    @models.permalink
    def get_absolute_url(self):
        view_args = [self.id]
        return ('experiments.views.sequencer', view_args)

    @staticmethod
    def update_isdefault(sender, instance, **kwargs):
        """Signal handler that clears isdefault on the other sequencers.

        Does nothing unless the instance being saved is itself flagged
        as the default.
        """
        if not instance.isdefault:
            return

        current_defaults = Sequencer.objects.filter(isdefault=True).all()
        for sequencer in current_defaults:
            if sequencer.id == instance.id:
                continue
            sequencer.isdefault = False
            sequencer.save()
119
# Run Sequencer.update_isdefault before each Sequencer save so that saving a
# sequencer flagged as default clears the flag on the other sequencers.
pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
121
def sequencer_default():
    """Choose the Sequencer to use as a default value.

    The first sequencer flagged isdefault wins; otherwise the first one
    ordered by ('active', '-id') is used.  Returns None when there are no
    sequencers.
    """
    # NOTE(review): the fallback sorts 'active' ascending, while
    # Sequencer.Meta sorts '-active' -- confirm which direction is intended.
    candidates = (
        Sequencer.objects.filter(isdefault=True).all(),
        Sequencer.objects.order_by('active', '-id').all(),
    )
    for sequencers in candidates:
        if len(sequencers) > 0:
            return sequencers[0]
    return None
130
131
class FlowCell(models.Model):
    """A flowcell: its id, run date, read settings and the hardware used."""
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    cluster_station = models.ForeignKey(ClusterStation,
                                        default=cluster_station_default)
    sequencer = models.ForeignKey(Sequencer, default=sequencer_default)

    notes = models.TextField(blank=True)

    def __str__(self):
        return str(self.flowcell_id)

    def Lanes(self):
        """Render this flowcell's lanes as an HTML table.

        One row per lane, ordered by lane number, holding the lane number,
        a link to the library's admin page and the cluster estimate in
        thousands ('None' when no estimate is recorded).
        """
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                # Floor division keeps a whole number of thousands on
                # Python 3, where "/" would produce a float.
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            element = '<tr><td>%d</td>'\
                      '<td><a href="%s">%s</a></td><td>%s</td></tr>'
            html.append(element % (lane.lane_number,
                                   library.get_admin_url(),
                                   library,
                                   cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return "Paired"
        else:
            return "Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Import every run found under this flowcell's raw data directory.

        Walks the raw data directory and calls import_data_run once for
        each file whose name matches run*.xml.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if run_xml_re.match(filename):
                    # we have a run directory
                    relative_pathname = get_relative_pathname(dirpath)
                    self.import_data_run(relative_pathname, filename)

    def import_data_run(self, relative_pathname, run_xml_name, force=False):
        """Given a result directory import files

        relative_pathname is the run directory relative to the result home
        directory and run_xml_name is the run xml file inside it.  An
        existing DataRun is only re-imported when force is true or its
        last update is more than RESCAN_DELAY days old.

        Raises RuntimeError when more than one DataRun already records
        relative_pathname.
        """
        now = timezone.now()
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)

        runs = DataRun.objects.filter(result_dir=relative_pathname)
        if len(runs) == 0:
            run = DataRun()
            created = True
        elif len(runs) > 1:
            raise RuntimeError("Too many data runs for %s" % (
                relative_pathname,))
        else:
            run = runs[0]
            created = False

        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
            LOGGER.debug("Importing run from %s" % (relative_pathname,))
            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
            run.flowcell = self
            # The DataRun field is named run_status; assigning to "status"
            # only set a plain attribute that was never saved.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            # Turn the analysis date into a midnight datetime, then attach
            # the site's configured timezone.
            naive_run_start_time = datetime.datetime.fromordinal(
                run_xml_data.image_analysis.date.toordinal())
            run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(
                naive_run_start_time)
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            # we're frequently not running alignments
            if run_xml_data.gerald:
                run.alignment_software = run_xml_data.gerald.software
                run.alignment_version = run_xml_data.gerald.version

            run.last_update_time = timezone.now()
            run.save()

            run.update_result_files()
263
264             run.update_result_files()
265
266
267 # FIXME: should we automatically update dataruns?
268 #        Or should we expect someone to call update_data_runs?
269 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
270 #    """Update our dataruns
271 #    """
272 #    if not os.path.exists(settings.RESULT_HOME_DIR):
273 #       return
274 #
275 #    instance.update_data_runs()
276 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
277
# (code, label) pairs used as the choices for Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]
# Label lookup by integer code; a lane with no status maps to "Unknown".
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
LANE_STATUS_MAP[None] = "Unknown"
283
284
def is_valid_lane(value):
    """Return True when value lies in the range 1 through 8 inclusive."""
    return bool(value >= 1 and value <= 8)
290
291
class Lane(models.Model):
    """One numbered lane of a flowcell and the library assigned to it."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    pM = models.DecimalField(
        max_digits=5,
        decimal_places=2,
        blank=False,
        null=False,
        default=default_pM)
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(
        choices=LANE_STATUS_CODES,
        null=True,
        blank=True)
    comment = models.TextField(null=True, blank=True)

    def __str__(self):
        """Return '<flowcell id>:<lane number>'."""
        lane_label = str(self.lane_number)
        return self.flowcell.flowcell_id + ':' + lane_label

    @models.permalink
    def get_absolute_url(self):
        view_args = [str(self.id)]
        return ('experiments.views.flowcell_lane_detail', view_args)
314
315
class DataRun(models.Model):
    """One imported result directory of a flowcell and the software used."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Record a DataFile for each recognized file under result_dir.

        Files already recorded for this run, and files that match no
        FileType, are skipped.  last_update_time is refreshed on the
        instance, but this method does not call save().
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                datafiles = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname)
                if len(datafiles) > 0:
                    # already recorded on an earlier scan
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is not None:
                    metadata['filename'] = filename
                    newfile = DataFile()
                    newfile.data_run = self
                    newfile.file_type = metadata['file_type']
                    newfile.relative_pathname = relative_pathname

                    # Attach the library only when the filename carried a
                    # lane number.
                    lane_number = metadata.get('lane', None)
                    if lane_number is not None:
                        lane = self.flowcell.lane_set.get(
                            lane_number=lane_number)
                        newfile.library = lane.library

                    self.datafile_set.add(newfile)

        self.last_update_time = timezone.now()

    def lane_files(self):
        """Group this run's data files by lane.

        Returns a dict mapping each lane to a dict of
        {file type normalized_name: DataFile}.  Files whose attributes
        contain no 'lane' value are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Collect the IVC plot files of this run.

        Returns a dict mapping the file type name to a
        (rel_filename, metadata) tuple.  The lane argument is currently
        not used to filter the result.
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        # NOTE(review): get_result_files is not defined on this class in
        # this file -- confirm where it is expected to come from.
        for rel_filename, metadata in self.get_result_files():
            if metadata.file_type.name in ivc_name:
                plots[metadata.file_type.name] = (rel_filename, metadata)
        # Previously the collected plots were built and then discarded.
        return plots
386
387
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of pathname is matched.

        Returns None if not, or dictionary of match variables if we do.
        The 'lane' and 'end' groups, when present, are converted to int.
        """
        # regex is nullable, so guard against None as well as ''.
        if not self.regex:
            return None

        path, filename = os.path.split(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ['lane', 'end']:
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __str__(self):
        return self.name
429
430
def str_uuid():
    """Return a freshly generated uuid1 as text (DataFile.random_key default)."""
    generated = uuid.uuid1()
    return str(generated)
434
435
class DataFile(models.Model):
    """Store map from random ID to filename"""
    random_key = models.CharField(
        max_length=64,
        db_index=True,
        default=str_uuid)
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    relative_pathname = models.CharField(max_length=255, db_index=True)

    def _get_attributes(self):
        """Return what our file type parses out of relative_pathname."""
        parser = self.file_type
        return parser.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        """Return relative_pathname resolved by get_absolute_pathname."""
        return get_absolute_pathname(self.relative_pathname)
    pathname = property(_get_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_kwargs = {'key': self.random_key}
        return ('experiments.views.read_result_file', (), view_kwargs)
458
459
def find_file_type_metadata_from_filename(pathname):
    """Find the first FileType whose pattern matches the file's name.

    Returns that type's parsed attributes with the matching FileType added
    under the 'file_type' key, or None when no FileType matches.
    """
    filename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
470
471
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR is removed; a path that does
    not start with it is returned unchanged.
    """
    # Joining with '' ensures the prefix ends with a path separator.
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # str.replace would also delete later occurrences of the prefix inside
    # the path, so strip it from the front only.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
478
479
def get_absolute_pathname(relative_pathname):
    """Join relative_pathname onto settings.RESULT_HOME_DIR."""
    result_home_dir = settings.RESULT_HOME_DIR
    return os.path.join(result_home_dir, relative_pathname)