Add the ability to specify default cluster station & sequencer in the database
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
1 import datetime
2 import glob
3 import logging
4 import os
5 import re
6 import types
7 import uuid
8
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist
11 from django.core import urlresolvers
12 from django.db import models
13 from django.db.models.signals import post_init
14
15 from htsworkflow.frontend.samples.models import Library
16 from htsworkflow.util.conversion import parse_flowcell_id
17 from htsworkflow.pipelines import runfolder
18
LOGGER = logging.getLogger(__name__)

# Default value for Lane.pM; settings.DEFAULT_PM overrides it when it is
# present and convertible to an integer.
default_pM = 5
try:
    default_pM = int(settings.DEFAULT_PM)
except (ValueError, TypeError, AttributeError):
    # AttributeError: the setting is not defined at all.  Handle it the
    # same way as RESCAN_DELAY below instead of failing the module import.
    LOGGER.error("invalid value for frontend.default_pm")

# how many days to wait before trying to re-import a runfolder
RESCAN_DELAY = 1
try:
    RESCAN_DELAY = int(settings.RESCAN_DELAY)
except (ValueError, TypeError, AttributeError):
    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
                 "defaulting to %s" % (RESCAN_DELAY,))
33
# (code, label) pairs describing how far processing of a run has progressed.
RUN_STATUS_CHOICES = (
    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
    (1, 'Data Pipeline Started'),
    (2, 'Data Pipeline Interrupted'),
    (3, 'Data Pipeline Finished'),
    (4, 'Collect Results Started'),
    (5, 'Collect Results Finished'),
    (6, 'QC Started'),
    (7, 'QC Finished'),
    (255, 'DONE'),
)
# Lookup from the human readable label back to its numeric code.
RUN_STATUS_REVERSE_MAP = dict(
    (label, code) for code, label in RUN_STATUS_CHOICES)
46
47
class ClusterStation(models.Model):
    """List of cluster stations"""
    name = models.CharField(max_length=50, unique=True)
    # Marks the station that default() should return.
    isdefault = models.BooleanField(default=False, null=False)

    class Meta:
        ordering = ["-isdefault", "name"]

    def __unicode__(self):
        return unicode(self.name)

    @classmethod
    def default(cls):
        """Return the cluster station to use when none is specified.

        A station flagged ``isdefault`` wins; otherwise the station with
        the highest id is used.  Returns None when the table is empty.
        """
        # Querysets are lazy, so the second one is only evaluated when
        # the first turns out to be empty.
        candidate_sets = (
            cls.objects.filter(isdefault=True).all(),
            cls.objects.order_by('-id').all(),
        )
        for stations in candidate_sets:
            if len(stations) > 0:
                return stations[0]
        return None
68
69
class Sequencer(models.Model):
    """Sequencers we've owned
    """
    name = models.CharField(max_length=50, db_index=True)
    instrument_name = models.CharField(max_length=50, db_index=True)
    serial_number = models.CharField(max_length=50, db_index=True)
    model = models.CharField(max_length=255)
    active = models.BooleanField(default=True, null=False)
    # Marks the sequencer that default() should return.
    isdefault = models.BooleanField(default=False, null=False)
    comment = models.CharField(max_length=255)

    class Meta:
        ordering = ["-isdefault", "-active", "name"]

    def __unicode__(self):
        """Return "name (instrument_name)", or just the name."""
        name = [unicode(self.name)]
        # instrument_name is a non-null CharField, so "unset" normally
        # shows up as an empty string; skip it rather than render "()".
        if self.instrument_name:
            name.append("(%s)" % (unicode(self.instrument_name),))
        return " ".join(name)

    @models.permalink
    def get_absolute_url(self):
        return ('htsworkflow.frontend.experiments.views.sequencer',
                [self.id])

    @classmethod
    def default(cls):
        """Return the sequencer to use when none is specified.

        A sequencer flagged ``isdefault`` wins.  Otherwise active
        sequencers are preferred over inactive ones, newest (highest
        id) first.  Returns None when the table is empty.
        """
        d = cls.objects.filter(isdefault=True).all()
        if len(d) > 0:
            return d[0]
        # Descending on the boolean puts active=True first, matching
        # Meta.ordering; ascending order would pick an inactive
        # sequencer whenever one exists.
        d = cls.objects.order_by('-active', '-id').all()
        if len(d) > 0:
            return d[0]
        return None
104
105
class FlowCell(models.Model):
    """A flowcell, its run parameters and the data runs found for it."""
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32)  # Stanford is currently 25
    control_lane = models.IntegerField(choices=[(1, 1),
                                                (2, 2),
                                                (3, 3),
                                                (4, 4),
                                                (5, 5),
                                                (6, 6),
                                                (7, 7),
                                                (8, 8),
                                                (0, 'All Lanes')],
                                       null=True,
                                       blank=True)

    # The defaults are callables, so they are looked up each time a new
    # FlowCell is created rather than once at import time.
    cluster_station = models.ForeignKey(ClusterStation,
                                        default=ClusterStation.default)
    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)

    notes = models.TextField(blank=True)

    def __unicode__(self):
        return unicode(self.flowcell_id)

    def Lanes(self):
        """Return an HTML table listing the lanes of this flowcell.

        Each row shows the lane number, a link to the library's admin
        page and the cluster estimate in thousands.  The result is
        marked ``allow_tags`` so the admin renders it as HTML.
        """
        from django.utils.html import escape

        element = '<tr><td>%d</td>'\
                  '<td><a href="%s">%s</a></td><td>%s</td></tr>'
        html = ['<table>']
        for lane in self.lane_set.order_by('lane_number'):
            cluster_estimate = lane.cluster_estimate
            if cluster_estimate is not None:
                cluster_estimate = "%s k" % ((int(cluster_estimate) // 1000), )
            else:
                cluster_estimate = 'None'
            library = lane.library
            # The library label is user supplied text and this output is
            # rendered unescaped (allow_tags), so escape it here.
            html.append(element % (lane.lane_number,
                                   library.get_admin_url(),
                                   escape(unicode(library)),
                                   cluster_estimate))
        html.append('</table>')
        return "\n".join(html)
    Lanes.allow_tags = True

    class Meta:
        ordering = ["-run_date"]

    def get_admin_url(self):
        """Return the admin change-page URL for this flowcell."""
        # that's the django way... except it didn't work
        return urlresolvers.reverse('admin:experiments_flowcell_change',
                                    args=(self.id,))

    def flowcell_type(self):
        """Convert our boolean 'is paired' flag to a name
        """
        if self.paired_end:
            return u"Paired"
        else:
            return u"Single"

    @models.permalink
    def get_absolute_url(self):
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(flowcell_id)])

    def get_raw_data_directory(self):
        """Return location of where the raw data is stored"""
        flowcell_id, status = parse_flowcell_id(self.flowcell_id)

        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)

    def update_data_runs(self):
        """Scan the raw data directory and import any data runs found.

        A directory containing a file matching ``run*.xml`` is treated
        as a run directory.  Runs not yet recorded are imported; runs
        already recorded are re-imported once their last update is more
        than RESCAN_DELAY days old.
        """
        result_root = self.get_raw_data_directory()
        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
        if result_root is None:
            return

        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))

        # Runs already recorded for this flowcell, keyed by result_dir.
        dataruns = dict([(x.result_dir, x) for x in self.datarun_set.all()])

        for dirpath, dirnames, filenames in os.walk(result_root):
            for filename in filenames:
                if not run_xml_re.match(filename):
                    continue
                # we have a run directory
                relative_pathname = get_relative_pathname(dirpath)
                cached_run = dataruns.get(relative_pathname, None)
                now = datetime.datetime.now()
                if cached_run is None:
                    self.import_data_run(relative_pathname, filename)
                elif (now - cached_run.last_update_time).days > \
                         RESCAN_DELAY:
                    self.import_data_run(relative_pathname,
                                         filename, cached_run)

    def import_data_run(self, relative_pathname, run_xml_name, run=None):
        """Given a result directory import files

        :param relative_pathname: run directory, relative to the result
            home directory
        :param run_xml_name: name of the run xml file in that directory
        :param run: existing DataRun to refresh; a new one is created
            and filled in from the run xml when this is None
        """
        run_dir = get_absolute_pathname(relative_pathname)
        run_xml_path = os.path.join(run_dir, run_xml_name)
        run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
        LOGGER.debug("Importing run from %s" % (relative_pathname,))

        if run is None:
            run = DataRun()
            run.flowcell = self
            # The DataRun field is named run_status; assigning to
            # "status" only set a plain attribute that was never saved.
            run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
            run.result_dir = relative_pathname
            run.runfolder_name = run_xml_data.runfolder_name
            run.cycle_start = run_xml_data.image_analysis.start
            run.cycle_stop = run_xml_data.image_analysis.stop
            run.run_start_time = run_xml_data.image_analysis.date
            run.image_software = run_xml_data.image_analysis.software
            run.image_version = run_xml_data.image_analysis.version
            run.basecall_software = run_xml_data.bustard.software
            run.basecall_version = run_xml_data.bustard.version
            run.alignment_software = run_xml_data.gerald.software
            run.alignment_version = run_xml_data.gerald.version

        run.last_update_time = datetime.datetime.now()
        run.save()

        run.update_result_files()
233
234
235 # FIXME: should we automatically update dataruns?
236 #        Or should we expect someone to call update_data_runs?
237 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
238 #    """Update our dataruns
239 #    """
240 #    if not os.path.exists(settings.RESULT_HOME_DIR):
241 #       return
242 #
243 #    instance.update_data_runs()
244 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
245
246
# (code, label) pairs offered as choices for Lane.status.
LANE_STATUS_CODES = [
    (0, 'Failed'),
    (1, 'Marginal'),
    (2, 'Good'),
]
# Code -> label lookup; None stands for a lane without a status.
LANE_STATUS_MAP = dict(
    (int(code), label) for code, label in LANE_STATUS_CODES)
LANE_STATUS_MAP[None] = "Unknown"
252
253
def is_valid_lane(value):
    """Return True when value lies in the range 1 through 8 inclusive."""
    return bool(1 <= value <= 8)
259
260
class Lane(models.Model):
    """Associates a lane number on a FlowCell with a Library."""
    flowcell = models.ForeignKey(FlowCell)
    lane_number = models.IntegerField()
    library = models.ForeignKey(Library)
    pM = models.DecimalField(
        max_digits=5,
        decimal_places=2,
        blank=False,
        null=False,
        default=default_pM,
    )
    cluster_estimate = models.IntegerField(blank=True, null=True)
    status = models.IntegerField(
        choices=LANE_STATUS_CODES,
        null=True,
        blank=True,
    )
    comment = models.TextField(null=True, blank=True)

    @models.permalink
    def get_absolute_url(self):
        view_name = \
            'htsworkflow.frontend.experiments.views.flowcell_lane_detail'
        return (view_name, [str(self.id)])

    def __unicode__(self):
        """Return "<flowcell id>:<lane number>"."""
        flowcell_label = self.flowcell.flowcell_id
        lane_label = unicode(self.lane_number)
        return flowcell_label + ':' + lane_label
283
284
class DataRun(models.Model):
    """One processed run of a flowcell and the result files found for it."""
    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
    runfolder_name = models.CharField(max_length=50)
    # Stored relative to settings.RESULT_HOME_DIR.
    result_dir = models.CharField(max_length=255)
    last_update_time = models.DateTimeField()
    run_start_time = models.DateTimeField()
    cycle_start = models.IntegerField(null=True, blank=True)
    cycle_stop = models.IntegerField(null=True, blank=True)
    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                     null=True, blank=True)
    image_software = models.CharField(max_length=50)
    image_version = models.CharField(max_length=50)
    basecall_software = models.CharField(max_length=50)
    basecall_version = models.CharField(max_length=50)
    alignment_software = models.CharField(max_length=50)
    alignment_version = models.CharField(max_length=50)
    comment = models.TextField(blank=True)

    def update_result_files(self):
        """Record every recognized file below the result directory.

        Files already attached to this run are skipped.  A file is
        recognized when some FileType pattern matches its name; when the
        match carries a lane number, the library of that lane is
        attached to the new DataFile.

        NOTE: last_update_time is changed on the instance but not saved
        here.
        """
        abs_result_dir = get_absolute_pathname(self.result_dir)

        for dirname, dirnames, filenames in os.walk(abs_result_dir):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                relative_pathname = get_relative_pathname(pathname)
                datafiles = self.datafile_set.filter(
                    data_run=self,
                    relative_pathname=relative_pathname)
                if len(datafiles) > 0:
                    continue

                metadata = find_file_type_metadata_from_filename(filename)
                if metadata is None:
                    continue

                newfile = DataFile()
                newfile.data_run = self
                newfile.file_type = metadata['file_type']
                newfile.relative_pathname = relative_pathname

                lane_number = metadata.get('lane', None)
                if lane_number is not None:
                    lane = self.flowcell.lane_set.get(
                        lane_number=lane_number)
                    newfile.library = lane.library

                self.datafile_set.add(newfile)

        self.last_update_time = datetime.datetime.now()

    def lane_files(self):
        """Group this run's data files by lane.

        :returns: dict mapping lane number to a dict that maps the
            normalized file type name to the DataFile.  Files whose
            name carries no lane number are left out.
        """
        lanes = {}

        for datafile in self.datafile_set.all():
            metadata = datafile.attributes
            if metadata is not None:
                lane = metadata.get('lane', None)
                if lane is not None:
                    lane_file_set = lanes.setdefault(lane, {})
                    normalized_name = datafile.file_type.normalized_name
                    lane_file_set[normalized_name] = datafile
        return lanes

    def ivc_plots(self, lane):
        """Return the IVC plot files recorded for one lane.

        :param lane: lane number, as an int (the form in which
            FileType.parse_filename reports it)
        :returns: dict mapping file type name to a
            (relative_pathname, datafile) tuple
        """
        ivc_name = ['IVC All', 'IVC Call',
                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']

        plots = {}
        # This class has no get_result_files(); walk the recorded data
        # files directly, which carry the same path and file type.
        for datafile in self.datafile_set.all():
            type_name = datafile.file_type.name
            if type_name not in ivc_name:
                continue
            attributes = datafile.attributes
            if attributes is None or attributes.get('lane', None) != lane:
                continue
            plots[type_name] = (datafile.relative_pathname, datafile)
        return plots
355
356
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.CharField(max_length=50, null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component is matched against the pattern.

        Returns None if not, or dictionary of match variables if we do.
        A file type without a pattern never matches.
        """
        # regex is nullable, so it may be None as well as empty.
        if not self.regex:
            return None

        filename = os.path.basename(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are (?P<>) names we know about from our
        # default regexes.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name, None)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)

    def __unicode__(self):
        #return u"<FileType: %s>" % (self.name,)
        return self.name
398
399
def str_uuid():
    """Return a freshly generated UUID (version 1) as a string.

    Used as the default for DataFile.random_key.
    """
    new_id = uuid.uuid1()
    return str(new_id)
403
404
class DataFile(models.Model):
    """Store map from random ID to filename"""
    random_key = models.CharField(
        max_length=64,
        db_index=True,
        default=str_uuid,
    )
    data_run = models.ForeignKey(DataRun, db_index=True)
    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
    file_type = models.ForeignKey(FileType)
    # Stored relative to the result home directory.
    relative_pathname = models.CharField(max_length=255, db_index=True)

    def _get_attributes(self):
        """Return what the file type's pattern extracts from our name."""
        matcher = self.file_type
        return matcher.parse_filename(self.relative_pathname)
    attributes = property(_get_attributes)

    def _get_pathname(self):
        """Return the absolute location of this file."""
        stored_path = self.relative_pathname
        return get_absolute_pathname(stored_path)
    pathname = property(_get_pathname)

    @models.permalink
    def get_absolute_url(self):
        view_name = 'htsworkflow.frontend.experiments.views.read_result_file'
        return (view_name, (), {'key': self.random_key})
427
428
def find_file_type_metadata_from_filename(pathname):
    """Return the match data of the first FileType that accepts pathname.

    The returned dictionary holds whatever FileType.parse_filename
    extracted plus the matching type under the 'file_type' key.
    Returns None when no file type matches.
    """
    basename = os.path.basename(pathname)
    for candidate in FileType.objects.all():
        attributes = candidate.parse_filename(basename)
        if attributes is None:
            continue
        attributes['file_type'] = candidate
        return attributes

    return None
439
440
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading occurrence of the result home directory is removed;
    a path that does not start with it is returned unchanged.
    """
    # Joining with '' guarantees a trailing separator on the prefix.
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # str.replace would also delete later occurrences of the prefix
    # text inside the path, so slice the prefix off instead.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
447
448
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto the results home directory."""
    result_home = settings.RESULT_HOME_DIR
    return os.path.join(result_home, relative_pathname)