htsworkflow/frontend/experiments/models.py

   1 import datetime
   2 import glob
   3 import logging
   4 import os
   5 import re
   6 import types
   7 import uuid
   8
   9 from django.conf import settings
  10 from django.core.exceptions import ObjectDoesNotExist
  11 from django.core import urlresolvers
  12 from django.db import models
  13 from django.db.models.signals import post_init
  14
  15 from htsworkflow.frontend.samples.models import Library
  16 from htsworkflow.util.conversion import parse_flowcell_id
  17 from htsworkflow.pipelines import runfolder
  18
  19 logger = logging.getLogger(__name__)
  20 default_pM = 5
  21 try:
  22     default_pM = int(settings.DEFAULT_PM)
  23 except ValueError,e:
  24     logger.error("invalid value for frontend.default_pm")
  25
  26 # how many days to wait before trying to re-import a runfolder
  27 RESCAN_DELAY = 1
  28 try:
  29     RESCAN_DELAY = int(settings.RESCAN_DELAY)
  30 except (ValueError, AttributeError):
  31     logger.error("Missing or invalid settings.RESCAN_DELAY, "\
  32                  "defaulting to %s" % (RESCAN_DELAY,))
  33
  34 RUN_STATUS_CHOICES = (
  35     (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
  36     (1, 'Data Pipeline Started'),
  37     (2, 'Data Pipeline Interrupted'),
  38     (3, 'Data Pipeline Finished'),
  39     (4, 'Collect Results Started'),
  40     (5, 'Collect Results Finished'),
  41     (6, 'QC Started'),
  42     (7, 'QC Finished'),
  43     (255, 'DONE'),
  44   )
  45 RUN_STATUS_REVERSE_MAP = dict(((v,k) for k,v in RUN_STATUS_CHOICES))
  46
  47 class ClusterStation(models.Model):
  48   name = models.CharField(max_length=50, unique=True)
  49
  50   def __unicode__(self):
  51     return unicode(self.name)
  52
  53 class Sequencer(models.Model):
  54   name = models.CharField(max_length=50, db_index=True)
  55   instrument_name = models.CharField(max_length=50, db_index=True)
  56   serial_number = models.CharField(max_length=50, db_index=True)
  57   model = models.CharField(max_length=255)
  58   active = models.BooleanField(default=True, null=False)
  59   comment = models.CharField(max_length=255)
  60
  61   class Meta:
  62     ordering = ["-active", "name"]
  63
  64   def __unicode__(self):
  65       name = [unicode(self.name)]
  66       if self.instrument_name is not None:
  67           name.append("(%s)" % (unicode(self.instrument_name),))
  68       return " ".join(name)
  69
  70
  71   @models.permalink
  72   def get_absolute_url(self):
  73       return ('htsworkflow.frontend.experiments.views.sequencer',
  74               [self.id])
  75
  76
  77 class FlowCell(models.Model):
  78   flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
  79   run_date = models.DateTimeField()
  80   advanced_run = models.BooleanField(default=False)
  81   paired_end = models.BooleanField(default=False)
  82   read_length = models.IntegerField(default=32) #Stanford is currenlty 25
  83   control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True, blank=True)
  84
  85   cluster_station = models.ForeignKey(ClusterStation, default=3)
  86   sequencer = models.ForeignKey(Sequencer, default=1)
  87
  88   notes = models.TextField(blank=True)
  89
  90   def __unicode__(self):
  91       return unicode(self.flowcell_id)
  92
  93   def Lanes(self):
  94     html = ['<table>']
  95     for lane in self.lane_set.order_by('lane_number'):
  96         cluster_estimate = lane.cluster_estimate
  97         if cluster_estimate is not None:
  98             cluster_estimate = "%s k" % ((int(cluster_estimate)/1000), )
  99         else:
 100             cluster_estimate = 'None'
 101         library_id = lane.library_id
 102         library = lane.library
 103         element = '<tr><td>%d</td><td><a href="%s">%s</a></td><td>%s</td></tr>'
 104         html.append(element % (lane.lane_number,
 105                                library.get_admin_url(),
 106                                library,
 107                                cluster_estimate))
 108     html.append('</table>')
 109     return "\n".join(html)
 110   Lanes.allow_tags = True
 111
 112   class Meta:
 113     ordering = ["-run_date"]
 114
 115   def get_admin_url(self):
 116     # that's the django way... except it didn't work
 117     return urlresolvers.reverse('admin:experiments_flowcell_change',
 118                                 args=(self.id,))
 119
 120   def flowcell_type(self):
 121     """
 122     Convert our boolean 'is paired' flag to a name
 123     """
 124     if self.paired_end:
 125       return u"Paired"
 126     else:
 127       return u"Single"
 128
 129   @models.permalink
 130   def get_absolute_url(self):
 131       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 132       return ('htsworkflow.frontend.experiments.views.flowcell_detail',
 133               [str(flowcell_id)])
 134
 135   def get_raw_data_directory(self):
 136       """Return location of where the raw data is stored"""
 137       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 138
 139       return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
 140
 141   def update_data_runs(self):
 142       result_root = self.get_raw_data_directory()
 143       logger.debug("Update data runs flowcell root: %s" % (result_root,))
 144       if result_root is None:
 145           return
 146
 147       result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
 148       run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
 149
 150       dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ])
 151
 152       result_dirs = []
 153       for dirpath, dirnames, filenames in os.walk(result_root):
 154           for filename in filenames:
 155               if run_xml_re.match(filename):
 156                   # we have a run directory
 157                   relative_pathname = get_relative_pathname(dirpath)
 158                   cached_run = dataruns.get(relative_pathname, None)
 159                   now = datetime.datetime.now()
 160                   if (cached_run is None):
 161                       self.import_data_run(relative_pathname, filename)
 162                   elif (now - cached_run.last_update_time).days > RESCAN_DELAY:
 163                       self.import_data_run(relative_pathname,
 164                                            filename, cached_run)
 165
 166   def import_data_run(self, relative_pathname, run_xml_name, run=None):
 167       """Given a result directory import files"""
 168       run_dir = get_absolute_pathname(relative_pathname)
 169       run_xml_path = os.path.join(run_dir, run_xml_name)
 170       run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
 171       logger.debug("Importing run from %s" % (relative_pathname,))
 172
 173       if run is None:
 174           run = DataRun()
 175           run.flowcell = self
 176           run.status = RUN_STATUS_REVERSE_MAP['DONE']
 177           run.result_dir = relative_pathname
 178           run.runfolder_name = run_xml_data.runfolder_name
 179           run.cycle_start = run_xml_data.image_analysis.start
 180           run.cycle_stop = run_xml_data.image_analysis.stop
 181           run.run_start_time = run_xml_data.image_analysis.date
 182           run.image_software = run_xml_data.image_analysis.software
 183           run.image_version = run_xml_data.image_analysis.version
 184           run.basecall_software = run_xml_data.bustard.software
 185           run.basecall_version = run_xml_data.bustard.version
 186           run.alignment_software = run_xml_data.gerald.software
 187           run.alignment_version = run_xml_data.gerald.version
 188
 189       run.last_update_time = datetime.datetime.now()
 190       run.save()
 191
 192       run.update_result_files()
 193
 194
 195 # FIXME: should we automatically update dataruns?
 196 #        Or should we expect someone to call update_data_runs?
 197 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
 198 #    """Update our dataruns
 199 #    """
 200 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 201 #       return
 202 #
 203 #    instance.update_data_runs()
 204 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 205
 206
 207
 208 LANE_STATUS_CODES = [(0, 'Failed'),
 209                     (1, 'Marginal'),
 210                     (2, 'Good'),]
 211 LANE_STATUS_MAP = dict((int(k),v) for k,v in LANE_STATUS_CODES )
 212 LANE_STATUS_MAP[None] = "Unknown"
 213
 214 def is_valid_lane(value):
 215     if value >= 1 and value <= 8:
 216         return True
 217     else:
 218           return False
 219
 220 class Lane(models.Model):
 221   flowcell = models.ForeignKey(FlowCell)
 222   lane_number = models.IntegerField()
 223   library = models.ForeignKey(Library)
 224   pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM)
 225   cluster_estimate = models.IntegerField(blank=True, null=True)
 226   status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True)
 227   comment = models.TextField(null=True, blank=True)
 228
 229   @models.permalink
 230   def get_absolute_url(self):
 231        return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail',
 232                [str(self.id)])
 233
 234   def __unicode__(self):
 235       return self.flowcell.flowcell_id + ':' + unicode(self.lane_number)
 236
 237 ### -----------------------
 238 class DataRun(models.Model):
 239     flowcell = models.ForeignKey(FlowCell,verbose_name="Flowcell Id")
 240     runfolder_name = models.CharField(max_length=50)
 241     result_dir = models.CharField(max_length=255)
 242     last_update_time = models.DateTimeField()
 243     run_start_time = models.DateTimeField()
 244     cycle_start = models.IntegerField(null=True, blank=True)
 245     cycle_stop = models.IntegerField(null=True, blank=True)
 246     run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
 247                                      null=True, blank=True)
 248     image_software = models.CharField(max_length=50)
 249     image_version = models.CharField(max_length=50)
 250     basecall_software = models.CharField(max_length=50)
 251     basecall_version = models.CharField(max_length=50)
 252     alignment_software = models.CharField(max_length=50)
 253     alignment_version = models.CharField(max_length=50)
 254     comment = models.TextField(blank=True)
 255
 256     def update_result_files(self):
 257         abs_result_dir = get_absolute_pathname(self.result_dir)
 258
 259         for dirname, dirnames, filenames in os.walk(abs_result_dir):
 260             for filename in filenames:
 261                 pathname = os.path.join(dirname, filename)
 262                 relative_pathname = get_relative_pathname(pathname)
 263                 datafiles = self.datafile_set.filter(
 264                   data_run = self,
 265                   relative_pathname=relative_pathname)
 266                 if len(datafiles) > 0:
 267                     continue
 268
 269                 metadata = find_file_type_metadata_from_filename(filename)
 270                 if metadata is not None:
 271                     metadata['filename'] = filename
 272                     newfile = DataFile()
 273                     newfile.data_run = self
 274                     newfile.file_type = metadata['file_type']
 275                     newfile.relative_pathname = relative_pathname
 276
 277                     lane_number = metadata.get('lane', None)
 278                     if lane_number is not None:
 279                         lane = self.flowcell.lane_set.get(lane_number = lane_number)
 280                         newfile.library = lane.library
 281
 282                     self.datafile_set.add(newfile)
 283
 284         self.last_update_time = datetime.datetime.now()
 285
 286     def lane_files(self):
 287         lanes = {}
 288
 289         for datafile in self.datafile_set.all():
 290             metadata = datafile.attributes
 291             if metadata is not None:
 292                 lane = metadata.get('lane', None)
 293                 if lane is not None:
 294                     lane_file_set = lanes.setdefault(lane, {})
 295                     lane_file_set[datafile.file_type.normalized_name] = datafile
 296         return lanes
 297
 298     def ivc_plots(self, lane):
 299         ivc_name = ['IVC All', 'IVC Call',
 300                     'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
 301
 302         plots = {}
 303         for rel_filename, metadata in self.get_result_files():
 304             if metadata.file_type.name in ivc_name:
 305                 plots[metadata.file_type.name] = (rel_filename, metadata)
 306
 307 class FileType(models.Model):
 308     """Represent potential file types
 309
 310     regex is a pattern used to detect if a filename matches this type
 311     data run currently assumes that there may be a (?P<lane>) and
 312     (?P<end>) pattern in the regular expression.
 313     """
 314     name = models.CharField(max_length=50)
 315     mimetype = models.CharField(max_length=50, null=True, blank=True)
 316     # regular expression from glob.fnmatch.translate
 317     regex = models.CharField(max_length=50, null=True, blank=True)
 318
 319     def parse_filename(self, pathname):
 320         """Does filename match our pattern?
 321
 322         Returns None if not, or dictionary of match variables if we do.
 323         """
 324         path, filename = os.path.split(pathname)
 325         if len(self.regex) > 0:
 326             match = re.match(self.regex, filename)
 327             if match is not None:
 328                 # These are (?P<>) names we know about from our default regexes.
 329                 results = match.groupdict()
 330
 331                 # convert int parameters
 332                 for attribute_name in ['lane', 'end']:
 333                     value = results.get(attribute_name, None)
 334                     if value is not None:
 335                         results[attribute_name] = int(value)
 336
 337                 return results
 338
 339     def _get_normalized_name(self):
 340         """Crush data file name into identifier friendly name"""
 341         return self.name.replace(' ', '_').lower()
 342     normalized_name = property(_get_normalized_name)
 343
 344     def __unicode__(self):
 345         #return u"<FileType: %s>" % (self.name,)
 346         return self.name
 347
 348 def str_uuid():
 349     """Helper function to set default UUID in DataFile"""
 350     return str(uuid.uuid1())
 351
 352 class DataFile(models.Model):
 353     """Store map from random ID to filename"""
 354     random_key = models.CharField(max_length=64,
 355                                   db_index=True,
 356                                   default=str_uuid)
 357     data_run = models.ForeignKey(DataRun, db_index=True)
 358     library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
 359     file_type = models.ForeignKey(FileType)
 360     relative_pathname = models.CharField(max_length=255, db_index=True)
 361
 362     def _get_attributes(self):
 363         return self.file_type.parse_filename(self.relative_pathname)
 364     attributes = property(_get_attributes)
 365
 366     def _get_pathname(self):
 367         return get_absolute_pathname(self.relative_pathname)
 368     pathname = property(_get_pathname)
 369
 370     @models.permalink
 371     def get_absolute_url(self):
 372         return ('htsworkflow.frontend.experiments.views.read_result_file',
 373                 (), {'key': self.random_key })
 374
 375 def find_file_type_metadata_from_filename(pathname):
 376     path, filename = os.path.split(pathname)
 377     result = None
 378     for file_type in FileType.objects.all():
 379         result = file_type.parse_filename(filename)
 380         if result is not None:
 381             result['file_type'] = file_type
 382             return result
 383
 384     return None
 385
 386 def get_relative_pathname(abspath):
 387     """Strip off the result home directory from a path
 388     """
 389     result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
 390     relative_pathname = abspath.replace(result_home_dir,'')
 391     return relative_pathname
 392
 393 def get_absolute_pathname(relative_pathname):
 394     """Attach relative path to  results home directory"""
 395     return os.path.join(settings.RESULT_HOME_DIR, relative_pathname)
 396