htsworkflow/frontend/experiments/models.py

   1 import datetime
   2 import glob
   3 import logging
   4 import os
   5 import re
   6 import types
   7 import uuid
   8
   9 from django.conf import settings
  10 from django.core.exceptions import ObjectDoesNotExist
  11 from django.core import urlresolvers
  12 from django.db import models
  13 from django.db.models.signals import post_init
  14
  15 from htsworkflow.frontend.samples.models import Library
  16 from htsworkflow.util.conversion import parse_flowcell_id
  17 from htsworkflow.pipelines import runfolder
  18
  19 logger = logging.getLogger(__name__)
  20 default_pM = 5
  21 try:
  22     default_pM = int(settings.DEFAULT_PM)
  23 except ValueError,e:
  24     logger.error("invalid value for frontend.default_pm")
  25
  26 # how many days to wait before trying to re-import a runfolder
  27 RESCAN_DELAY = 1
  28 try:
  29     RESCAN_DELAY = int(settings.RESCAN_DELAY)
  30 except (ValueError, AttributeError):
  31     logger.error("Missing or invalid settings.RESCAN_DELAY, "\
  32                  "defaulting to %s" % (RESCAN_DELAY,))
  33
  34 RUN_STATUS_CHOICES = (
  35     (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
  36     (1, 'Data Pipeline Started'),
  37     (2, 'Data Pipeline Interrupted'),
  38     (3, 'Data Pipeline Finished'),
  39     (4, 'Collect Results Started'),
  40     (5, 'Collect Results Finished'),
  41     (6, 'QC Started'),
  42     (7, 'QC Finished'),
  43     (255, 'DONE'),
  44   )
  45 RUN_STATUS_REVERSE_MAP = dict(((v,k) for k,v in RUN_STATUS_CHOICES))
  46
  47 class ClusterStation(models.Model):
  48   name = models.CharField(max_length=50, unique=True)
  49
  50   def __unicode__(self):
  51     return unicode(self.name)
  52
  53 class Sequencer(models.Model):
  54   name = models.CharField(max_length=50, db_index=True)
  55   instrument_name = models.CharField(max_length=50, db_index=True)
  56   serial_number = models.CharField(max_length=50, db_index=True)
  57   model = models.CharField(max_length=255)
  58   comment = models.CharField(max_length=255)
  59
  60   def __unicode__(self):
  61       name = [unicode(self.name)]
  62       if self.instrument_name is not None:
  63           name.append("(%s)" % (unicode(self.instrument_name),))
  64       return " ".join(name)
  65
  66   @models.permalink
  67   def get_absolute_url(self):
  68       return ('htsworkflow.frontend.experiments.views.sequencer',
  69               [self.id])
  70
  71
  72 class FlowCell(models.Model):
  73   flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
  74   run_date = models.DateTimeField()
  75   advanced_run = models.BooleanField(default=False)
  76   paired_end = models.BooleanField(default=False)
  77   read_length = models.IntegerField(default=32) #Stanford is currenlty 25
  78   control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True, blank=True)
  79
  80   cluster_station = models.ForeignKey(ClusterStation, default=3)
  81   sequencer = models.ForeignKey(Sequencer, default=1)
  82
  83   notes = models.TextField(blank=True)
  84
  85   def __unicode__(self):
  86       return unicode(self.flowcell_id)
  87
  88   def Lanes(self):
  89     html = ['<table>']
  90     for lane in self.lane_set.order_by('lane_number'):
  91         cluster_estimate = lane.cluster_estimate
  92         if cluster_estimate is not None:
  93             cluster_estimate = "%s k" % ((int(cluster_estimate)/1000), )
  94         else:
  95             cluster_estimate = 'None'
  96         library_id = lane.library_id
  97         library = lane.library
  98         element = '<tr><td>%d</td><td><a href="%s">%s</a></td><td>%s</td></tr>'
  99         html.append(element % (lane.lane_number,
 100                                library.get_admin_url(),
 101                                library,
 102                                cluster_estimate))
 103     html.append('</table>')
 104     return "\n".join(html)
 105   Lanes.allow_tags = True
 106
 107   class Meta:
 108     ordering = ["-run_date"]
 109
 110   def get_admin_url(self):
 111     # that's the django way... except it didn't work
 112     return urlresolvers.reverse('admin:experiments_flowcell_change',
 113                                 args=(self.id,))
 114
 115   def flowcell_type(self):
 116     """
 117     Convert our boolean 'is paired' flag to a name
 118     """
 119     if self.paired_end:
 120       return u"Paired"
 121     else:
 122       return u"Single"
 123
 124   @models.permalink
 125   def get_absolute_url(self):
 126       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 127       return ('htsworkflow.frontend.experiments.views.flowcell_detail',
 128               [str(flowcell_id)])
 129
 130   def get_raw_data_directory(self):
 131       """Return location of where the raw data is stored"""
 132       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 133
 134       return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
 135
 136   def update_data_runs(self):
 137       result_root = self.get_raw_data_directory()
 138       logger.debug("Update data runs flowcell root: %s" % (result_root,))
 139       if result_root is None:
 140           return
 141
 142       result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
 143       run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
 144
 145       dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ])
 146
 147       result_dirs = []
 148       for dirpath, dirnames, filenames in os.walk(result_root):
 149           for filename in filenames:
 150               if run_xml_re.match(filename):
 151                   # we have a run directory
 152                   relative_pathname = get_relative_pathname(dirpath)
 153                   cached_run = dataruns.get(relative_pathname, None)
 154                   now = datetime.datetime.now()
 155                   if (cached_run is None):
 156                       self.import_data_run(relative_pathname, filename)
 157                   elif (now - cached_run.last_update_time).days > RESCAN_DELAY:
 158                       self.import_data_run(relative_pathname,
 159                                            filename, cached_run)
 160
 161   def import_data_run(self, relative_pathname, run_xml_name, run=None):
 162       """Given a result directory import files"""
 163       run_dir = get_absolute_pathname(relative_pathname)
 164       run_xml_path = os.path.join(run_dir, run_xml_name)
 165       run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
 166       logger.debug("Importing run from %s" % (relative_pathname,))
 167
 168       if run is None:
 169           run = DataRun()
 170           run.flowcell = self
 171           run.status = RUN_STATUS_REVERSE_MAP['DONE']
 172           run.result_dir = relative_pathname
 173           run.runfolder_name = run_xml_data.runfolder_name
 174           run.cycle_start = run_xml_data.image_analysis.start
 175           run.cycle_stop = run_xml_data.image_analysis.stop
 176           run.run_start_time = run_xml_data.image_analysis.date
 177           run.image_software = run_xml_data.image_analysis.software
 178           run.image_version = run_xml_data.image_analysis.version
 179           run.basecall_software = run_xml_data.bustard.software
 180           run.basecall_version = run_xml_data.bustard.version
 181           run.alignment_software = run_xml_data.gerald.software
 182           run.alignment_version = run_xml_data.gerald.version
 183
 184       run.last_update_time = datetime.datetime.now()
 185       run.save()
 186
 187       run.update_result_files()
 188
 189
 190 # FIXME: should we automatically update dataruns?
 191 #        Or should we expect someone to call update_data_runs?
 192 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
 193 #    """Update our dataruns
 194 #    """
 195 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 196 #       return
 197 #
 198 #    instance.update_data_runs()
 199 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 200
 201
 202
 203 LANE_STATUS_CODES = [(0, 'Failed'),
 204                     (1, 'Marginal'),
 205                     (2, 'Good'),]
 206 LANE_STATUS_MAP = dict((int(k),v) for k,v in LANE_STATUS_CODES )
 207 LANE_STATUS_MAP[None] = "Unknown"
 208
 209 def is_valid_lane(value):
 210     if value >= 1 and value <= 8:
 211         return True
 212     else:
 213           return False
 214
 215 class Lane(models.Model):
 216   flowcell = models.ForeignKey(FlowCell)
 217   lane_number = models.IntegerField()
 218   library = models.ForeignKey(Library)
 219   pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM)
 220   cluster_estimate = models.IntegerField(blank=True, null=True)
 221   status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True)
 222   comment = models.TextField(null=True, blank=True)
 223
 224   @models.permalink
 225   def get_absolute_url(self):
 226        return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail',
 227                [str(self.id)])
 228
 229   def __unicode__(self):
 230       return self.flowcell.flowcell_id + ':' + unicode(self.lane_number)
 231
 232 ### -----------------------
 233 class DataRun(models.Model):
 234     flowcell = models.ForeignKey(FlowCell,verbose_name="Flowcell Id")
 235     runfolder_name = models.CharField(max_length=50)
 236     result_dir = models.CharField(max_length=255)
 237     last_update_time = models.DateTimeField()
 238     run_start_time = models.DateTimeField()
 239     cycle_start = models.IntegerField(null=True, blank=True)
 240     cycle_stop = models.IntegerField(null=True, blank=True)
 241     run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
 242                                      null=True, blank=True)
 243     image_software = models.CharField(max_length=50)
 244     image_version = models.CharField(max_length=50)
 245     basecall_software = models.CharField(max_length=50)
 246     basecall_version = models.CharField(max_length=50)
 247     alignment_software = models.CharField(max_length=50)
 248     alignment_version = models.CharField(max_length=50)
 249     comment = models.TextField(blank=True)
 250
 251     def update_result_files(self):
 252         abs_result_dir = get_absolute_pathname(self.result_dir)
 253
 254         for dirname, dirnames, filenames in os.walk(abs_result_dir):
 255             for filename in filenames:
 256                 pathname = os.path.join(dirname, filename)
 257                 relative_pathname = get_relative_pathname(pathname)
 258                 datafiles = self.datafile_set.filter(
 259                   data_run = self,
 260                   relative_pathname=relative_pathname)
 261                 if len(datafiles) > 0:
 262                     continue
 263
 264                 metadata = find_file_type_metadata_from_filename(filename)
 265                 if metadata is not None:
 266                     metadata['filename'] = filename
 267                     newfile = DataFile()
 268                     newfile.data_run = self
 269                     newfile.file_type = metadata['file_type']
 270                     newfile.relative_pathname = relative_pathname
 271
 272                     lane_number = metadata.get('lane', None)
 273                     if lane_number is not None:
 274                         lane = self.flowcell.lane_set.get(lane_number = lane_number)
 275                         newfile.library = lane.library
 276
 277                     self.datafile_set.add(newfile)
 278
 279         self.last_update_time = datetime.datetime.now()
 280
 281     def lane_files(self):
 282         lanes = {}
 283
 284         for datafile in self.datafile_set.all():
 285             metadata = datafile.attributes
 286             if metadata is not None:
 287                 lane = metadata.get('lane', None)
 288                 if lane is not None:
 289                     lane_file_set = lanes.setdefault(lane, {})
 290                     lane_file_set[datafile.file_type.normalized_name] = datafile
 291         return lanes
 292
 293     def ivc_plots(self, lane):
 294         ivc_name = ['IVC All', 'IVC Call',
 295                     'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
 296
 297         plots = {}
 298         for rel_filename, metadata in self.get_result_files():
 299             if metadata.file_type.name in ivc_name:
 300                 plots[metadata.file_type.name] = (rel_filename, metadata)
 301
 302 class FileType(models.Model):
 303     """Represent potential file types
 304
 305     regex is a pattern used to detect if a filename matches this type
 306     data run currently assumes that there may be a (?P<lane>) and
 307     (?P<end>) pattern in the regular expression.
 308     """
 309     name = models.CharField(max_length=50)
 310     mimetype = models.CharField(max_length=50, null=True, blank=True)
 311     # regular expression from glob.fnmatch.translate
 312     regex = models.CharField(max_length=50, null=True, blank=True)
 313
 314     def parse_filename(self, pathname):
 315         """Does filename match our pattern?
 316
 317         Returns None if not, or dictionary of match variables if we do.
 318         """
 319         path, filename = os.path.split(pathname)
 320         if len(self.regex) > 0:
 321             match = re.match(self.regex, filename)
 322             if match is not None:
 323                 # These are (?P<>) names we know about from our default regexes.
 324                 results = match.groupdict()
 325
 326                 # convert int parameters
 327                 for attribute_name in ['lane', 'end']:
 328                     value = results.get(attribute_name, None)
 329                     if value is not None:
 330                         results[attribute_name] = int(value)
 331
 332                 return results
 333
 334     def _get_normalized_name(self):
 335         """Crush data file name into identifier friendly name"""
 336         return self.name.replace(' ', '_').lower()
 337     normalized_name = property(_get_normalized_name)
 338
 339     def __unicode__(self):
 340         #return u"<FileType: %s>" % (self.name,)
 341         return self.name
 342
 343 def str_uuid():
 344     """Helper function to set default UUID in DataFile"""
 345     return str(uuid.uuid1())
 346
 347 class DataFile(models.Model):
 348     """Store map from random ID to filename"""
 349     random_key = models.CharField(max_length=64,
 350                                   db_index=True,
 351                                   default=str_uuid)
 352     data_run = models.ForeignKey(DataRun, db_index=True)
 353     library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
 354     file_type = models.ForeignKey(FileType)
 355     relative_pathname = models.CharField(max_length=255, db_index=True)
 356
 357     def _get_attributes(self):
 358         return self.file_type.parse_filename(self.relative_pathname)
 359     attributes = property(_get_attributes)
 360
 361     def _get_pathname(self):
 362         return get_absolute_pathname(self.relative_pathname)
 363     pathname = property(_get_pathname)
 364
 365     @models.permalink
 366     def get_absolute_url(self):
 367         return ('htsworkflow.frontend.experiments.views.read_result_file',
 368                 (), {'key': self.random_key })
 369
 370 def find_file_type_metadata_from_filename(pathname):
 371     path, filename = os.path.split(pathname)
 372     result = None
 373     for file_type in FileType.objects.all():
 374         result = file_type.parse_filename(filename)
 375         if result is not None:
 376             result['file_type'] = file_type
 377             return result
 378
 379     return None
 380
 381 def get_relative_pathname(abspath):
 382     """Strip off the result home directory from a path
 383     """
 384     result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
 385     relative_pathname = abspath.replace(result_home_dir,'')
 386     return relative_pathname
 387
 388 def get_absolute_pathname(relative_pathname):
 389     """Attach relative path to  results home directory"""
 390     return os.path.join(settings.RESULT_HOME_DIR, relative_pathname)
 391