experiments/models.py

   1 from __future__ import absolute_import, print_function, unicode_literals
   2
   3 import datetime
   4 import glob
   5 import logging
   6 import os
   7 import re
   8 import types
   9 import uuid
  10
  11 from django.conf import settings
  12 from django.core.exceptions import ObjectDoesNotExist
  13 from django.core import urlresolvers
  14 from django.utils import timezone
  15 from django.db import models
  16 from django.db.models.signals import post_init, pre_save
  17
  18 from samples.models import Library
  19 from htsworkflow.util.conversion import parse_flowcell_id
  20 from htsworkflow.pipelines import runfolder
  21
  22 import pytz
  23
  24 LOGGER = logging.getLogger(__name__)
  25 default_pM = 5
  26 try:
  27     default_pM = int(settings.DEFAULT_PM)
  28 except AttributeError as e:
  29     LOGGER.error("invalid value for default_pm")
  30
  31 # how many days to wait before trying to re-import a runfolder
  32 RESCAN_DELAY = 1
  33 try:
  34     RESCAN_DELAY = int(settings.RESCAN_DELAY)
  35 except (ValueError, AttributeError):
  36     LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
  37                  "defaulting to %s" % (RESCAN_DELAY,))
  38
  39 RUN_STATUS_CHOICES = (
  40     (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started'),
  41     (1, 'Data Pipeline Started'),
  42     (2, 'Data Pipeline Interrupted'),
  43     (3, 'Data Pipeline Finished'),
  44     (4, 'Collect Results Started'),
  45     (5, 'Collect Results Finished'),
  46     (6, 'QC Started'),
  47     (7, 'QC Finished'),
  48     (255, 'DONE'),
  49   )
  50 RUN_STATUS_REVERSE_MAP = dict(((v, k) for k, v in RUN_STATUS_CHOICES))
  51
  52
  53 class ClusterStation(models.Model):
  54     """List of cluster stations"""
  55     name = models.CharField(max_length=50, unique=True)
  56     isdefault = models.BooleanField(default=False, null=False)
  57
  58     class Meta:
  59         ordering = ["-isdefault", "name"]
  60
  61     def __str__(self):
  62         return str(self.name)
  63
  64     @staticmethod
  65     def update_isdefault(sender, instance, **kwargs):
  66         """Clear default if needed
  67         """
  68         if instance.isdefault:
  69             for c in ClusterStation.objects.filter(isdefault=True).all():
  70                 if c.id != instance.id:
  71                     c.isdefault = False
  72                     c.save()
  73
  74 def cluster_station_default():
  75     d = ClusterStation.objects.filter(isdefault=True).all()
  76     if len(d) > 0:
  77         return d[0]
  78     d = ClusterStation.objects.order_by('-id').all()
  79     if len(d) > 0:
  80         return d[0]
  81     return None
  82
# Before a ClusterStation is saved, let update_isdefault clear the default
# flag on the other stations if the one being saved is the default.
pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
  84
  85 class Sequencer(models.Model):
  86     """Sequencers we've owned
  87     """
  88     name = models.CharField(max_length=50, db_index=True)
  89     instrument_name = models.CharField(max_length=50, db_index=True)
  90     serial_number = models.CharField(max_length=50, db_index=True)
  91     model = models.CharField(max_length=255)
  92     active = models.BooleanField(default=True, null=False)
  93     isdefault = models.BooleanField(default=False, null=False)
  94     comment = models.CharField(max_length=255)
  95
  96     class Meta:
  97         ordering = ["-isdefault", "-active", "name"]
  98
  99     def __str__(self):
 100         name = [str(self.name)]
 101         if self.instrument_name is not None:
 102             name.append("(%s)" % (str(self.instrument_name),))
 103         return " ".join(name)
 104
 105     @models.permalink
 106     def get_absolute_url(self):
 107         return ('experiments.views.sequencer',
 108                 [self.id])
 109
 110     @staticmethod
 111     def update_isdefault(sender, instance, **kwargs):
 112         """Clear default if needed
 113         """
 114         if instance.isdefault:
 115             for s in Sequencer.objects.filter(isdefault=True).all():
 116                 if s.id != instance.id:
 117                     s.isdefault = False
 118                     s.save()
 119
# Before a Sequencer is saved, let update_isdefault clear the default flag
# on the other sequencers if the one being saved is the default.
pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
 121
 122 def sequencer_default():
 123     d = Sequencer.objects.filter(isdefault=True).all()
 124     if len(d) > 0:
 125         return d[0]
 126     d = Sequencer.objects.order_by('active', '-id').all()
 127     if len(d) > 0:
 128         return d[0]
 129     return None
 130
 131
 132 class FlowCell(models.Model):
 133     flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
 134     run_date = models.DateTimeField()
 135     advanced_run = models.BooleanField(default=False)
 136     paired_end = models.BooleanField(default=False)
 137     read_length = models.IntegerField(default=32)  # Stanford is currenlty 25
 138     control_lane = models.IntegerField(choices=[(1, 1),
 139                                                 (2, 2),
 140                                                 (3, 3),
 141                                                 (4, 4),
 142                                                 (5, 5),
 143                                                 (6, 6),
 144                                                 (7, 7),
 145                                                 (8, 8),
 146                                                 (0, 'All Lanes')],
 147                                        null=True,
 148                                        blank=True)
 149
 150     cluster_station = models.ForeignKey(ClusterStation,
 151                                         default=cluster_station_default)
 152     sequencer = models.ForeignKey(Sequencer, default=sequencer_default)
 153
 154     notes = models.TextField(blank=True)
 155
 156     def __str__(self):
 157         return str(self.flowcell_id)
 158
 159     def Lanes(self):
 160         html = ['<table>']
 161         for lane in self.lane_set.order_by('lane_number'):
 162             cluster_estimate = lane.cluster_estimate
 163             if cluster_estimate is not None:
 164                 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
 165             else:
 166                 cluster_estimate = 'None'
 167             library_id = lane.library_id
 168             library = lane.library
 169             element = '<tr><td>%d</td>'\
 170                       '<td><a href="%s">%s</a></td><td>%s</td></tr>'
 171             html.append(element % (lane.lane_number,
 172                                    library.get_admin_url(),
 173                                    library,
 174                                    cluster_estimate))
 175         html.append('</table>')
 176         return "\n".join(html)
 177     Lanes.allow_tags = True
 178
 179     class Meta:
 180         ordering = ["-run_date"]
 181
 182     def get_admin_url(self):
 183         # that's the django way... except it didn't work
 184         return urlresolvers.reverse('admin:experiments_flowcell_change',
 185                                     args=(self.id,))
 186
 187     def flowcell_type(self):
 188         """Convert our boolean 'is paired' flag to a name
 189         """
 190         if self.paired_end:
 191             return "Paired"
 192         else:
 193             return "Single"
 194
 195     @models.permalink
 196     def get_absolute_url(self):
 197         flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 198         return ('experiments.views.flowcell_detail',
 199                 [str(flowcell_id)])
 200
 201     def get_raw_data_directory(self):
 202         """Return location of where the raw data is stored"""
 203         flowcell_id, status = parse_flowcell_id(self.flowcell_id)
 204
 205         return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
 206
 207     def update_data_runs(self):
 208         result_root = self.get_raw_data_directory()
 209         LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
 210         if result_root is None:
 211             return
 212
 213         result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
 214         run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
 215
 216         result_dirs = []
 217         for dirpath, dirnames, filenames in os.walk(result_root):
 218             for filename in filenames:
 219                 if run_xml_re.match(filename):
 220                     # we have a run directory
 221                     relative_pathname = get_relative_pathname(dirpath)
 222                     self.import_data_run(relative_pathname, filename)
 223
 224     def import_data_run(self, relative_pathname, run_xml_name, force=False):
 225         """Given a result directory import files"""
 226         now = timezone.now()
 227         run_dir = get_absolute_pathname(relative_pathname)
 228         run_xml_path = os.path.join(run_dir, run_xml_name)
 229
 230         runs = DataRun.objects.filter(result_dir = relative_pathname)
 231         if len(runs) == 0:
 232             run = DataRun()
 233             created = True
 234         elif len(runs) > 1:
 235             raise RuntimeError("Too many data runs for %s" % (
 236                 relative_pathname,))
 237         else:
 238             run = runs[0]
 239             created = False
 240
 241         if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
 242             LOGGER.debug("Importing run from %s" % (relative_pathname,))
 243             run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
 244             run.flowcell = self
 245             run.status = RUN_STATUS_REVERSE_MAP['DONE']
 246             run.result_dir = relative_pathname
 247             run.runfolder_name = run_xml_data.runfolder_name
 248             run.cycle_start = run_xml_data.image_analysis.start
 249             run.cycle_stop = run_xml_data.image_analysis.stop
 250             naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
 251             run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
 252             run.image_software = run_xml_data.image_analysis.software
 253             run.image_version = run_xml_data.image_analysis.version
 254             run.basecall_software = run_xml_data.bustard.software
 255             run.basecall_version = run_xml_data.bustard.version
 256             # we're frequently not running alignments
 257             if run_xml_data.gerald:
 258                 run.alignment_software = run_xml_data.gerald.software
 259                 run.alignment_version = run_xml_data.gerald.version
 260
 261             run.last_update_time = timezone.now()
 262             run.save()
 263
 264             run.update_result_files()
 265
 266
 267 # FIXME: should we automatically update dataruns?
 268 #        Or should we expect someone to call update_data_runs?
 269 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
 270 #    """Update our dataruns
 271 #    """
 272 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 273 #       return
 274 #
 275 #    instance.update_data_runs()
 276 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 277
 278
 279 LANE_STATUS_CODES = [(0, 'Failed'),
 280                      (1, 'Marginal'),
 281                      (2, 'Good'), ]
 282 LANE_STATUS_MAP = dict((int(k), v) for k, v in LANE_STATUS_CODES)
 283 LANE_STATUS_MAP[None] = "Unknown"
 284
 285
 286 def is_valid_lane(value):
 287     if value >= 1 and value <= 8:
 288         return True
 289     else:
 290         return False
 291
 292
 293 class Lane(models.Model):
 294     flowcell = models.ForeignKey(FlowCell)
 295     lane_number = models.IntegerField()
 296     library = models.ForeignKey(Library)
 297     pM = models.DecimalField(max_digits=5,
 298                              decimal_places=2,
 299                              blank=False,
 300                              null=False,
 301                              default=default_pM)
 302     cluster_estimate = models.IntegerField(blank=True, null=True)
 303     status = models.IntegerField(choices=LANE_STATUS_CODES,
 304                                  null=True,
 305                                  blank=True)
 306     comment = models.TextField(null=True, blank=True)
 307
 308     @models.permalink
 309     def get_absolute_url(self):
 310         return ('experiments.views.flowcell_lane_detail',
 311                 [str(self.id)])
 312
 313     def __str__(self):
 314         return self.flowcell.flowcell_id + ':' + str(self.lane_number)
 315
 316
 317 class DataRun(models.Model):
 318     flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
 319     runfolder_name = models.CharField(max_length=50)
 320     result_dir = models.CharField(max_length=255)
 321     last_update_time = models.DateTimeField()
 322     run_start_time = models.DateTimeField()
 323     cycle_start = models.IntegerField(null=True, blank=True)
 324     cycle_stop = models.IntegerField(null=True, blank=True)
 325     run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
 326                                      null=True, blank=True)
 327     image_software = models.CharField(max_length=50)
 328     image_version = models.CharField(max_length=50)
 329     basecall_software = models.CharField(max_length=50)
 330     basecall_version = models.CharField(max_length=50)
 331     alignment_software = models.CharField(max_length=50)
 332     alignment_version = models.CharField(max_length=50)
 333     comment = models.TextField(blank=True)
 334
 335     def update_result_files(self):
 336         abs_result_dir = get_absolute_pathname(self.result_dir)
 337
 338         for dirname, dirnames, filenames in os.walk(abs_result_dir):
 339             for filename in filenames:
 340                 pathname = os.path.join(dirname, filename)
 341                 relative_pathname = get_relative_pathname(pathname)
 342                 datafiles = self.datafile_set.filter(
 343                     data_run=self,
 344                     relative_pathname=relative_pathname)
 345                 if len(datafiles) > 0:
 346                     continue
 347
 348                 metadata = find_file_type_metadata_from_filename(filename)
 349                 if metadata is not None:
 350                     metadata['filename'] = filename
 351                     newfile = DataFile()
 352                     newfile.data_run = self
 353                     newfile.file_type = metadata['file_type']
 354                     newfile.relative_pathname = relative_pathname
 355
 356                     lane_number = metadata.get('lane', None)
 357                     if lane_number is not None:
 358                         lane = self.flowcell.lane_set.get(
 359                             lane_number=lane_number)
 360                         newfile.library = lane.library
 361
 362                     self.datafile_set.add(newfile)
 363
 364         self.last_update_time = timezone.now()
 365
 366     def lane_files(self):
 367         lanes = {}
 368
 369         for datafile in self.datafile_set.all():
 370             metadata = datafile.attributes
 371             if metadata is not None:
 372                 lane = metadata.get('lane', None)
 373                 if lane is not None:
 374                     lane_file_set = lanes.setdefault(lane, {})
 375                     normalized_name = datafile.file_type.normalized_name
 376                     lane_file_set[normalized_name] = datafile
 377         return lanes
 378
 379     def ivc_plots(self, lane):
 380         ivc_name = ['IVC All', 'IVC Call',
 381                     'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
 382
 383         plots = {}
 384         for rel_filename, metadata in self.get_result_files():
 385             if metadata.file_type.name in ivc_name:
 386                 plots[metadata.file_type.name] = (rel_filename, metadata)
 387
 388
 389 class FileType(models.Model):
 390     """Represent potential file types
 391
 392     regex is a pattern used to detect if a filename matches this type
 393     data run currently assumes that there may be a (?P<lane>) and
 394     (?P<end>) pattern in the regular expression.
 395     """
 396     name = models.CharField(max_length=50)
 397     mimetype = models.CharField(max_length=50, null=True, blank=True)
 398     # regular expression from glob.fnmatch.translate
 399     regex = models.TextField(null=True, blank=True)
 400
 401     def parse_filename(self, pathname):
 402         """Does filename match our pattern?
 403
 404         Returns None if not, or dictionary of match variables if we do.
 405         """
 406         path, filename = os.path.split(pathname)
 407         if len(self.regex) > 0:
 408             match = re.match(self.regex, filename)
 409             if match is not None:
 410                 # These are (?P<>) names we know about from our
 411                 # default regexes.
 412                 results = match.groupdict()
 413
 414                 # convert int parameters
 415                 for attribute_name in ['lane', 'end']:
 416                     value = results.get(attribute_name, None)
 417                     if value is not None:
 418                         results[attribute_name] = int(value)
 419
 420                 return results
 421
 422     def _get_normalized_name(self):
 423         """Crush data file name into identifier friendly name"""
 424         return self.name.replace(' ', '_').lower()
 425     normalized_name = property(_get_normalized_name)
 426
 427     def __str__(self):
 428         #return "<FileType: %s>" % (self.name,)
 429         return self.name
 430
 431
 432 def str_uuid():
 433     """Helper function to set default UUID in DataFile"""
 434     return str(uuid.uuid1())
 435
 436
 437 class DataFile(models.Model):
 438     """Store map from random ID to filename"""
 439     random_key = models.CharField(max_length=64,
 440                                   db_index=True,
 441                                   default=str_uuid)
 442     data_run = models.ForeignKey(DataRun, db_index=True)
 443     library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
 444     file_type = models.ForeignKey(FileType)
 445     relative_pathname = models.CharField(max_length=255, db_index=True)
 446
 447     def _get_attributes(self):
 448         return self.file_type.parse_filename(self.relative_pathname)
 449     attributes = property(_get_attributes)
 450
 451     def _get_pathname(self):
 452         return get_absolute_pathname(self.relative_pathname)
 453     pathname = property(_get_pathname)
 454
 455     @models.permalink
 456     def get_absolute_url(self):
 457         return ('experiments.views.read_result_file',
 458                 (), {'key': self.random_key})
 459
 460
 461 def find_file_type_metadata_from_filename(pathname):
 462     path, filename = os.path.split(pathname)
 463     result = None
 464     for file_type in FileType.objects.all():
 465         result = file_type.parse_filename(filename)
 466         if result is not None:
 467             result['file_type'] = file_type
 468             return result
 469
 470     return None
 471
 472
 473 def get_relative_pathname(abspath):
 474     """Strip off the result home directory from a path
 475     """
 476     result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
 477     relative_pathname = abspath.replace(result_home_dir, '')
 478     return relative_pathname
 479
 480
 481 def get_absolute_pathname(relative_pathname):
 482     """Attach relative path to  results home directory"""
 483     return os.path.join(settings.RESULT_HOME_DIR, relative_pathname)