Attempt to be robust to not having an alignment in our RunXml file
[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
old mode 100755 (executable)
new mode 100644 (file)
index 5a97798..5152c40
@@ -10,21 +10,29 @@ from django.conf import settings
 from django.core.exceptions import ObjectDoesNotExist
 from django.core import urlresolvers
 from django.db import models
-from django.db.models.signals import post_init
+from django.db.models.signals import post_init, pre_save
 
 from htsworkflow.frontend.samples.models import Library
-from htsworkflow.frontend.samples.results import parse_flowcell_id
+from htsworkflow.util.conversion import parse_flowcell_id
 from htsworkflow.pipelines import runfolder
 
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
 default_pM = 5
 try:
-  default_pM = int(settings.DEFAULT_PM)
-except ValueError,e:
-  logger.error("invalid value for frontend.default_pm")
+    default_pM = int(settings.DEFAULT_PM)
+except ValueError, e:
+    LOGGER.error("invalid value for frontend.default_pm")
+
+# how many days to wait before trying to re-import a runfolder
+RESCAN_DELAY = 1
+try:
+    RESCAN_DELAY = int(settings.RESCAN_DELAY)
+except (ValueError, AttributeError):
+    LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
+                 "defaulting to %s" % (RESCAN_DELAY,))
 
 RUN_STATUS_CHOICES = (
-    (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
+    (0, 'Sequencer running'),  # Solexa Data Pipeline Not Yet Started
     (1, 'Data Pipeline Started'),
     (2, 'Data Pipeline Interrupted'),
     (3, 'Data Pipeline Finished'),
@@ -34,125 +42,223 @@ RUN_STATUS_CHOICES = (
     (7, 'QC Finished'),
     (255, 'DONE'),
   )
-RUN_STATUS_REVERSE_MAP = dict(((v,k) for k,v in RUN_STATUS_CHOICES))
+RUN_STATUS_REVERSE_MAP = dict(((v, k) for k, v in RUN_STATUS_CHOICES))
+
 
 class ClusterStation(models.Model):
-  name = models.CharField(max_length=50, unique=True)
+    """List of cluster stations"""
+    name = models.CharField(max_length=50, unique=True)
+    isdefault = models.BooleanField(default=False, null=False)
+
+    class Meta:
+        ordering = ["-isdefault", "name"]
 
-  def __unicode__(self):
-    return unicode(self.name)
+    def __unicode__(self):
+        return unicode(self.name)
+
+    @classmethod
+    def default(cls):
+        d = cls.objects.filter(isdefault=True).all()
+        if len(d) > 0:
+            return d[0]
+        d = cls.objects.order_by('-id').all()
+        if len(d) > 0:
+            return d[0]
+        return None
+
+    @staticmethod
+    def update_isdefault(sender, instance, **kwargs):
+        """Clear default if needed
+        """
+        if instance.isdefault:
+            for c in ClusterStation.objects.filter(isdefault=True).all():
+                if c.id != instance.id:
+                    c.isdefault = False
+                    c.save()
+
+pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
 
 class Sequencer(models.Model):
-  name = models.CharField(max_length=50, unique=True)
+    """Sequencers we've owned
+    """
+    name = models.CharField(max_length=50, db_index=True)
+    instrument_name = models.CharField(max_length=50, db_index=True)
+    serial_number = models.CharField(max_length=50, db_index=True)
+    model = models.CharField(max_length=255)
+    active = models.BooleanField(default=True, null=False)
+    isdefault = models.BooleanField(default=False, null=False)
+    comment = models.CharField(max_length=255)
+
+    class Meta:
+        ordering = ["-isdefault", "-active", "name"]
+
+    def __unicode__(self):
+        name = [unicode(self.name)]
+        if self.instrument_name is not None:
+            name.append("(%s)" % (unicode(self.instrument_name),))
+        return " ".join(name)
+
+    @models.permalink
+    def get_absolute_url(self):
+        return ('htsworkflow.frontend.experiments.views.sequencer',
+                [self.id])
+
+    @classmethod
+    def default(cls):
+        d = cls.objects.filter(isdefault=True).all()
+        if len(d) > 0:
+            return d[0]
+        d = cls.objects.order_by('active', '-id').all()
+        if len(d) > 0:
+            return d[0]
+        return None
+
+    @staticmethod
+    def update_isdefault(sender, instance, **kwargs):
+        """Clear default if needed
+        """
+        if instance.isdefault:
+            for s in Sequencer.objects.filter(isdefault=True).all():
+                if s.id != instance.id:
+                    s.isdefault = False
+                    s.save()
+
+pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
 
-  def __unicode__(self):
-    return unicode(self.name)
 
 class FlowCell(models.Model):
-  flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
-  run_date = models.DateTimeField()
-  advanced_run = models.BooleanField(default=False)
-  paired_end = models.BooleanField(default=False)
-  read_length = models.IntegerField(default=32) #Stanford is currenlty 25
-  control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True, blank=True)
-
-  cluster_station = models.ForeignKey(ClusterStation, default=3)
-  sequencer = models.ForeignKey(Sequencer, default=1)
-  
-  notes = models.TextField(blank=True)
-
-  def __unicode__(self):
-      return unicode(self.flowcell_id) 
-
-  def Lanes(self):
-    html = ['<table>']
-    for lane in self.lane_set.all():
-        cluster_estimate = lane.cluster_estimate
-        if cluster_estimate is not None:
-            cluster_estimate = "%s k" % ((int(cluster_estimate)/1000), )
+    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
+    run_date = models.DateTimeField()
+    advanced_run = models.BooleanField(default=False)
+    paired_end = models.BooleanField(default=False)
+    read_length = models.IntegerField(default=32)  # Stanford is currently 25
+    control_lane = models.IntegerField(choices=[(1, 1),
+                                                (2, 2),
+                                                (3, 3),
+                                                (4, 4),
+                                                (5, 5),
+                                                (6, 6),
+                                                (7, 7),
+                                                (8, 8),
+                                                (0, 'All Lanes')],
+                                       null=True,
+                                       blank=True)
+
+    cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
+    sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)
+
+    notes = models.TextField(blank=True)
+
+    def __unicode__(self):
+        return unicode(self.flowcell_id)
+
+    def Lanes(self):
+        html = ['<table>']
+        for lane in self.lane_set.order_by('lane_number'):
+            cluster_estimate = lane.cluster_estimate
+            if cluster_estimate is not None:
+                cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
+            else:
+                cluster_estimate = 'None'
+            library_id = lane.library_id
+            library = lane.library
+            element = '<tr><td>%d</td>'\
+                      '<td><a href="%s">%s</a></td><td>%s</td></tr>'
+            html.append(element % (lane.lane_number,
+                                   library.get_admin_url(),
+                                   library,
+                                   cluster_estimate))
+        html.append('</table>')
+        return "\n".join(html)
+    Lanes.allow_tags = True
+
+    class Meta:
+        ordering = ["-run_date"]
+
+    def get_admin_url(self):
+        # that's the django way... except it didn't work
+        return urlresolvers.reverse('admin:experiments_flowcell_change',
+                                    args=(self.id,))
+
+    def flowcell_type(self):
+        """Convert our boolean 'is paired' flag to a name
+        """
+        if self.paired_end:
+            return u"Paired"
         else:
-            cluster_estimate = 'None'
-        library_id = lane.library_id
-        library = lane.library
-        element = '<tr><td>%d</td><td><a href="%s">%s</a></td><td>%s</td></tr>'
-        html.append(element % (lane.lane_number,
-                               library.get_admin_url(),
-                               library,
-                               cluster_estimate))
-    html.append('</table>')
-    return "\n".join(html)
-  Lanes.allow_tags = True
-
-  class Meta:
-    ordering = ["-run_date"]
-
-  def get_admin_url(self):
-    # that's the django way... except it didn't work
-    return urlresolvers.reverse('admin:experiments_flowcell_change',
-                                args=(self.id,))
-
-  def flowcell_type(self):
-    """
-    Convert our boolean 'is paired' flag to a name
-    """
-    if self.paired_end:
-      return u"Paired"
-    else:
-      return u"Single"
-
-  @models.permalink
-  def get_absolute_url(self):
-      flowcell_id, status = parse_flowcell_id(self.flowcell_id)
-      return ('htsworkflow.frontend.experiments.views.flowcell_detail',
-              [str(flowcell_id)])
-    
-  def get_raw_data_directory(self):
-      """Return location of where the raw data is stored"""
-      flowcell_id, status = parse_flowcell_id(self.flowcell_id)
-
-      return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
-
-  def update_data_runs(self):
-      result_root = self.get_raw_data_directory()
-      if result_root is None:
-          return
-
-      result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
-      run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
-      
-      dataruns = self.datarun_set.all()
-      datarun_result_dirs = [ x.result_dir for x in dataruns ]
-
-      result_dirs = []
-      for dirpath, dirnames, filenames in os.walk(result_root):
-          for filename in filenames:
-              if run_xml_re.match(filename):
-                  # we have a run directory
-                  relative_pathname = get_relative_pathname(dirpath)
-                  if relative_pathname not in datarun_result_dirs:
-                      self.import_data_run(relative_pathname, filename)
-                
-  def import_data_run(self, relative_pathname, run_xml_name):
-      """Given a result directory import files"""
-      run_dir = get_absolute_pathname(relative_pathname)
-      run_xml_path = os.path.join(run_dir, run_xml_name)
-      run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
-                                  
-      run = DataRun()
-      run.flowcell = self
-      run.status = RUN_STATUS_REVERSE_MAP['DONE']
-      run.result_dir = relative_pathname
-      run.runfolder_name = run_xml_data.runfolder_name
-      run.cycle_start = run_xml_data.image_analysis.start
-      run.cycle_stop = run_xml_data.image_analysis.stop
-      run.run_start_time = run_xml_data.image_analysis.date
-
-      run.last_update_time = datetime.datetime.now()
-      run.save()
-
-      run.update_result_files()
-
-      
+            return u"Single"
+
+    @models.permalink
+    def get_absolute_url(self):
+        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
+        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
+                [str(flowcell_id)])
+
+    def get_raw_data_directory(self):
+        """Return location of where the raw data is stored"""
+        flowcell_id, status = parse_flowcell_id(self.flowcell_id)
+
+        return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
+
+    def update_data_runs(self):
+        result_root = self.get_raw_data_directory()
+        LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
+        if result_root is None:
+            return
+
+        result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
+        run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
+
+        result_dirs = []
+        for dirpath, dirnames, filenames in os.walk(result_root):
+            for filename in filenames:
+                if run_xml_re.match(filename):
+                    # we have a run directory
+                    relative_pathname = get_relative_pathname(dirpath)
+                    self.import_data_run(relative_pathname, filename)
+
+    def import_data_run(self, relative_pathname, run_xml_name, force=False):
+        """Given a result directory import files"""
+        now = datetime.datetime.now()
+        run_dir = get_absolute_pathname(relative_pathname)
+        run_xml_path = os.path.join(run_dir, run_xml_name)
+
+        runs = DataRun.objects.filter(result_dir = relative_pathname)
+        if len(runs) == 0:
+            run = DataRun()
+            created = True
+        elif len(runs) > 1:
+            raise RuntimeError("Too many data runs for %s" % (
+                relative_pathname,))
+        else:
+            run = runs[0]
+            created = False
+
+        if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
+            LOGGER.debug("Importing run from %s" % (relative_pathname,))
+            run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
+            run.flowcell = self
+            run.status = RUN_STATUS_REVERSE_MAP['DONE']
+            run.result_dir = relative_pathname
+            run.runfolder_name = run_xml_data.runfolder_name
+            run.cycle_start = run_xml_data.image_analysis.start
+            run.cycle_stop = run_xml_data.image_analysis.stop
+            run.run_start_time = run_xml_data.image_analysis.date
+            run.image_software = run_xml_data.image_analysis.software
+            run.image_version = run_xml_data.image_analysis.version
+            run.basecall_software = run_xml_data.bustard.software
+            run.basecall_version = run_xml_data.bustard.version
+            # we're frequently not running alignments
+            if run_xml_data.gerald:
+                run.alignment_software = run_xml_data.gerald.software
+                run.alignment_version = run_xml_data.gerald.version
+
+            run.last_update_time = datetime.datetime.now()
+            run.save()
+
+            run.update_result_files()
+
+
 # FIXME: should we automatically update dataruns?
 #        Or should we expect someone to call update_data_runs?
 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
@@ -161,43 +267,64 @@ class FlowCell(models.Model):
 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 #       return
 #
-#    instance.update_data_runs()    
+#    instance.update_data_runs()
 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 
 
-
 LANE_STATUS_CODES = [(0, 'Failed'),
-                    (1, 'Marginal'),
-                    (2, 'Good'),]
-LANE_STATUS_MAP = dict((int(k),v) for k,v in LANE_STATUS_CODES )
+                     (1, 'Marginal'),
+                     (2, 'Good'), ]
+LANE_STATUS_MAP = dict((int(k), v) for k, v in LANE_STATUS_CODES)
 LANE_STATUS_MAP[None] = "Unknown"
 
+
+def is_valid_lane(value):
+    if value >= 1 and value <= 8:
+        return True
+    else:
+        return False
+
+
 class Lane(models.Model):
-  flowcell = models.ForeignKey(FlowCell)
-  lane_number = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8)])
-  library = models.ForeignKey(Library)
-  pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM)
-  cluster_estimate = models.IntegerField(blank=True, null=True)                                       
-  status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) 
-  comment = models.TextField(null=True, blank=True)
-
-  @models.permalink
-  def get_absolute_url(self):
-       return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail',
-               [str(self.flowcell.flowcell_id), str(self.lane_number)])
-
-                        
-### -----------------------
+    flowcell = models.ForeignKey(FlowCell)
+    lane_number = models.IntegerField()
+    library = models.ForeignKey(Library)
+    pM = models.DecimalField(max_digits=5,
+                             decimal_places=2,
+                             blank=False,
+                             null=False,
+                             default=default_pM)
+    cluster_estimate = models.IntegerField(blank=True, null=True)
+    status = models.IntegerField(choices=LANE_STATUS_CODES,
+                                 null=True,
+                                 blank=True)
+    comment = models.TextField(null=True, blank=True)
+
+    @models.permalink
+    def get_absolute_url(self):
+        return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail',
+                [str(self.id)])
+
+    def __unicode__(self):
+        return self.flowcell.flowcell_id + ':' + unicode(self.lane_number)
+
+
 class DataRun(models.Model):
-    flowcell = models.ForeignKey(FlowCell,verbose_name="Flowcell Id")
+    flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
     runfolder_name = models.CharField(max_length=50)
     result_dir = models.CharField(max_length=255)
     last_update_time = models.DateTimeField()
     run_start_time = models.DateTimeField()
     cycle_start = models.IntegerField(null=True, blank=True)
     cycle_stop = models.IntegerField(null=True, blank=True)
-    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, 
+    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                      null=True, blank=True)
+    image_software = models.CharField(max_length=50)
+    image_version = models.CharField(max_length=50)
+    basecall_software = models.CharField(max_length=50)
+    basecall_version = models.CharField(max_length=50)
+    alignment_software = models.CharField(max_length=50)
+    alignment_version = models.CharField(max_length=50)
     comment = models.TextField(blank=True)
 
     def update_result_files(self):
@@ -208,11 +335,11 @@ class DataRun(models.Model):
                 pathname = os.path.join(dirname, filename)
                 relative_pathname = get_relative_pathname(pathname)
                 datafiles = self.datafile_set.filter(
-                  data_run = self,
-                  relative_pathname=relative_pathname)
+                    data_run=self,
+                    relative_pathname=relative_pathname)
                 if len(datafiles) > 0:
                     continue
-                  
+
                 metadata = find_file_type_metadata_from_filename(filename)
                 if metadata is not None:
                     metadata['filename'] = filename
@@ -223,23 +350,25 @@ class DataRun(models.Model):
 
                     lane_number = metadata.get('lane', None)
                     if lane_number is not None:
-                        lane = self.flowcell.lane_set.get(lane_number = lane_number)
+                        lane = self.flowcell.lane_set.get(
+                            lane_number=lane_number)
                         newfile.library = lane.library
-                    
+
                     self.datafile_set.add(newfile)
-                    
+
         self.last_update_time = datetime.datetime.now()
 
     def lane_files(self):
         lanes = {}
-        
+
         for datafile in self.datafile_set.all():
             metadata = datafile.attributes
             if metadata is not None:
                 lane = metadata.get('lane', None)
                 if lane is not None:
                     lane_file_set = lanes.setdefault(lane, {})
-                    lane_file_set[datafile.file_type.normalized_name] = datafile
+                    normalized_name = datafile.file_type.normalized_name
+                    lane_file_set[normalized_name] = datafile
         return lanes
 
     def ivc_plots(self, lane):
@@ -250,7 +379,8 @@ class DataRun(models.Model):
         for rel_filename, metadata in self.get_result_files():
             if metadata.file_type.name in ivc_name:
                 plots[metadata.file_type.name] = (rel_filename, metadata)
-                
+
+
 class FileType(models.Model):
     """Represent potential file types
 
@@ -272,7 +402,8 @@ class FileType(models.Model):
         if len(self.regex) > 0:
             match = re.match(self.regex, filename)
             if match is not None:
-                # These are (?P<>) names we know about from our default regexes.
+                # These are (?P<>) names we know about from our
+                # default regexes.
                 results = match.groupdict()
 
                 # convert int parameters
@@ -280,22 +411,24 @@ class FileType(models.Model):
                     value = results.get(attribute_name, None)
                     if value is not None:
                         results[attribute_name] = int(value)
-                    
+
                 return results
 
     def _get_normalized_name(self):
         """Crush data file name into identifier friendly name"""
         return self.name.replace(' ', '_').lower()
     normalized_name = property(_get_normalized_name)
-              
+
     def __unicode__(self):
         #return u"<FileType: %s>" % (self.name,)
         return self.name
 
+
 def str_uuid():
     """Helper function to set default UUID in DataFile"""
     return str(uuid.uuid1())
 
+
 class DataFile(models.Model):
     """Store map from random ID to filename"""
     random_key = models.CharField(max_length=64,
@@ -317,7 +450,8 @@ class DataFile(models.Model):
     @models.permalink
     def get_absolute_url(self):
         return ('htsworkflow.frontend.experiments.views.read_result_file',
-                (), {'key': self.random_key })
+                (), {'key': self.random_key})
+
 
 def find_file_type_metadata_from_filename(pathname):
     path, filename = os.path.split(pathname)
@@ -329,15 +463,16 @@ def find_file_type_metadata_from_filename(pathname):
             return result
 
     return None
-  
+
+
 def get_relative_pathname(abspath):
     """Strip off the result home directory from a path
     """
-    result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
-    relative_pathname = abspath.replace(result_home_dir,'')
+    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
+    relative_pathname = abspath.replace(result_home_dir, '')
     return relative_pathname
 
+
 def get_absolute_pathname(relative_pathname):
     """Attach relative path to  results home directory"""
     return os.path.join(settings.RESULT_HOME_DIR, relative_pathname)
-