Convert Rami's DataRun feature to something useful to us.

[htsworkflow.git] / htsworkflow / frontend / experiments / models.py
diff --git a/htsworkflow/frontend/experiments/models.py b/htsworkflow/frontend/experiments/models.py

index 8a71e252331c600ce12eeaea8b20511f337ad98c..4e060f9330229ed5125313ffadc5f2a7e2ff4121 100755 (executable)
--- a/htsworkflow/frontend/experiments/models.py
+++ b/htsworkflow/frontend/experiments/models.py
@@ -1,12 +1,39 @@
+import datetime
+import glob
  import logging
+import os
+import re
+import types
+import uuid
  
+from django.conf import settings
  from django.core.exceptions import ObjectDoesNotExist
  from django.core import urlresolvers
  from django.db import models
+from django.db.models.signals import post_init
  
-from htsworkflow.frontend.samples.models import *
-#from htsworkflow.frontend.settings import options
-from django.conf import settings
+from htsworkflow.frontend.samples.models import Library
+from htsworkflow.frontend.samples.results import parse_flowcell_id
+from htsworkflow.pipelines import runfolder
+
+# Fallback used when settings.DEFAULT_PM is missing or is not something
+# int() accepts.
+default_pM = 5
+try:
+  default_pM = int(settings.DEFAULT_PM)
+except (AttributeError, TypeError, ValueError):
+  # AttributeError: the setting is not defined at all.
+  # TypeError / ValueError: the setting is present but not integer-like.
+  # In every case keep the fallback instead of failing the module import.
+  logging.error("invalid value for frontend.default_pm")
+
+# (code, label) pairs offered as choices for DataRun.run_status.
+RUN_STATUS_CHOICES = (
+    (0, 'Sequencer running'),  ## Solexa Data Pipeline Not Yet Started
+    (1, 'Data Pipeline Started'),
+    (2, 'Data Pipeline Interrupted'),
+    (3, 'Data Pipeline Finished'),
+    (4, 'Collect Results Started'),
+    (5, 'Collect Results Finished'),
+    (6, 'QC Started'),
+    (7, 'QC Finished'),
+    (255, 'DONE'),
+)
+# Inverse lookup: label -> numeric status code.
+RUN_STATUS_REVERSE_MAP = dict(
+    (label, code) for code, label in RUN_STATUS_CHOICES)
  
  class ClusterStation(models.Model):
    name = models.CharField(max_length=50, unique=True)
@@ -20,20 +47,13 @@ class Sequencer(models.Model):
    def __unicode__(self):
      return unicode(self.name)
  
-default_pM = 5
-try:
-  default_pM = int(settings.DEFAULT_PM)
-except ValueError,e:
-  logging.error("invalid value for frontend.default_pm")
-
  class FlowCell(models.Model):
-  
    flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
    run_date = models.DateTimeField()
    advanced_run = models.BooleanField(default=False)
    paired_end = models.BooleanField(default=False)
    read_length = models.IntegerField(default=32) #Stanford is currenlty 25
-  control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True)
+  control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True, blank=True)
  
    cluster_station = models.ForeignKey(ClusterStation, default=3)
    sequencer = models.ForeignKey(Sequencer, default=1)
@@ -43,21 +63,8 @@ class FlowCell(models.Model):
    def __unicode__(self):
        return unicode(self.flowcell_id) 
  
-  def Create_LOG(self):
-    str = ''
-    str +='<a target=_balnk href="/experiments/'+self.flowcell_id+'" title="Create XLS like sheet for this Flowcell ..." ">Create LOG</a>'
-    try:
-      t = DataRun.objects.get(fcid=self.id)
-      str +='<br/><a target=_self href="/admin/experiments/datarun/?q='+self.flowcell_id+'" title="Check Data Runs ..." ">DataRun ..</a>'
-    except ObjectDoesNotExist:
-      str += '<br/><span style="color:red">not sequenced</span>'
-    return str
-  Create_LOG.allow_tags = True 
-
    def Lanes(self):
-    library_url = '/admin/samples/library/%s' 
      html = ['<table>']
-    #for i in range(1,9):
      for lane in self.lane_set.all():
          cluster_estimate = lane.cluster_estimate
          if cluster_estimate is not None:
@@ -67,8 +74,10 @@ class FlowCell(models.Model):
          library_id = lane.library_id
          library = lane.library
          element = '<tr><td>%d</td><td><a href="%s">%s</a></td><td>%s</td></tr>'
-        expanded_library_url = library_url %(library_id,)
-        html.append(element % (lane.lane_number, expanded_library_url, library, cluster_estimate))
+        html.append(element % (lane.lane_number,
+                               library.get_admin_url(),
+                               library,
+                               cluster_estimate))
      html.append('</table>')
      return "\n".join(html)
    Lanes.allow_tags = True
@@ -78,8 +87,8 @@ class FlowCell(models.Model):
  
    def get_admin_url(self):
      # that's the django way... except it didn't work
-    #return urlresolvers.reverse('admin_experiments_FlowCell_change', args=(self.id,))
-    return '/admin/experiments/flowcell/%s/' % (self.id,)
+    return urlresolvers.reverse('admin:experiments_flowcell_change',
+                                args=(self.id,))
  
    def flowcell_type(self):
      """
@@ -95,59 +104,63 @@ class FlowCell(models.Model):
        return ('htsworkflow.frontend.experiments.views.flowcell_detail',
                [str(self.flowcell_id)])
      
-### -----------------------
-class DataRun(models.Model):
-  ConfTemplate = "CONFIG PARAMS WILL BE GENERATED BY THE PIPELINE SCRIPT.\nYOU'LL BE ABLE TO EDIT AFTER IF NEEDED."
-  run_folder = models.CharField(max_length=50,unique=True, db_index=True)
-  fcid = models.ForeignKey(FlowCell,verbose_name="Flowcell Id")
-  config_params = models.TextField(default=ConfTemplate)
-  run_start_time = models.DateTimeField()
-  RUN_STATUS_CHOICES = (
-      (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
-      (1, 'Data Pipeline Started'),
-      (2, 'Data Pipeline Interrupted'),
-      (3, 'Data Pipeline Finished'),
-      (4, 'CollectReads Started'),
-      (5, 'CollectReads Finished'),
-      (6, 'QC Finished'),
-      (7, 'DONE'),
-    )
-  run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, default=0)
-  run_note = models.TextField(blank=True)
-
-
-  def main_status(self):
-    str = '<div'
-    if self.run_status >= 5:
-      str += ' style="color:green">'
-      str += '<b>'+self.RUN_STATUS_CHOICES[self.run_status][1]+'</b>'
-      str += '<br/><br/>' #<span style="color:red;font-size:80%;">New!</span>'
-      str +='<br/><a target=_balnk href="'+settings.TASKS_PROJS_SERVER+'/Flowcells/'+self.fcid.flowcell_id+'/'+self.fcid.flowcell_id+'_QC_Summary.html" title="View QC Summaries of this run ..." ">View QC Page</a>'
-    else:
-      str += '>'+self.RUN_STATUS_CHOICES[self.run_status][1]
+  def get_raw_data_directory(self):
+      """Return location of where the raw data is stored"""
+      flowcell_id, status = parse_flowcell_id(self.flowcell_id)
  
-    str += '</div>'
-    return str
-  main_status.allow_tags = True
+      return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
  
-  main_status.allow_tags = True
+  def update_data_runs(self):
+      result_root = self.get_raw_data_directory()
+      if result_root is None:
+          return
    
-  def Flowcell_Info(self):
-    str = '<b>'+self.fcid.__str__()+'</b>'
-    str += '  (c: '+self.fcid.cluster_mac_id+',  s: '+self.fcid.seq_mac_id+')'
-    str += '<div style="margin-top:5px;">'    
-    str +='<a title="View Lane List here ..."  onClick="el = document.getElementById(\'LanesOf'+self.fcid.__str__()+'\');if(el) (el.style.display==\'none\'?el.style.display=\'block\':el.style.display=\'none\')" style="cursor:pointer;color: #5b80b2;">View/hide lanes</a>'
-    str += '<div id="LanesOf'+self.fcid.__str__()+'" style="display:block;border:solid #cccccc 1px;width:350px">'
-    LanesList = '1: '+self.fcid.lane_1_library.__str__()+' ('+self.fcid.lane_1_library.library_species.use_genome_build+')<br/>2: '+self.fcid.lane_2_library.__str__()+' ('+self.fcid.lane_2_library.library_species.use_genome_build+')<br/>3: '+self.fcid.lane_3_library.__str__()+' ('+self.fcid.lane_3_library.library_species.use_genome_build+')<br/>4: '+self.fcid.lane_4_library.__str__()+' ('+self.fcid.lane_4_library.library_species.use_genome_build+')<br/>5: '+self.fcid.lane_5_library.__str__()+' ('+self.fcid.lane_5_library.library_species.use_genome_build+')<br/>6: '+self.fcid.lane_6_library.__str__()+' ('+self.fcid.lane_6_library.library_species.use_genome_build+')<br/>7: '+self.fcid.lane_7_library.__str__()+' ('+self.fcid.lane_7_library.library_species.use_genome_build+')<br/>8: '+self.fcid.lane_8_library.__str__()+' ('+self.fcid.lane_8_library.library_species.use_genome_build+')'
-    str += LanesList ## self.fcid.Lanes()
-    str += '</div>'
-    str += '<div><a title="open Flowcell record" href="/admin/exp_track/flowcell/'+self.fcid.id.__str__()+'/" target=_self>Edit Flowcell record</a>'
-    #str += '<span style="color:red;font-size:80%;margin-left:15px;margin-right:3px">New!</span>'
-    str +='<a style="margin-left:15px;" target=_balnk href="/exp_track/'+self.fcid.flowcell_id+'" title="View XLS like sheet for this Flowcell LOG ..." ">GA LOG Page</a>'
-    str += '</div>'
-    str += '</div>'    
-    return str
-  Flowcell_Info.allow_tags = True
+      result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
+      run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
+      
+      dataruns = self.datarun_set.all()
+      datarun_result_dirs = [ x.result_dir for x in dataruns ]
+
+      result_dirs = []
+      for dirpath, dirnames, filenames in os.walk(result_root):
+          for filename in filenames:
+              if run_xml_re.match(filename):
+                  # we have a run directory
+                  relative_pathname = get_relative_pathname(dirpath)
+                  if relative_pathname not in datarun_result_dirs:
+                      self.import_data_run(relative_pathname, filename)
+                
+  def import_data_run(self, relative_pathname, run_xml_name):
+      """Create and save a DataRun for one result directory.
+
+      relative_pathname -- run directory, relative to
+                           settings.RESULT_HOME_DIR
+      run_xml_name -- name of the run xml file inside that directory
+
+      The run xml is read with runfolder.load_pipeline_run_xml; its
+      runfolder name and image analysis start/stop/date values are
+      copied onto the new DataRun. After saving, the run is asked to
+      catalog the files found below its result directory.
+      """
+      run_dir = get_absolute_pathname(relative_pathname)
+      run_xml_path = os.path.join(run_dir, run_xml_name)
+      run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
+
+      run = DataRun()
+      run.flowcell = self
+      # The model field is named run_status. Assigning to "status" only
+      # created a plain attribute, so the DONE state was never stored.
+      run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
+      run.result_dir = relative_pathname
+      run.runfolder_name = run_xml_data.runfolder_name
+      run.cycle_start = run_xml_data.image_analysis.start
+      run.cycle_stop = run_xml_data.image_analysis.stop
+      run.run_start_time = run_xml_data.image_analysis.date
+      run.last_update_time = datetime.datetime.now()
+      # Save first: the data files created below need a saved run to
+      # point at.
+      run.save()
+
+      run.update_result_files()
+      
+# FIXME: should we automatically update dataruns?
+#        Or should we expect someone to call update_data_runs?
+#def update_flowcell_dataruns(sender, instance, *args, **kwargs):
+#    """Update our dataruns
+#    """
+#    if not os.path.exists(settings.RESULT_HOME_DIR):
+#       return
+#
+#    instance.update_data_runs()    
+#post_init.connect(update_flowcell_dataruns, sender=FlowCell)
+
+
  
  LANE_STATUS_CODES = [(0, 'Failed'),
                      (1, 'Marginal'),
@@ -168,3 +181,156 @@ class Lane(models.Model):
    def get_absolute_url(self):
         return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail',
                 [str(self.flowcell.flowcell_id), str(self.lane_number)])
+
+                        
+### -----------------------
+class DataRun(models.Model):
+    """One result directory produced for a flowcell.
+
+    result_dir is stored relative to settings.RESULT_HOME_DIR; the
+    files found below it are tracked as DataFile records.
+    """
+    flowcell = models.ForeignKey(FlowCell,verbose_name="Flowcell Id")
+    runfolder_name = models.CharField(max_length=50)
+    result_dir = models.CharField(max_length=255)
+    last_update_time = models.DateTimeField()
+    run_start_time = models.DateTimeField()
+    cycle_start = models.IntegerField(null=True, blank=True)
+    cycle_stop = models.IntegerField(null=True, blank=True)
+    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, 
+                                     null=True, blank=True)
+    comment = models.TextField(blank=True)
+
+    def update_result_files(self):
+        """Create DataFile records for recognizable files in result_dir.
+
+        Walks the result directory. A file is skipped when this run
+        already has a DataFile with the same relative pathname, or when
+        no FileType pattern matches its name. When the matching pattern
+        yields a lane number, the new DataFile is linked to the library
+        loaded on that lane of the flowcell.
+
+        The refreshed last_update_time is saved before returning.
+        """
+        abs_result_dir = get_absolute_pathname(self.result_dir)
+
+        for dirname, dirnames, filenames in os.walk(abs_result_dir):
+            for filename in filenames:
+                pathname = os.path.join(dirname, filename)
+                relative_pathname = get_relative_pathname(pathname)
+                # datafile_set is already limited to this run, so only
+                # the pathname needs to be matched.
+                known = self.datafile_set.filter(
+                    relative_pathname=relative_pathname)
+                if known.count() > 0:
+                    continue
+
+                metadata = find_file_type_metadata_from_filename(filename)
+                if metadata is None:
+                    # not a file type we know how to catalog
+                    continue
+
+                newfile = DataFile()
+                newfile.data_run = self
+                newfile.file_type = metadata['file_type']
+                newfile.relative_pathname = relative_pathname
+
+                lane_number = metadata.get('lane', None)
+                if lane_number is not None:
+                    lane = self.flowcell.lane_set.get(lane_number=lane_number)
+                    newfile.library = lane.library
+
+                self.datafile_set.add(newfile)
+
+        self.last_update_time = datetime.datetime.now()
+        # Persist the timestamp; it used to be changed in memory only.
+        self.save()
+
+    def lane_files(self):
+        """Group this run's data files by lane.
+
+        Returns a dictionary mapping lane number to a dictionary of
+        {normalized file type name: DataFile}. Files whose file type
+        pattern does not provide a lane are left out.
+        """
+        lanes = {}
+
+        for datafile in self.datafile_set.all():
+            metadata = datafile.attributes
+            if metadata is None:
+                continue
+            lane = metadata.get('lane', None)
+            if lane is None:
+                continue
+            lane_file_set = lanes.setdefault(lane, {})
+            lane_file_set[datafile.file_type.normalized_name] = datafile
+        return lanes
+
+    def ivc_plots(self, lane):
+        """Return the IVC plot files recorded for one lane.
+
+        lane -- lane number; compared against the integer lane parsed
+                from each data file's name
+
+        Returns a dictionary keyed by file type name whose values are
+        (relative_pathname, DataFile) tuples. The dictionary is empty
+        when no IVC plot is known for the lane.
+        """
+        ivc_name = ['IVC All', 'IVC Call',
+                    'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
+
+        plots = {}
+        # The earlier version iterated self.get_result_files(), which
+        # this model does not define, and never returned its result.
+        for datafile in self.datafile_set.all():
+            type_name = datafile.file_type.name
+            if type_name not in ivc_name:
+                continue
+            attributes = datafile.attributes
+            if attributes is None or attributes.get('lane', None) != lane:
+                continue
+            plots[type_name] = (datafile.relative_pathname, datafile)
+        return plots
+                
+class FileType(models.Model):
+    """Represent potential file types
+
+    regex is a pattern used to detect if a filename matches this type
+    data run currently assumes that there may be a (?P<lane>) and
+    (?P<end>) pattern in the regular expression.
+    """
+    name = models.CharField(max_length=50)
+    mimetype = models.CharField(max_length=50, null=True, blank=True)
+    # regular expression from glob.fnmatch.translate
+    regex = models.CharField(max_length=50, null=True, blank=True)
+
+    def parse_filename(self, pathname):
+        """Does filename match our pattern?
+
+        Only the final path component of pathname is matched.
+
+        Returns None if not, or dictionary of match variables if we do.
+        The 'lane' and 'end' variables, when captured, are converted
+        to int.
+        """
+        # regex is a nullable column: treat NULL like the empty string
+        # instead of failing on len(None).
+        if not self.regex:
+            return None
+
+        path, filename = os.path.split(pathname)
+        match = re.match(self.regex, filename)
+        if match is None:
+            return None
+
+        # These are (?P<>) names we know about from our default regexes.
+        results = match.groupdict()
+
+        # convert int parameters
+        for attribute_name in ['lane', 'end']:
+            value = results.get(attribute_name, None)
+            if value is not None:
+                results[attribute_name] = int(value)
+
+        return results
+
+    def _get_normalized_name(self):
+        """Crush data file name into identifier friendly name"""
+        return self.name.replace(' ', '_').lower()
+    normalized_name = property(_get_normalized_name)
+
+    def __unicode__(self):
+        #return u"<FileType: %s>" % (self.name,)
+        return self.name
+
+
+def _make_random_key():
+    """Return a new uuid1 rendered as a string, for DataFile.random_key."""
+    return str(uuid.uuid1())
+
+
+class DataFile(models.Model):
+    """Store map from random ID to filename"""
+    # The string form of a uuid1 is 36 characters long, so it did not fit
+    # in the previous max_length of 16. The default is now a function
+    # returning a string rather than uuid.uuid1, which returns a UUID
+    # object.
+    random_key = models.CharField(max_length=64,
+                                  db_index=True,
+                                  default=_make_random_key)
+    data_run = models.ForeignKey(DataRun, db_index=True)
+    library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
+    file_type = models.ForeignKey(FileType)
+    relative_pathname = models.CharField(max_length=255, db_index=True)
+
+    def _get_attributes(self):
+        """Return what our file type's pattern extracts from our pathname."""
+        return self.file_type.parse_filename(self.relative_pathname)
+    attributes = property(_get_attributes)
+
+    def _get_pathname(self):
+        """Return our location below the results home directory."""
+        return get_absolute_pathname(self.relative_pathname)
+    pathname = property(_get_pathname)
+
+    @models.permalink
+    def get_absolute_url(self):
+        return ('htsworkflow.frontend.experiments.views.read_result_file',
+                (), {'key': self.random_key })
+
+def find_file_type_metadata_from_filename(pathname):
+    """Find the first FileType whose pattern matches a file name.
+
+    Only the final component of pathname is tested. Returns the
+    dictionary produced by FileType.parse_filename with the matching
+    FileType added under 'file_type', or None when nothing matches.
+    """
+    basename = os.path.basename(pathname)
+    for candidate in FileType.objects.all():
+        metadata = candidate.parse_filename(basename)
+        if metadata is None:
+            continue
+        metadata['file_type'] = candidate
+        return metadata
+
+    return None
+  
+def get_relative_pathname(abspath):
+    """Strip off the result home directory from a path
+
+    Only a leading settings.RESULT_HOME_DIR is removed; a path that
+    does not start with it is returned unchanged.
+    """
+    result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
+    # str.replace would also delete any later occurrence of the same
+    # text inside the path, so cut off the leading prefix explicitly.
+    if abspath.startswith(result_home_dir):
+        return abspath[len(result_home_dir):]
+    return abspath
+
+def get_absolute_pathname(relative_pathname):
+    """Return relative_pathname joined onto settings.RESULT_HOME_DIR."""
+    result_home = settings.RESULT_HOME_DIR
+    return os.path.join(result_home, relative_pathname)
+