Rescan a flowcell data run once per (settings.RESCAN_DELAY) days for files.
author: Diane Trout <diane@caltech.edu>
Thu, 18 Aug 2011 22:57:33 +0000 (15:57 -0700)
committer: Diane Trout <diane@caltech.edu>
Thu, 18 Aug 2011 22:57:33 +0000 (15:57 -0700)
Defaults to 1 day.

htsworkflow/frontend/experiments/models.py

index d77b3eb78d37b0be44b04d8842ebecdf3eae2f56..c66a639cae0ab669ce52b3fb3b17e7b765930bbe 100755 (executable)
@@ -19,9 +19,16 @@ from htsworkflow.pipelines import runfolder
 logger = logging.getLogger(__name__)
 default_pM = 5
 try:
-  default_pM = int(settings.DEFAULT_PM)
+    default_pM = int(settings.DEFAULT_PM)
 except ValueError,e:
-  logger.error("invalid value for frontend.default_pm")
+    logger.error("invalid value for frontend.default_pm")
+
+# how many days to wait before trying to re-import a runfolder
+RESCAN_DELAY = 1
+try:
+    RESCAN_DELAY = int(settings.RESCAN_DELAY)
+except (ValueError, AttributeError):
+    logger.error("Missing or invalid settings.RESCAN_DELAY")
 
 RUN_STATUS_CHOICES = (
     (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
@@ -58,11 +65,11 @@ class FlowCell(models.Model):
 
   cluster_station = models.ForeignKey(ClusterStation, default=3)
   sequencer = models.ForeignKey(Sequencer, default=1)
-  
+
   notes = models.TextField(blank=True)
 
   def __unicode__(self):
-      return unicode(self.flowcell_id) 
+      return unicode(self.flowcell_id)
 
   def Lanes(self):
     html = ['<table>']
@@ -105,7 +112,7 @@ class FlowCell(models.Model):
       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
       return ('htsworkflow.frontend.experiments.views.flowcell_detail',
               [str(flowcell_id)])
-    
+
   def get_raw_data_directory(self):
       """Return location of where the raw data is stored"""
       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
@@ -114,14 +121,14 @@ class FlowCell(models.Model):
 
   def update_data_runs(self):
       result_root = self.get_raw_data_directory()
+      logger.debug("Update data runs flowcell root: %s" % (result_root,))
       if result_root is None:
           return
 
       result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
       run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
-      
-      dataruns = self.datarun_set.all()
-      datarun_result_dirs = [ x.result_dir for x in dataruns ]
+
+      dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ])
 
       result_dirs = []
       for dirpath, dirnames, filenames in os.walk(result_root):
@@ -129,30 +136,37 @@ class FlowCell(models.Model):
               if run_xml_re.match(filename):
                   # we have a run directory
                   relative_pathname = get_relative_pathname(dirpath)
-                  if relative_pathname not in datarun_result_dirs:
+                  cached_run = dataruns.get(relative_pathname, None)
+                  now = datetime.datetime.now()
+                  if (cached_run is None):
                       self.import_data_run(relative_pathname, filename)
-                
-  def import_data_run(self, relative_pathname, run_xml_name):
+                  elif (now - cached_run.last_update_time).days > RESCAN_DELAY:
+                      self.import_data_run(relative_pathname,
+                                           filename, cached_run)
+
+  def import_data_run(self, relative_pathname, run_xml_name, run=None):
       """Given a result directory import files"""
       run_dir = get_absolute_pathname(relative_pathname)
       run_xml_path = os.path.join(run_dir, run_xml_name)
       run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
-                                  
-      run = DataRun()
-      run.flowcell = self
-      run.status = RUN_STATUS_REVERSE_MAP['DONE']
-      run.result_dir = relative_pathname
-      run.runfolder_name = run_xml_data.runfolder_name
-      run.cycle_start = run_xml_data.image_analysis.start
-      run.cycle_stop = run_xml_data.image_analysis.stop
-      run.run_start_time = run_xml_data.image_analysis.date
+      logger.debug("Importing run from %s" % (relative_pathname,))
+
+      if run is None:
+          run = DataRun()
+          run.flowcell = self
+          run.status = RUN_STATUS_REVERSE_MAP['DONE']
+          run.result_dir = relative_pathname
+          run.runfolder_name = run_xml_data.runfolder_name
+          run.cycle_start = run_xml_data.image_analysis.start
+          run.cycle_stop = run_xml_data.image_analysis.stop
+          run.run_start_time = run_xml_data.image_analysis.date
 
       run.last_update_time = datetime.datetime.now()
       run.save()
 
       run.update_result_files()
 
-      
+
 # FIXME: should we automatically update dataruns?
 #        Or should we expect someone to call update_data_runs?
 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
@@ -161,7 +175,7 @@ class FlowCell(models.Model):
 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 #       return
 #
-#    instance.update_data_runs()    
+#    instance.update_data_runs()
 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 
 
@@ -180,11 +194,11 @@ def is_valid_lane(value):
 
 class Lane(models.Model):
   flowcell = models.ForeignKey(FlowCell)
-  lane_number = models.IntegerField() 
+  lane_number = models.IntegerField()
   library = models.ForeignKey(Library)
   pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM)
-  cluster_estimate = models.IntegerField(blank=True, null=True)                                       
-  status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) 
+  cluster_estimate = models.IntegerField(blank=True, null=True)
+  status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True)
   comment = models.TextField(null=True, blank=True)
 
   @models.permalink
@@ -204,7 +218,7 @@ class DataRun(models.Model):
     run_start_time = models.DateTimeField()
     cycle_start = models.IntegerField(null=True, blank=True)
     cycle_stop = models.IntegerField(null=True, blank=True)
-    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, 
+    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                      null=True, blank=True)
     comment = models.TextField(blank=True)
 
@@ -220,7 +234,7 @@ class DataRun(models.Model):
                   relative_pathname=relative_pathname)
                 if len(datafiles) > 0:
                     continue
-                  
+
                 metadata = find_file_type_metadata_from_filename(filename)
                 if metadata is not None:
                     metadata['filename'] = filename
@@ -233,14 +247,14 @@ class DataRun(models.Model):
                     if lane_number is not None:
                         lane = self.flowcell.lane_set.get(lane_number = lane_number)
                         newfile.library = lane.library
-                    
+
                     self.datafile_set.add(newfile)
-                    
+
         self.last_update_time = datetime.datetime.now()
 
     def lane_files(self):
         lanes = {}
-        
+
         for datafile in self.datafile_set.all():
             metadata = datafile.attributes
             if metadata is not None:
@@ -258,7 +272,7 @@ class DataRun(models.Model):
         for rel_filename, metadata in self.get_result_files():
             if metadata.file_type.name in ivc_name:
                 plots[metadata.file_type.name] = (rel_filename, metadata)
-                
+
 class FileType(models.Model):
     """Represent potential file types
 
@@ -288,14 +302,14 @@ class FileType(models.Model):
                     value = results.get(attribute_name, None)
                     if value is not None:
                         results[attribute_name] = int(value)
-                    
+
                 return results
 
     def _get_normalized_name(self):
         """Crush data file name into identifier friendly name"""
         return self.name.replace(' ', '_').lower()
     normalized_name = property(_get_normalized_name)
-              
+
     def __unicode__(self):
         #return u"<FileType: %s>" % (self.name,)
         return self.name
@@ -337,7 +351,7 @@ def find_file_type_metadata_from_filename(pathname):
             return result
 
     return None
-  
+
 def get_relative_pathname(abspath):
     """Strip off the result home directory from a path
     """