Rescan a flowcell data run once per (settings.RESCAN_DELAY) days for files.
author: Diane Trout <diane@caltech.edu>
Thu, 18 Aug 2011 22:57:33 +0000 (15:57 -0700)
committer: Diane Trout <diane@caltech.edu>
Thu, 18 Aug 2011 22:57:33 +0000 (15:57 -0700)
Defaults to 1 day.

htsworkflow/frontend/experiments/models.py

index d77b3eb78d37b0be44b04d8842ebecdf3eae2f56..c66a639cae0ab669ce52b3fb3b17e7b765930bbe 100755 (executable)
@@ -19,9 +19,16 @@ from htsworkflow.pipelines import runfolder
 logger = logging.getLogger(__name__)
 default_pM = 5
 try:
-  default_pM = int(settings.DEFAULT_PM)
+    default_pM = int(settings.DEFAULT_PM)
 except ValueError,e:
-  logger.error("invalid value for frontend.default_pm")
+    logger.error("invalid value for frontend.default_pm")
+
+# how many days to wait before trying to re-import a runfolder
+RESCAN_DELAY = 1
+try:
+    RESCAN_DELAY = int(settings.RESCAN_DELAY)
+except (ValueError, AttributeError):
+    logger.error("Missing or invalid settings.RESCAN_DELAY")
 
 RUN_STATUS_CHOICES = (
     (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'),
@@ -58,11 +65,11 @@ class FlowCell(models.Model):
 
   cluster_station = models.ForeignKey(ClusterStation, default=3)
   sequencer = models.ForeignKey(Sequencer, default=1)
-  
+
   notes = models.TextField(blank=True)
 
   def __unicode__(self):
-      return unicode(self.flowcell_id) 
+      return unicode(self.flowcell_id)
 
   def Lanes(self):
     html = ['<table>']
@@ -105,7 +112,7 @@ class FlowCell(models.Model):
       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
       return ('htsworkflow.frontend.experiments.views.flowcell_detail',
               [str(flowcell_id)])
-    
+
   def get_raw_data_directory(self):
       """Return location of where the raw data is stored"""
       flowcell_id, status = parse_flowcell_id(self.flowcell_id)
@@ -114,14 +121,14 @@ class FlowCell(models.Model):
 
   def update_data_runs(self):
       result_root = self.get_raw_data_directory()
+      logger.debug("Update data runs flowcell root: %s" % (result_root,))
       if result_root is None:
           return
 
       result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'')
       run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
-      
-      dataruns = self.datarun_set.all()
-      datarun_result_dirs = [ x.result_dir for x in dataruns ]
+
+      dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ])
 
       result_dirs = []
       for dirpath, dirnames, filenames in os.walk(result_root):
@@ -129,30 +136,37 @@ class FlowCell(models.Model):
               if run_xml_re.match(filename):
                   # we have a run directory
                   relative_pathname = get_relative_pathname(dirpath)
-                  if relative_pathname not in datarun_result_dirs:
+                  cached_run = dataruns.get(relative_pathname, None)
+                  now = datetime.datetime.now()
+                  if (cached_run is None):
                       self.import_data_run(relative_pathname, filename)
-                
-  def import_data_run(self, relative_pathname, run_xml_name):
+                  elif (now - cached_run.last_update_time).days > RESCAN_DELAY:
+                      self.import_data_run(relative_pathname,
+                                           filename, cached_run)
+
+  def import_data_run(self, relative_pathname, run_xml_name, run=None):
       """Given a result directory import files"""
       run_dir = get_absolute_pathname(relative_pathname)
       run_xml_path = os.path.join(run_dir, run_xml_name)
       run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
-                                  
-      run = DataRun()
-      run.flowcell = self
-      run.status = RUN_STATUS_REVERSE_MAP['DONE']
-      run.result_dir = relative_pathname
-      run.runfolder_name = run_xml_data.runfolder_name
-      run.cycle_start = run_xml_data.image_analysis.start
-      run.cycle_stop = run_xml_data.image_analysis.stop
-      run.run_start_time = run_xml_data.image_analysis.date
+      logger.debug("Importing run from %s" % (relative_pathname,))
+
+      if run is None:
+          run = DataRun()
+          run.flowcell = self
+          run.status = RUN_STATUS_REVERSE_MAP['DONE']
+          run.result_dir = relative_pathname
+          run.runfolder_name = run_xml_data.runfolder_name
+          run.cycle_start = run_xml_data.image_analysis.start
+          run.cycle_stop = run_xml_data.image_analysis.stop
+          run.run_start_time = run_xml_data.image_analysis.date
 
       run.last_update_time = datetime.datetime.now()
       run.save()
 
       run.update_result_files()
 
-      
+
 # FIXME: should we automatically update dataruns?
 #        Or should we expect someone to call update_data_runs?
 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
@@ -161,7 +175,7 @@ class FlowCell(models.Model):
 #    if not os.path.exists(settings.RESULT_HOME_DIR):
 #       return
 #
-#    instance.update_data_runs()    
+#    instance.update_data_runs()
 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
 
 
@@ -180,11 +194,11 @@ def is_valid_lane(value):
 
 class Lane(models.Model):
   flowcell = models.ForeignKey(FlowCell)
-  lane_number = models.IntegerField() 
+  lane_number = models.IntegerField()
   library = models.ForeignKey(Library)
   pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM)
-  cluster_estimate = models.IntegerField(blank=True, null=True)                                       
-  status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) 
+  cluster_estimate = models.IntegerField(blank=True, null=True)
+  status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True)
   comment = models.TextField(null=True, blank=True)
 
   @models.permalink
@@ -204,7 +218,7 @@ class DataRun(models.Model):
     run_start_time = models.DateTimeField()
     cycle_start = models.IntegerField(null=True, blank=True)
     cycle_stop = models.IntegerField(null=True, blank=True)
-    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, 
+    run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
                                      null=True, blank=True)
     comment = models.TextField(blank=True)
 
@@ -220,7 +234,7 @@ class DataRun(models.Model):
                   relative_pathname=relative_pathname)
                 if len(datafiles) > 0:
                     continue
-                  
+
                 metadata = find_file_type_metadata_from_filename(filename)
                 if metadata is not None:
                     metadata['filename'] = filename
@@ -233,14 +247,14 @@ class DataRun(models.Model):
                     if lane_number is not None:
                         lane = self.flowcell.lane_set.get(lane_number = lane_number)
                         newfile.library = lane.library
-                    
+
                     self.datafile_set.add(newfile)
-                    
+
         self.last_update_time = datetime.datetime.now()
 
     def lane_files(self):
         lanes = {}
-        
+
         for datafile in self.datafile_set.all():
             metadata = datafile.attributes
             if metadata is not None:
@@ -258,7 +272,7 @@ class DataRun(models.Model):
         for rel_filename, metadata in self.get_result_files():
             if metadata.file_type.name in ivc_name:
                 plots[metadata.file_type.name] = (rel_filename, metadata)
-                
+
 class FileType(models.Model):
     """Represent potential file types
 
@@ -288,14 +302,14 @@ class FileType(models.Model):
                     value = results.get(attribute_name, None)
                     if value is not None:
                         results[attribute_name] = int(value)
-                    
+
                 return results
 
     def _get_normalized_name(self):
         """Crush data file name into identifier friendly name"""
         return self.name.replace(' ', '_').lower()
     normalized_name = property(_get_normalized_name)
-              
+
     def __unicode__(self):
         #return u"<FileType: %s>" % (self.name,)
         return self.name
@@ -337,7 +351,7 @@ def find_file_type_metadata_from_filename(pathname):
             return result
 
     return None
-  
+
 def get_relative_pathname(abspath):
     """Strip off the result home directory from a path
     """