From: Diane Trout Date: Thu, 18 Aug 2011 22:57:33 +0000 (-0700) Subject: Rescan a flowcell data run once per (setting.RESCAN_DELAY) days for files. X-Git-Tag: 0.5.3~10 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=04fa795ba46d8c21b00211cb0ae82aaaad4ede3b Rescan a flowcell data run once per (setting.RESCAN_DELAY) days for files. Defaults to 1 day. --- diff --git a/htsworkflow/frontend/experiments/models.py b/htsworkflow/frontend/experiments/models.py index d77b3eb..c66a639 100755 --- a/htsworkflow/frontend/experiments/models.py +++ b/htsworkflow/frontend/experiments/models.py @@ -19,9 +19,16 @@ from htsworkflow.pipelines import runfolder logger = logging.getLogger(__name__) default_pM = 5 try: - default_pM = int(settings.DEFAULT_PM) + default_pM = int(settings.DEFAULT_PM) except ValueError,e: - logger.error("invalid value for frontend.default_pm") + logger.error("invalid value for frontend.default_pm") + +# how many days to wait before trying to re-import a runfolder +RESCAN_DELAY = 1 +try: + RESCAN_DELAY = int(settings.RESCAN_DELAY) +except (ValueError, AttributeError): + logger.error("Missing or invalid settings.RESCAN_DELAY") RUN_STATUS_CHOICES = ( (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'), @@ -58,11 +65,11 @@ class FlowCell(models.Model): cluster_station = models.ForeignKey(ClusterStation, default=3) sequencer = models.ForeignKey(Sequencer, default=1) - + notes = models.TextField(blank=True) def __unicode__(self): - return unicode(self.flowcell_id) + return unicode(self.flowcell_id) def Lanes(self): html = [''] @@ -105,7 +112,7 @@ class FlowCell(models.Model): flowcell_id, status = parse_flowcell_id(self.flowcell_id) return ('htsworkflow.frontend.experiments.views.flowcell_detail', [str(flowcell_id)]) - + def get_raw_data_directory(self): """Return location of where the raw data is stored""" flowcell_id, status = parse_flowcell_id(self.flowcell_id) @@ -114,14 +121,14 @@ class FlowCell(models.Model): def update_data_runs(self): result_root = self.get_raw_data_directory() + logger.debug("Update data runs flowcell root: %s" % (result_root,)) if result_root is None: return result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'') run_xml_re = re.compile(glob.fnmatch.translate('run*.xml')) - - dataruns = self.datarun_set.all() - datarun_result_dirs = [ x.result_dir for x in dataruns ] + + dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ]) result_dirs = [] for dirpath, dirnames, filenames in os.walk(result_root): @@ -129,30 +136,37 @@ class FlowCell(models.Model): if run_xml_re.match(filename): # we have a run directory relative_pathname = get_relative_pathname(dirpath) - if relative_pathname not in datarun_result_dirs: + cached_run = dataruns.get(relative_pathname, None) + now = datetime.datetime.now() + if (cached_run is None): self.import_data_run(relative_pathname, filename) - - def import_data_run(self, relative_pathname, run_xml_name): + elif (now - cached_run.last_update_time).days > RESCAN_DELAY: + self.import_data_run(relative_pathname, + filename, cached_run) + + def import_data_run(self, relative_pathname, run_xml_name, run=None): """Given a result directory import files""" run_dir = get_absolute_pathname(relative_pathname) run_xml_path = os.path.join(run_dir, run_xml_name) run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path) - - run = DataRun() - run.flowcell = self - run.status = RUN_STATUS_REVERSE_MAP['DONE'] - run.result_dir = relative_pathname - run.runfolder_name = run_xml_data.runfolder_name - run.cycle_start = run_xml_data.image_analysis.start - run.cycle_stop = run_xml_data.image_analysis.stop - run.run_start_time = run_xml_data.image_analysis.date + logger.debug("Importing run from %s" % (relative_pathname,)) + + if run is None: + run = DataRun() + run.flowcell = self + run.status = RUN_STATUS_REVERSE_MAP['DONE'] + run.result_dir = relative_pathname + run.runfolder_name = run_xml_data.runfolder_name + run.cycle_start = run_xml_data.image_analysis.start + run.cycle_stop = run_xml_data.image_analysis.stop + run.run_start_time = run_xml_data.image_analysis.date run.last_update_time = datetime.datetime.now() run.save() run.update_result_files() - + # FIXME: should we automatically update dataruns? # Or should we expect someone to call update_data_runs? #def update_flowcell_dataruns(sender, instance, *args, **kwargs): @@ -161,7 +175,7 @@ class FlowCell(models.Model): # if not os.path.exists(settings.RESULT_HOME_DIR): # return # -# instance.update_data_runs() +# instance.update_data_runs() #post_init.connect(update_flowcell_dataruns, sender=FlowCell) @@ -180,11 +194,11 @@ def is_valid_lane(value): class Lane(models.Model): flowcell = models.ForeignKey(FlowCell) - lane_number = models.IntegerField() + lane_number = models.IntegerField() library = models.ForeignKey(Library) pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM) - cluster_estimate = models.IntegerField(blank=True, null=True) - status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) + cluster_estimate = models.IntegerField(blank=True, null=True) + status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) comment = models.TextField(null=True, blank=True) @models.permalink @@ -204,7 +218,7 @@ class DataRun(models.Model): run_start_time = models.DateTimeField() cycle_start = models.IntegerField(null=True, blank=True) cycle_stop = models.IntegerField(null=True, blank=True) - run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, + run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, null=True, blank=True) comment = models.TextField(blank=True) @@ -220,7 +234,7 @@ class DataRun(models.Model): relative_pathname=relative_pathname) if len(datafiles) > 0: continue - + metadata = find_file_type_metadata_from_filename(filename) if metadata is not None: metadata['filename'] = filename @@ -233,14 +247,14 @@ class DataRun(models.Model): if lane_number is not None: lane = self.flowcell.lane_set.get(lane_number = lane_number) newfile.library = lane.library - + self.datafile_set.add(newfile) - + self.last_update_time = datetime.datetime.now() def lane_files(self): lanes = {} - + for datafile in self.datafile_set.all(): metadata = datafile.attributes if metadata is not None: @@ -258,7 +272,7 @@ class DataRun(models.Model): for rel_filename, metadata in self.get_result_files(): if metadata.file_type.name in ivc_name: plots[metadata.file_type.name] = (rel_filename, metadata) - + class FileType(models.Model): """Represent potential file types @@ -288,14 +302,14 @@ class FileType(models.Model): value = results.get(attribute_name, None) if value is not None: results[attribute_name] = int(value) - + return results def _get_normalized_name(self): """Crush data file name into identifier friendly name""" return self.name.replace(' ', '_').lower() normalized_name = property(_get_normalized_name) - + def __unicode__(self): #return u"" % (self.name,) return self.name @@ -337,7 +351,7 @@ def find_file_type_metadata_from_filename(pathname): return result return None - + def get_relative_pathname(abspath): """Strip off the result home directory from a path """