X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=htsworkflow%2Ffrontend%2Fexperiments%2Fmodels.py;h=5152c406e2fc55c87bd997ed7d4ee14197339859;hb=4a2529df5d2bdaa83d1661730c3791560d58df37;hp=c66a639cae0ab669ce52b3fb3b17e7b765930bbe;hpb=8749d93aec498725bc4a3bc18fae0388e64844fc;p=htsworkflow.git diff --git a/htsworkflow/frontend/experiments/models.py b/htsworkflow/frontend/experiments/models.py index c66a639..5152c40 100644 --- a/htsworkflow/frontend/experiments/models.py +++ b/htsworkflow/frontend/experiments/models.py @@ -10,28 +10,29 @@ from django.conf import settings from django.core.exceptions import ObjectDoesNotExist from django.core import urlresolvers from django.db import models -from django.db.models.signals import post_init +from django.db.models.signals import post_init, pre_save from htsworkflow.frontend.samples.models import Library from htsworkflow.util.conversion import parse_flowcell_id from htsworkflow.pipelines import runfolder -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) default_pM = 5 try: default_pM = int(settings.DEFAULT_PM) -except ValueError,e: - logger.error("invalid value for frontend.default_pm") +except ValueError, e: + LOGGER.error("invalid value for frontend.default_pm") # how many days to wait before trying to re-import a runfolder RESCAN_DELAY = 1 try: RESCAN_DELAY = int(settings.RESCAN_DELAY) except (ValueError, AttributeError): - logger.error("Missing or invalid settings.RESCAN_DELAY") + LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\ + "defaulting to %s" % (RESCAN_DELAY,)) RUN_STATUS_CHOICES = ( - (0, 'Sequencer running'), ##Solexa Data Pipeline Not Yet Started'), + (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'), (1, 'Data Pipeline Started'), (2, 'Data Pipeline Interrupted'), (3, 'Data Pipeline Finished'), @@ -41,130 +42,221 @@ RUN_STATUS_CHOICES = ( (7, 'QC Finished'), (255, 'DONE'), ) -RUN_STATUS_REVERSE_MAP = dict(((v,k) for k,v in RUN_STATUS_CHOICES)) +RUN_STATUS_REVERSE_MAP = dict(((v, k) for k, v in RUN_STATUS_CHOICES)) + class ClusterStation(models.Model): - name = models.CharField(max_length=50, unique=True) + """List of cluster stations""" + name = models.CharField(max_length=50, unique=True) + isdefault = models.BooleanField(default=False, null=False) + + class Meta: + ordering = ["-isdefault", "name"] + + def __unicode__(self): + return unicode(self.name) + + @classmethod + def default(cls): + d = cls.objects.filter(isdefault=True).all() + if len(d) > 0: + return d[0] + d = cls.objects.order_by('-id').all() + if len(d) > 0: + return d[0] + return None + + @staticmethod + def update_isdefault(sender, instance, **kwargs): + """Clear default if needed + """ + if instance.isdefault: + for c in ClusterStation.objects.filter(isdefault=True).all(): + if c.id != instance.id: + c.isdefault = False + c.save() - def __unicode__(self): - return unicode(self.name) +pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation) class Sequencer(models.Model): - name = models.CharField(max_length=50, unique=True) + """Sequencers we've owned + """ + name = models.CharField(max_length=50, db_index=True) + instrument_name = models.CharField(max_length=50, db_index=True) + serial_number = models.CharField(max_length=50, db_index=True) + model = models.CharField(max_length=255) + active = models.BooleanField(default=True, null=False) + isdefault = models.BooleanField(default=False, null=False) + comment = models.CharField(max_length=255) + + class Meta: + ordering = ["-isdefault", "-active", "name"] + + def __unicode__(self): + name = [unicode(self.name)] + if self.instrument_name is not None: + name.append("(%s)" % (unicode(self.instrument_name),)) + return " ".join(name) + + @models.permalink + def get_absolute_url(self): + return ('htsworkflow.frontend.experiments.views.sequencer', + [self.id]) + + @classmethod + def default(cls): + d = cls.objects.filter(isdefault=True).all() + if len(d) > 0: + return d[0] + d = cls.objects.order_by('active', '-id').all() + if len(d) > 0: + return d[0] + return None + + @staticmethod + def update_isdefault(sender, instance, **kwargs): + """Clear default if needed + """ + if instance.isdefault: + for s in Sequencer.objects.filter(isdefault=True).all(): + if s.id != instance.id: + s.isdefault = False + s.save() + +pre_save.connect(Sequencer.update_isdefault, sender=Sequencer) - def __unicode__(self): - return unicode(self.name) class FlowCell(models.Model): - flowcell_id = models.CharField(max_length=20, unique=True, db_index=True) - run_date = models.DateTimeField() - advanced_run = models.BooleanField(default=False) - paired_end = models.BooleanField(default=False) - read_length = models.IntegerField(default=32) #Stanford is currenlty 25 - control_lane = models.IntegerField(choices=[(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(0,'All Lanes')], null=True, blank=True) - - cluster_station = models.ForeignKey(ClusterStation, default=3) - sequencer = models.ForeignKey(Sequencer, default=1) - - notes = models.TextField(blank=True) - - def __unicode__(self): - return unicode(self.flowcell_id) - - def Lanes(self): - html = [''] - for lane in self.lane_set.order_by('lane_number'): - cluster_estimate = lane.cluster_estimate - if cluster_estimate is not None: - cluster_estimate = "%s k" % ((int(cluster_estimate)/1000), ) + flowcell_id = models.CharField(max_length=20, unique=True, db_index=True) + run_date = models.DateTimeField() + advanced_run = models.BooleanField(default=False) + paired_end = models.BooleanField(default=False) + read_length = models.IntegerField(default=32) # Stanford is currenlty 25 + control_lane = models.IntegerField(choices=[(1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (0, 'All Lanes')], + null=True, + blank=True) + + cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default) + sequencer = models.ForeignKey(Sequencer, default=Sequencer.default) + + notes = models.TextField(blank=True) + + def __unicode__(self): + return unicode(self.flowcell_id) + + def Lanes(self): + html = ['
'] + for lane in self.lane_set.order_by('lane_number'): + cluster_estimate = lane.cluster_estimate + if cluster_estimate is not None: + cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), ) + else: + cluster_estimate = 'None' + library_id = lane.library_id + library = lane.library + element = ''\ + '' + html.append(element % (lane.lane_number, + library.get_admin_url(), + library, + cluster_estimate)) + html.append('
%d%s%s
') + return "\n".join(html) + Lanes.allow_tags = True + + class Meta: + ordering = ["-run_date"] + + def get_admin_url(self): + # that's the django way... except it didn't work + return urlresolvers.reverse('admin:experiments_flowcell_change', + args=(self.id,)) + + def flowcell_type(self): + """Convert our boolean 'is paired' flag to a name + """ + if self.paired_end: + return u"Paired" else: - cluster_estimate = 'None' - library_id = lane.library_id - library = lane.library - element = '%d%s%s' - html.append(element % (lane.lane_number, - library.get_admin_url(), - library, - cluster_estimate)) - html.append('') - return "\n".join(html) - Lanes.allow_tags = True - - class Meta: - ordering = ["-run_date"] - - def get_admin_url(self): - # that's the django way... except it didn't work - return urlresolvers.reverse('admin:experiments_flowcell_change', - args=(self.id,)) - - def flowcell_type(self): - """ - Convert our boolean 'is paired' flag to a name - """ - if self.paired_end: - return u"Paired" - else: - return u"Single" - - @models.permalink - def get_absolute_url(self): - flowcell_id, status = parse_flowcell_id(self.flowcell_id) - return ('htsworkflow.frontend.experiments.views.flowcell_detail', - [str(flowcell_id)]) - - def get_raw_data_directory(self): - """Return location of where the raw data is stored""" - flowcell_id, status = parse_flowcell_id(self.flowcell_id) - - return os.path.join(settings.RESULT_HOME_DIR, flowcell_id) - - def update_data_runs(self): - result_root = self.get_raw_data_directory() - logger.debug("Update data runs flowcell root: %s" % (result_root,)) - if result_root is None: - return - - result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'') - run_xml_re = re.compile(glob.fnmatch.translate('run*.xml')) - - dataruns = dict([ (x.result_dir, x) for x in self.datarun_set.all() ]) - - result_dirs = [] - for dirpath, dirnames, filenames in os.walk(result_root): - for filename in filenames: - if run_xml_re.match(filename): - # we have a run directory - relative_pathname = get_relative_pathname(dirpath) - cached_run = dataruns.get(relative_pathname, None) - now = datetime.datetime.now() - if (cached_run is None): - self.import_data_run(relative_pathname, filename) - elif (now - cached_run.last_update_time).days > RESCAN_DELAY: - self.import_data_run(relative_pathname, - filename, cached_run) - - def import_data_run(self, relative_pathname, run_xml_name, run=None): - """Given a result directory import files""" - run_dir = get_absolute_pathname(relative_pathname) - run_xml_path = os.path.join(run_dir, run_xml_name) - run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path) - logger.debug("Importing run from %s" % (relative_pathname,)) - - if run is None: - run = DataRun() - run.flowcell = self - run.status = RUN_STATUS_REVERSE_MAP['DONE'] - run.result_dir = relative_pathname - run.runfolder_name = run_xml_data.runfolder_name - run.cycle_start = run_xml_data.image_analysis.start - run.cycle_stop = run_xml_data.image_analysis.stop - run.run_start_time = run_xml_data.image_analysis.date - - run.last_update_time = datetime.datetime.now() - run.save() - - run.update_result_files() + return u"Single" + + @models.permalink + def get_absolute_url(self): + flowcell_id, status = parse_flowcell_id(self.flowcell_id) + return ('htsworkflow.frontend.experiments.views.flowcell_detail', + [str(flowcell_id)]) + + def get_raw_data_directory(self): + """Return location of where the raw data is stored""" + flowcell_id, status = parse_flowcell_id(self.flowcell_id) + + return os.path.join(settings.RESULT_HOME_DIR, flowcell_id) + + def update_data_runs(self): + result_root = self.get_raw_data_directory() + LOGGER.debug("Update data runs flowcell root: %s" % (result_root,)) + if result_root is None: + return + + result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '') + run_xml_re = re.compile(glob.fnmatch.translate('run*.xml')) + + result_dirs = [] + for dirpath, dirnames, filenames in os.walk(result_root): + for filename in filenames: + if run_xml_re.match(filename): + # we have a run directory + relative_pathname = get_relative_pathname(dirpath) + self.import_data_run(relative_pathname, filename) + + def import_data_run(self, relative_pathname, run_xml_name, force=False): + """Given a result directory import files""" + now = datetime.datetime.now() + run_dir = get_absolute_pathname(relative_pathname) + run_xml_path = os.path.join(run_dir, run_xml_name) + + runs = DataRun.objects.filter(result_dir = relative_pathname) + if len(runs) == 0: + run = DataRun() + created = True + elif len(runs) > 1: + raise RuntimeError("Too many data runs for %s" % ( + relative_pathname,)) + else: + run = runs[0] + created = False + + if created or force or (now-run.last_update_time).days > RESCAN_DELAY: + LOGGER.debug("Importing run from %s" % (relative_pathname,)) + run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path) + run.flowcell = self + run.status = RUN_STATUS_REVERSE_MAP['DONE'] + run.result_dir = relative_pathname + run.runfolder_name = run_xml_data.runfolder_name + run.cycle_start = run_xml_data.image_analysis.start + run.cycle_stop = run_xml_data.image_analysis.stop + run.run_start_time = run_xml_data.image_analysis.date + run.image_software = run_xml_data.image_analysis.software + run.image_version = run_xml_data.image_analysis.version + run.basecall_software = run_xml_data.bustard.software + run.basecall_version = run_xml_data.bustard.version + # we're frequently not running alignments + if run_xml_data.gerald: + run.alignment_software = run_xml_data.gerald.software + run.alignment_version = run_xml_data.gerald.version + + run.last_update_time = datetime.datetime.now() + run.save() + + run.update_result_files() # FIXME: should we automatically update dataruns? @@ -179,39 +271,46 @@ class FlowCell(models.Model): #post_init.connect(update_flowcell_dataruns, sender=FlowCell) - LANE_STATUS_CODES = [(0, 'Failed'), - (1, 'Marginal'), - (2, 'Good'),] -LANE_STATUS_MAP = dict((int(k),v) for k,v in LANE_STATUS_CODES ) + (1, 'Marginal'), + (2, 'Good'), ] +LANE_STATUS_MAP = dict((int(k), v) for k, v in LANE_STATUS_CODES) LANE_STATUS_MAP[None] = "Unknown" + def is_valid_lane(value): if value >= 1 and value <= 8: return True else: - return False + return False + class Lane(models.Model): - flowcell = models.ForeignKey(FlowCell) - lane_number = models.IntegerField() - library = models.ForeignKey(Library) - pM = models.DecimalField(max_digits=5, decimal_places=2,blank=False, null=False,default=default_pM) - cluster_estimate = models.IntegerField(blank=True, null=True) - status = models.IntegerField(choices=LANE_STATUS_CODES, null=True, blank=True) - comment = models.TextField(null=True, blank=True) - - @models.permalink - def get_absolute_url(self): - return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail', - [str(self.id)]) - - def __unicode__(self): - return self.flowcell.flowcell_id + ':' + unicode(self.lane_number) - -### ----------------------- + flowcell = models.ForeignKey(FlowCell) + lane_number = models.IntegerField() + library = models.ForeignKey(Library) + pM = models.DecimalField(max_digits=5, + decimal_places=2, + blank=False, + null=False, + default=default_pM) + cluster_estimate = models.IntegerField(blank=True, null=True) + status = models.IntegerField(choices=LANE_STATUS_CODES, + null=True, + blank=True) + comment = models.TextField(null=True, blank=True) + + @models.permalink + def get_absolute_url(self): + return ('htsworkflow.frontend.experiments.views.flowcell_lane_detail', + [str(self.id)]) + + def __unicode__(self): + return self.flowcell.flowcell_id + ':' + unicode(self.lane_number) + + class DataRun(models.Model): - flowcell = models.ForeignKey(FlowCell,verbose_name="Flowcell Id") + flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id") runfolder_name = models.CharField(max_length=50) result_dir = models.CharField(max_length=255) last_update_time = models.DateTimeField() @@ -220,6 +319,12 @@ class DataRun(models.Model): cycle_stop = models.IntegerField(null=True, blank=True) run_status = models.IntegerField(choices=RUN_STATUS_CHOICES, null=True, blank=True) + image_software = models.CharField(max_length=50) + image_version = models.CharField(max_length=50) + basecall_software = models.CharField(max_length=50) + basecall_version = models.CharField(max_length=50) + alignment_software = models.CharField(max_length=50) + alignment_version = models.CharField(max_length=50) comment = models.TextField(blank=True) def update_result_files(self): @@ -230,8 +335,8 @@ class DataRun(models.Model): pathname = os.path.join(dirname, filename) relative_pathname = get_relative_pathname(pathname) datafiles = self.datafile_set.filter( - data_run = self, - relative_pathname=relative_pathname) + data_run=self, + relative_pathname=relative_pathname) if len(datafiles) > 0: continue @@ -245,7 +350,8 @@ class DataRun(models.Model): lane_number = metadata.get('lane', None) if lane_number is not None: - lane = self.flowcell.lane_set.get(lane_number = lane_number) + lane = self.flowcell.lane_set.get( + lane_number=lane_number) newfile.library = lane.library self.datafile_set.add(newfile) @@ -261,7 +367,8 @@ class DataRun(models.Model): lane = metadata.get('lane', None) if lane is not None: lane_file_set = lanes.setdefault(lane, {}) - lane_file_set[datafile.file_type.normalized_name] = datafile + normalized_name = datafile.file_type.normalized_name + lane_file_set[normalized_name] = datafile return lanes def ivc_plots(self, lane): @@ -273,6 +380,7 @@ class DataRun(models.Model): if metadata.file_type.name in ivc_name: plots[metadata.file_type.name] = (rel_filename, metadata) + class FileType(models.Model): """Represent potential file types @@ -294,7 +402,8 @@ class FileType(models.Model): if len(self.regex) > 0: match = re.match(self.regex, filename) if match is not None: - # These are (?P<>) names we know about from our default regexes. + # These are (?P<>) names we know about from our + # default regexes. results = match.groupdict() # convert int parameters @@ -314,10 +423,12 @@ class FileType(models.Model): #return u"" % (self.name,) return self.name + def str_uuid(): """Helper function to set default UUID in DataFile""" return str(uuid.uuid1()) + class DataFile(models.Model): """Store map from random ID to filename""" random_key = models.CharField(max_length=64, @@ -339,7 +450,8 @@ class DataFile(models.Model): @models.permalink def get_absolute_url(self): return ('htsworkflow.frontend.experiments.views.read_result_file', - (), {'key': self.random_key }) + (), {'key': self.random_key}) + def find_file_type_metadata_from_filename(pathname): path, filename = os.path.split(pathname) @@ -352,14 +464,15 @@ def find_file_type_metadata_from_filename(pathname): return None + def get_relative_pathname(abspath): """Strip off the result home directory from a path """ - result_home_dir = os.path.join(settings.RESULT_HOME_DIR,'') - relative_pathname = abspath.replace(result_home_dir,'') + result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '') + relative_pathname = abspath.replace(result_home_dir, '') return relative_pathname + def get_absolute_pathname(relative_pathname): """Attach relative path to results home directory""" return os.path.join(settings.RESULT_HOME_DIR, relative_pathname) -