From: Diane Trout Date: Tue, 10 Jul 2012 21:13:46 +0000 (-0700) Subject: Guess genome name for building compressed mapping counts from genomesize.xml X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=1d5d5faa13018584e2cd57682bd03c1b9807b238 Guess genome name for building compressed mapping counts from genomesize.xml The HiSeq pipeline creates a file that has the sizes of the genomes that it mapped the particular project to, figuring out the genome version from that is a lot easier than dealing with all the random possible file names for the config files. This patch is lacking in that I just hard coded a few genomes I probably need some mechanism for pulling it from a database. --- diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py index 19905bc..26e6f56 100644 --- a/htsworkflow/pipelines/eland.py +++ b/htsworkflow/pipelines/eland.py @@ -711,10 +711,19 @@ class ELAND(collections.MutableMapping): # runfolder summary_report names = [ os.path.split(p)[1] for p in pathnames] LOGGER.info("Adding eland files %s" %(",".join(names),)) + basedir = os.path.split(pathnames[0])[0] + gs_template = "{0}_*_L{1:03}_genomesize.xml" + genomesize = glob( + os.path.join(basedir, + gs_template.format(key.sample, key.lane))) + genome_map = {} if genome_maps is not None: genome_map = genome_maps[key.lane] + elif len(genomesize) > 0: + print "Found {0}".format(genomesize) + genome_map = build_genome_size_map(genomesize[0]) elif gerald is not None: genome_dir = gerald.lanes[key].eland_genome if genome_dir is not None: @@ -881,6 +890,32 @@ def build_genome_fasta_map(genome_dir): fasta_map[name] = os.path.join(genome, name) return fasta_map +def build_genome_size_map(pathname): + """Guess what genome we're using""" + sizes = {} + tree = ElementTree.parse(pathname) + for element in tree.getroot(): + name = element.attrib['contigName'] + bases = int(element.attrib['totalBases']) + sizes[name] = bases + + # guess genome names + if sizes.get('chr1', 0) == 197195432: + genome = 'mm9' + elif sizes.get('chr1', 0) == 247249719: + genome = 'hg19' + elif sizes.get('chrI', 0) == 230218: + genome = 'sacCer3' + elif len(sizes) == 1: + genome = os.path.splitext(sizes.keys()[0])[0] + else: + raise RuntimeError("Unrecognized genome type, update detection") + + fasta_map = {} + for k,v in sizes.items(): + fasta_map[k] = genome + '/' + k + + return fasta_map def extract_eland_sequence(instream, outstream, start, end): """ diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index aca994f..eb37cc4 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -405,7 +405,9 @@ class LaneSpecificRunParameters(collections.MutableMapping): real_key = self._find_key(key) if real_key is not None: return self._lanes[real_key] - raise KeyError("%s not found" % (repr(key),)) + raise KeyError("%s not found in %s" % ( + repr(key), + ",".join((repr(k) for k in self._lanes.keys())))) def __setitem__(self, key, value): if len(self._lanes) > 100: diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index e857c3e..baf115d 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -211,7 +211,7 @@ def get_runs(runfolder, flowcell_id=None): for aligned in glob(aligned_glob): LOGGER.info("Found aligned directory %s" % (aligned,)) try: - g = gerald.HiSeq(aligned) + g = gerald.gerald(aligned) p = PipelineRun(runfolder, flowcell_id) p.datadir = datadir p.image_analysis = image_analysis