Guess genome name for building compressed mapping counts from genomesize.xml
authorDiane Trout <diane@caltech.edu>
Tue, 10 Jul 2012 21:13:46 +0000 (14:13 -0700)
committerDiane Trout <diane@caltech.edu>
Tue, 10 Jul 2012 21:13:46 +0000 (14:13 -0700)
The HiSeq pipeline creates a file that has the sizes of the genomes
that it mapped the particular project to, figuring out the genome
version from that is a lot easier than dealing with all the random
possible file names for the config files.

This patch is lacking in that I just hard coded a few genomes
I probably need some mechanism for pulling it from a database.

htsworkflow/pipelines/eland.py
htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/runfolder.py

index 19905bc8da526996924f592d15120865b5c1747b..26e6f56b7d74e1d8ace710df473c896ed9e17c51 100644 (file)
@@ -711,10 +711,19 @@ class ELAND(collections.MutableMapping):
         # runfolder summary_report
         names = [ os.path.split(p)[1] for p in pathnames]
         LOGGER.info("Adding eland files %s" %(",".join(names),))
+        basedir = os.path.split(pathnames[0])[0]
+        gs_template = "{0}_*_L{1:03}_genomesize.xml"
+        genomesize = glob(
+            os.path.join(basedir,
+                         gs_template.format(key.sample, key.lane)))
+
 
         genome_map = {}
         if genome_maps is not None:
             genome_map = genome_maps[key.lane]
+        elif len(genomesize) > 0:
+            print "Found {0}".format(genomesize)
+            genome_map = build_genome_size_map(genomesize[0])
         elif gerald is not None:
             genome_dir = gerald.lanes[key].eland_genome
             if genome_dir is not None:
@@ -881,6 +890,32 @@ def build_genome_fasta_map(genome_dir):
             fasta_map[name] = os.path.join(genome, name)
     return fasta_map
 
+def build_genome_size_map(pathname):
+    """Guess what genome we're using"""
+    sizes = {}
+    tree = ElementTree.parse(pathname)
+    for element in tree.getroot():
+        name = element.attrib['contigName']
+        bases = int(element.attrib['totalBases'])
+        sizes[name] = bases
+
+    # guess genome names
+    if sizes.get('chr1', 0) == 197195432:
+        genome = 'mm9'
+    elif sizes.get('chr1', 0) == 247249719:
+        genome = 'hg19'
+    elif sizes.get('chrI', 0) == 230218:
+        genome = 'sacCer3'
+    elif len(sizes) == 1:
+        genome = os.path.splitext(sizes.keys()[0])[0]
+    else:
+        raise RuntimeError("Unrecognized genome type, update detection")
+
+    fasta_map = {}
+    for k,v in sizes.items():
+        fasta_map[k] = genome + '/' + k
+
+    return fasta_map
 
 def extract_eland_sequence(instream, outstream, start, end):
     """
index aca994f4a8ac9c3e4cc24ccc674e8cf91ebef6df..eb37cc4c853af8f4d13bbadc102d4b1a8c700436 100644 (file)
@@ -405,7 +405,9 @@ class LaneSpecificRunParameters(collections.MutableMapping):
         real_key = self._find_key(key)
         if real_key is not None:
             return self._lanes[real_key]
-        raise KeyError("%s not found" % (repr(key),))
+        raise KeyError("%s not found in %s" % (
+            repr(key),
+            ",".join((repr(k) for k in self._lanes.keys()))))
 
     def __setitem__(self, key, value):
         if len(self._lanes) > 100:
index e857c3e6ae04a5ab044cac5c105d5b187e87eafc..baf115d8e3a6dd97f5fb137c945e71f1fd8fef62 100644 (file)
@@ -211,7 +211,7 @@ def get_runs(runfolder, flowcell_id=None):
         for aligned in glob(aligned_glob):
             LOGGER.info("Found aligned directory %s" % (aligned,))
             try:
-                g = gerald.HiSeq(aligned)
+                g = gerald.gerald(aligned)
                 p = PipelineRun(runfolder, flowcell_id)
                 p.datadir = datadir
                 p.image_analysis = image_analysis