From: Diane Trout Date: Thu, 6 Nov 2008 22:39:24 +0000 (+0000) Subject: Process eland extended (or multi) read files. X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~24 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=962efe62453aac4cb9d373af3fec6cc480f4c7e6 Process eland extended (or multi) read files. This also updates the report tools to be compatible with 1.0. For multi reads I mapped 0/1/2 mismatch reads to U0/U1/U2 if the number of reads equaled 1 (for each category seperatly) and I mapped reads >1 and < 255 to R0/R1/R2. Unfortunately 1.1rc1 changed the summary file, so this patch is not compatible with it yet. --- diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py index 43062e1..d44dae8 100644 --- a/htsworkflow/pipelines/eland.py +++ b/htsworkflow/pipelines/eland.py @@ -5,6 +5,7 @@ Analyze ELAND files from glob import glob import logging import os +import re import stat from htsworkflow.pipelines.runfolder import ElementTree @@ -27,7 +28,12 @@ class ElandLane(object): MATCH_ITEM = 'Code' READS = 'Reads' - def __init__(self, pathname=None, genome_map=None, xml=None): + ELAND_SINGLE = 0 + ELAND_MULTI = 1 + ELAND_EXTENDED = 2 + ELAND_EXPORT = 3 + + def __init__(self, pathname=None, genome_map=None, eland_type=None, xml=None): self.pathname = pathname self._sample_name = None self._lane_id = None @@ -37,10 +43,26 @@ class ElandLane(object): if genome_map is None: genome_map = {} self.genome_map = genome_map + self.eland_type = None if xml is not None: self.set_elements(xml) + def _guess_eland_type(self, pathname): + if self.eland_type is None: + # attempt autodetect eland file type + pathn, name = os.path.split(pathname) + if re.search('result', name): + self.eland_type = ElandLane.ELAND_SINGLE + elif re.search('multi', name): + self.eland_type = ElandLane.ELAND_MULTI + elif re.search('extended', name): + self.eland_type = ElandLane.ELAND_EXTENDED + elif re.search('export', name): + self.eland_type = ElandLane.ELAND_EXPORT + else: + self.eland_type = ElandLane.ELAND_SINGLE + def _update(self): """ Actually read the file and actually count the reads @@ -48,11 +70,23 @@ class ElandLane(object): # can't do anything if we don't have a file to process if self.pathname is None: return + self._guess_eland_type(self.pathname) if os.stat(self.pathname)[stat.ST_SIZE] == 0: raise RuntimeError("Eland isn't done, try again later.") logging.info("summarizing results for %s" % (self.pathname)) + + if self.eland_type == ElandLane.ELAND_SINGLE: + result = self._update_eland_result(self.pathname) + elif self.eland_type == ElandLane.ELAND_MULTI or \ + self.eland_type == ElandLane.ELAND_EXTENDED: + result = self._update_eland_multi(self.pathname) + else: + raise NotImplementedError("Only support single/multi/extended eland files") + self._match_codes, self._mapped_reads, self._reads = result + + def _update_eland_result(self, pathname): reads = 0 mapped_reads = {} @@ -60,7 +94,7 @@ class ElandLane(object): 'U0':0, 'U1':0, 'U2':0, 'R0':0, 'R1':0, 'R2':0, } - for line in autoopen(self.pathname,'r'): + for line in autoopen(pathname,'r'): reads += 1 fields = line.split() # code = fields[2] @@ -72,9 +106,58 @@ class ElandLane(object): continue fasta = self.genome_map.get(fields[6], fields[6]) mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 - self._match_codes = match_codes - self._mapped_reads = mapped_reads - self._reads = reads + return match_codes, mapped_reads, reads + + def _update_eland_multi(self, pathname): + reads = 0 + mapped_reads = {} + + match_codes = {'NM':0, 'QC':0, 'RM':0, + 'U0':0, 'U1':0, 'U2':0, + 'R0':0, 'R1':0, 'R2':0, + } + match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)") + for line in autoopen(pathname,'r'): + reads += 1 + fields = line.split() + # fields[2] = QC/NM/or number of matches + groups = match_counts_re.match(fields[2]) + if groups is None: + match_codes[fields[2]] += 1 + else: + # when there are too many hit, eland writes a - where + # it would have put the list of hits + if fields[3] == '-': + continue + zero_mismatches = int(groups.group(1)) + if zero_mismatches == 1: + match_codes['U0'] += 1 + elif zero_mismatches < 255: + match_codes['R0'] += zero_mismatches + + one_mismatches = int(groups.group(2)) + if one_mismatches == 1: + match_codes['U1'] += 1 + elif one_mismatches < 255: + match_codes['R1'] += one_mismatches + + two_mismatches = int(groups.group(3)) + if two_mismatches == 1: + match_codes['U2'] += 1 + elif two_mismatches < 255: + match_codes['R2'] += two_mismatches + + chromo = None + for match in fields[3].split(','): + match_fragment = match.split(':') + if len(match_fragment) == 2: + chromo = match_fragment[0] + pos = match_fragment[1] + + fasta = self.genome_map.get(chromo, chromo) + assert fasta is not None + mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 + return match_codes, mapped_reads, reads def _update_name(self): # extract the sample name @@ -227,6 +310,14 @@ class ELAND(object): lane = ElandLane(xml=element) self.results[lane_id] = lane +def check_for_eland_file(basedir, lane_id, pattern): + basename = pattern % (lane_id,) + pathname = os.path.join(basedir, basename) + if os.path.exists(pathname): + return pathname + else: + return None + def eland(basedir, gerald=None, genome_maps=None): e = ELAND() @@ -235,7 +326,26 @@ def eland(basedir, gerald=None, genome_maps=None): # lets handle compressed eland files too file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) - for pathname in file_list: + lane_ids = ['1','2','3','4','5','6','7','8'] + # the order in patterns determines the preference for what + # will be found. + patterns = ['s_%s_eland_result.txt', + 's_%s_eland_result.txt.bz2', + 's_%s_eland_result.txt.gz', + 's_%s_eland_extended.txt', + 's_%s_eland_extended.txt.bz2', + 's_%s_eland_extended.txt.gz', + 's_%s_eland_multi.txt', + 's_%s_eland_multi.txt.bz2', + 's_%s_eland_multi.txt.gz',] + + for lane_id in lane_ids: + for p in patterns: + pathname = check_for_eland_file(basedir, lane_id, p) + if pathname is not None: + break + else: + continue # yes the lane_id is also being computed in ElandLane._update # I didn't want to clutter up my constructor # but I needed to persist the sample_name/lane_id for @@ -288,4 +398,3 @@ def extract_eland_sequence(instream, outstream, start, end): result = [record[0][start:end]] outstream.write("\t".join(result)) outstream.write(os.linesep) - diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 34f4f30..7e2328a 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -7,7 +7,7 @@ import os import time from htsworkflow.pipelines.summary import Summary -from htsworkflow.pipelines.eland import eland +from htsworkflow.pipelines.eland import eland, ELAND from htsworkflow.pipelines.runfolder import \ ElementTree, \ @@ -29,9 +29,9 @@ class Gerald(object): """ Make it easy to access elements of LaneSpecificRunParameters from python """ - def __init__(self, gerald, key): + def __init__(self, gerald, lane_id): self._gerald = gerald - self._key = key + self._lane_id = lane_id def __get_attribute(self, xml_tag): subtree = self._gerald.tree.find('LaneSpecificRunParameters') @@ -41,7 +41,7 @@ class Gerald(object): if len(container.getchildren()) > LANES_PER_FLOWCELL: raise RuntimeError('GERALD config.xml file changed') lanes = [x.tag.split('_')[1] for x in container.getchildren()] - index = lanes.index(self._key) + index = lanes.index(self._lane_id) element = container[index] return element.text def _get_analysis(self): @@ -73,25 +73,43 @@ class Gerald(object): """ def __init__(self, gerald): self._gerald = gerald - self._keys = None + self._lane = None + + def _initalize_lanes(self): + """ + build dictionary of LaneParameters + """ + self._lanes = {} + tree = self._gerald.tree + analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') + # according to the pipeline specs I think their fields + # are sampleName_laneID, with sampleName defaulting to s + # since laneIDs are constant lets just try using + # those consistently. + for element in analysis: + sample, lane_id = element.tag.split('_') + self._lanes[lane_id] = Gerald.LaneParameters(self._gerald, lane_id) + def __getitem__(self, key): - return Gerald.LaneParameters(self._gerald, key) + if self._lane is None: + self._initalize_lanes() + return self._lanes[key] def keys(self): - if self._keys is None: - tree = self._gerald.tree - analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') - # according to the pipeline specs I think their fields - # are sampleName_laneID, with sampleName defaulting to s - # since laneIDs are constant lets just try using - # those consistently. - self._keys = [ x.tag.split('_')[1] for x in analysis] - return self._keys + if self._lane is None: + self._initalize_lanes() + return self._lanes.keys() def values(self): - return [ self[x] for x in self.keys() ] + if self._lane is None: + self._initalize_lanes() + return self._lanes.values() def items(self): - return zip(self.keys(), self.values()) + if self._lane is None: + self._initalize_lanes() + return self._lanes.items() def __len__(self): - return len(self.keys()) + if self._lane is None: + self._initalize_lanes() + return len(self._lanes) def __init__(self, xml=None): self.pathname = None @@ -179,3 +197,8 @@ def gerald(pathname): g.eland_results = eland(g.pathname, g) return g +if __name__ == "__main__": + # quick test code + import sys + g = gerald(sys.argv[1]) + #ElementTree.dump(g.get_elements()) \ No newline at end of file diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index fc2beeb..20df0b2 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -258,6 +258,14 @@ def summary_report(runs): report.append('') return os.linesep.join(report) +def is_compressed(filename): + if os.path.splitext(filename)[1] == ".gz": + return True + elif os.path.splitext(filename)[1] == '.bz2': + return True + else: + return False + def extract_results(runs, output_base_dir=None): if output_base_dir is None: output_base_dir = os.getcwd() @@ -315,14 +323,19 @@ def extract_results(runs, output_base_dir=None): for eland_lane in g.eland_results.values(): source_name = eland_lane.pathname path, name = os.path.split(eland_lane.pathname) - dest_name = os.path.join(cycle_dir, name+'.bz2') - - args = ['bzip2', '-9', '-c', source_name] - logging.info('Running: %s' % ( " ".join(args) )) - bzip_dest = open(dest_name, 'w') - bzip = subprocess.Popen(args, stdout=bzip_dest) - logging.info('Saving to %s' % (dest_name, )) - bzip.wait() + dest_name = os.path.join(cycle_dir, name) + if is_compressed(name): + logging.info('Already compressed, Saving to %s' % (dest_name, )) + shutil.copy(source_name, dest_name) + else: + # not compressed + dest_name += '.bz2' + args = ['bzip2', '-9', '-c', source_name] + logging.info('Running: %s' % ( " ".join(args) )) + bzip_dest = open(dest_name, 'w') + bzip = subprocess.Popen(args, stdout=bzip_dest) + logging.info('Saving to %s' % (dest_name, )) + bzip.wait() def clean_runs(runs): """ diff --git a/htsworkflow/pipelines/summary.py b/htsworkflow/pipelines/summary.py index 77160d1..8830177 100644 --- a/htsworkflow/pipelines/summary.py +++ b/htsworkflow/pipelines/summary.py @@ -1,7 +1,7 @@ """ Analyze the Summary.htm file produced by GERALD """ - +import types from htsworkflow.pipelines.runfolder import ElementTree from htsworkflow.util.ethelp import indent, flatten diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index c7a9961..f5e2c1e 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -831,3 +831,16 @@ def make_eland_results(gerald_dir): f = open(pathname, 'w') f.write(eland_result) f.close() + +def make_eland_multi(gerald_dir): + eland_multi = """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT NM +>HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:2 chr5.fa:55269838R0 +>HWI-EAS229_60_30DP9AAXX:1:1:1121:379 AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG 2:1:0 chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0 +>HWI-EAS229_60_30DP9AAXX:1:1:892:1155 ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT 0:9:10 chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_multi.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_multi) + f.close() diff --git a/htsworkflow/pipelines/test/test_runfolder_ipar100.py b/htsworkflow/pipelines/test/test_runfolder_ipar100.py index 8be0db1..4924a6b 100644 --- a/htsworkflow/pipelines/test/test_runfolder_ipar100.py +++ b/htsworkflow/pipelines/test/test_runfolder_ipar100.py @@ -45,7 +45,7 @@ def make_runfolder(obj=None): os.mkdir(gerald_dir) make_gerald_config(gerald_dir) make_summary100_htm(gerald_dir) - make_eland_results(gerald_dir) + make_eland_multi(gerald_dir) if obj is not None: obj.temp_dir = temp_dir @@ -140,6 +140,12 @@ class RunfolderTests(unittest.TestCase): self.failUnlessEqual(cur_lane.read_length, '32') self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + # I want to be able to use a simple iterator + for l in g.lanes.values(): + self.failUnlessEqual(l.analysis, 'eland') + self.failUnlessEqual(l.read_length, '32') + self.failUnlessEqual(l.use_bases, 'Y'*32) + # test data extracted from summary file clusters = [None, (96483, 9074), (133738, 7938), @@ -198,11 +204,14 @@ class RunfolderTests(unittest.TestCase): def test_eland(self): - dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', - 'chr2L.fa': 'dm3/chr2L.fa', - 'Lambda.fa': 'Lambda.fa'} - genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, - '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + hg_map = {'Lambda.fa': 'Lambda.fa'} + for i in range(1,22): + short_name = 'chr%d.fa' % (i,) + long_name = 'hg18/chr%d.fa' % (i,) + hg_map[short_name] = long_name + + genome_maps = { '1':hg_map, '2':hg_map, '3':hg_map, '4':hg_map, + '5':hg_map, '6':hg_map, '7':hg_map, '8':hg_map } eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) for i in range(1,9): @@ -210,11 +219,11 @@ class RunfolderTests(unittest.TestCase): self.failUnlessEqual(lane.reads, 4) self.failUnlessEqual(lane.sample_name, "s") self.failUnlessEqual(lane.lane_id, unicode(i)) - self.failUnlessEqual(len(lane.mapped_reads), 3) - self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) - self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) - self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(len(lane.mapped_reads), 15) + self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4) + self.failUnlessEqual(lane.match_codes['U1'], 10) self.failUnlessEqual(lane.match_codes['NM'], 1) + self.failUnlessEqual(lane.match_codes['QC'], 0) xml = eland.get_elements() # just make sure that element tree can serialize the tree @@ -228,7 +237,7 @@ class RunfolderTests(unittest.TestCase): self.failUnlessEqual(l1.sample_name, l2.sample_name) self.failUnlessEqual(l1.lane_id, l2.lane_id) self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 3) + self.failUnlessEqual(len(l1.mapped_reads), 15) for k in l1.mapped_reads.keys(): self.failUnlessEqual(l1.mapped_reads[k], l2.mapped_reads[k]) @@ -244,13 +253,15 @@ class RunfolderTests(unittest.TestCase): # do we get the flowcell id from the filename? self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-10-30.xml') + name = 'run_207BTAAXX_%s.xml' % ( date.today().strftime('%Y-%m-%d'),) + self.failUnlessEqual(runs[0].name, name) # do we get the flowcell id from the FlowcellId.xml file make_flowcell_id(self.runfolder_dir, '207BTAAXY') runs = runfolder.get_runs(self.runfolder_dir) self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-10-30.xml') + name = 'run_207BTAAXY_%s.xml' % ( date.today().strftime('%Y-%m-%d'),) + self.failUnlessEqual(runs[0].name, name) r1 = runs[0] xml = r1.get_elements()