From c5f4ae58378c1607f901ffc15e34c02f3410fda4 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 29 Aug 2008 16:51:24 +0000 Subject: [PATCH] Add support for converting mutli-eland files from pipeline 0.3 to bedfiles --- gaworkflow/util/makebed.py | 93 +++++++++++++++++++++++++--- gaworkflow/util/test/test_makebed.py | 51 +++++++++++++++ scripts/makebed | 47 +++++++++++--- 3 files changed, 174 insertions(+), 17 deletions(-) create mode 100644 gaworkflow/util/test/test_makebed.py diff --git a/gaworkflow/util/makebed.py b/gaworkflow/util/makebed.py index 738b815..6f1511c 100755 --- a/gaworkflow/util/makebed.py +++ b/gaworkflow/util/makebed.py @@ -2,6 +2,22 @@ Utility functions to make bedfiles. """ import os +import re + +# map eland_result.txt sense +sense_map = { 'F': '+', 'R': '-'} +sense_color = { 'F': '0,0,255', 'R': '255,255,0' } + +def write_bed_header(outstream, name, description): + """ + Produce the headerline for a bedfile + """ + # provide default track names + if name is None: name = "track" + if description is None: description = "eland result file" + bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' + bed_header += os.linesep + outstream.write(bed_header % (name, description)) def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'): """ @@ -12,15 +28,8 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom CHR = 6 START = 7 SENSE = 8 - # map eland_result.txt sense - sense_map = { 'F': '+', 'R': '-'} - sense_color = { 'F': '0,0,255', 'R': '255,255,0' } - # provide default track names - if name is None: name = "track" - if description is None: description = "eland result file" - bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' - bed_header += os.linesep - outstream.write(bed_header % (name, description)) + + write_bed_header(outstream, name, description) for line in instream: fields = line.split() @@ -42,6 +51,72 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom os.linesep )) + +def make_bed_from_multi_eland_stream( + instream, + outstream, + name, + description, + chr_prefix='chr', + max_reads=255 + ): + """ + read a multi eland stream and write a bedfile + """ + write_bed_header(outstream, name, description) + parse_multi_eland(instream, outstream, chr_prefix, max_reads) + +def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255): + + loc_pattern = '(?P(?P[0-9]+)(?P[FR])(?P[0-9]+))' + other_pattern = '(?P[^:,]+)' + split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern)) + + for line in instream: + rec = line.split() + if len(rec) > 3: + # colony_id = rec[0] + seq = rec[1] + # number of matches for 0, 1, and 2 mismatches + # m0, m1, m2 = [int(x) for x in rec[2].split(':')] + compressed_reads = rec[3] + cur_chr = "" + reads = {0: [], 1: [], 2:[]} + + for token in split_re.finditer(compressed_reads): + if token.group('chr') is not None: + cur_chr = token.group('chr')[:-3] # strip off .fa + elif token.group('fullloc') is not None: + matches = int(token.group('count')) + # only emit a bed line if + # our current chromosome starts with chromosome pattern + if chr_prefix is None or cur_chr.startswith(chr_prefix): + start = int(token.group('start')) + stop = start + len(seq) + orientation = token.group('dir') + strand = sense_map[orientation] + color = sense_color[orientation] + # build up list of reads for this record + reads[matches].append((cur_chr, start, stop, strand, color)) + + # report up to our max_read threshold reporting the fewer-mismatch + # matches first + reported_reads = 0 + keys = [0,1,2] + for mismatch, read_list in ((k, reads[k]) for k in keys): + reported_reads += len(read_list) + if reported_reads <= max_reads: + for cur_chr, start, stop, strand, color in read_list: + reported_reads += 1 + outstream.write('%s %d %d read 0 %s - - %s%s' % ( + cur_chr, + start, + stop, + sense_map[orientation], + sense_color[orientation], + os.linesep + )) + def make_description(database, flowcell_id, lane): """ compute a bedfile name and description from the fctracker database diff --git a/gaworkflow/util/test/test_makebed.py b/gaworkflow/util/test/test_makebed.py new file mode 100644 index 0000000..e96f29b --- /dev/null +++ b/gaworkflow/util/test/test_makebed.py @@ -0,0 +1,51 @@ +import os +from StringIO import StringIO +import unittest + +from gaworkflow.util import makebed + +class testMakeBed(unittest.TestCase): + def test_multi_1_0_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') + + def test_multi_1_0_0_limit_255(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) + self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') + + + def test_multi_2_0_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + self.failUnlessEqual(out.len, 0) + + def test_multi_2_0_0_limit_255(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) + self.failUnlessEqual(out.len, 98) + + def test_multi_0_2_0_limit_1(self): + instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1') + out = StringIO() + + makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) + print out.getvalue() + self.failUnlessEqual(out.len, 0) + +def suite(): + return unittest.makeSuite(testMakeBed, 'test') + +if __name__ == "__main__": + unittest.main(defaultTest='suite') + + diff --git a/scripts/makebed b/scripts/makebed index 4ffa39f..a4a414b 100755 --- a/scripts/makebed +++ b/scripts/makebed @@ -3,7 +3,7 @@ import optparse import sys import os -from gaworkflow.util.makebed import make_bed_from_eland_stream, make_description +from gaworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description def make_parser(): parser = optparse.OptionParser() @@ -29,6 +29,19 @@ def make_parser(): parser.add_option("--lane", dest='lane', help='specify which lane to use when retrieving description from database', default=None) + + multi = optparse.OptionGroup(parser, 'Multi-read ELAND support') + + multi.add_option('-m', '--multi', action='store_true', + help='Enable parsing multi-read eland files', + default=False) + multi.add_option('--reads', type='int', + help='limit reporting multi reads to this many reads' + '(most usefully --reads=1 will turn a multi-read ' + 'file into a single read file)', + default=255) + parser.add_option_group(multi) + return parser def main(command_line=None): @@ -42,17 +55,26 @@ def main(command_line=None): parser.error("Need eland input file name") return 1 - if options.outname is None: - options.outname = os.path.splitext(options.inname)[0]+'.bed' - print >>sys.stderr, "defaulting to outputname", options.outname - - if os.path.exists(options.inname): + if options.inname == '-': + instream = sys.stdin + elif os.path.exists(options.inname): instream = open(options.inname, 'r') else: parser.error('%s was not found' % (options.inname)) return 1 - if os.path.exists(options.outname): + if options.outname is None: + # if outname wasn't defined, and we're reading from stdout + if instream is sys.stdin: + # write to stdout + outstream = sys.stdout + else: + # if there's a name write to name.bde + options.outname = os.path.splitext(options.inname)[0]+'.bed' + print >>sys.stderr, "defaulting to outputname", options.outname + elif options.outname == '-': + outstream = sys.stdout + elif os.path.exists(options.outname): parser.error("not overwriting %s" % (options.outname)) return 1 else: @@ -67,7 +89,16 @@ def main(command_line=None): name = options.name description = options.description - make_bed_from_eland_stream(instream, outstream, name, description, options.prefix) + if options.multi: + make_bed_from_multi_eland_stream(instream, outstream, + name, description, + options.prefix, + options.reads) + + else: + make_bed_from_eland_stream(instream, outstream, + name, description, + options.prefix) return 0 if __name__ == "__main__": -- 2.30.2