Utility functions to make bedfiles.
"""
import os
+import re
+
+# map eland_result.txt sense
+sense_map = { 'F': '+', 'R': '-'}
+sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
+
+def write_bed_header(outstream, name, description):
+ """
+ Produce the headerline for a bedfile
+ """
+ # provide default track names
+ if name is None: name = "track"
+ if description is None: description = "eland result file"
+ bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
+ bed_header += os.linesep
+ outstream.write(bed_header % (name, description))
def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
"""
CHR = 6
START = 7
SENSE = 8
- # map eland_result.txt sense
- sense_map = { 'F': '+', 'R': '-'}
- sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
- # provide default track names
- if name is None: name = "track"
- if description is None: description = "eland result file"
- bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
- bed_header += os.linesep
- outstream.write(bed_header % (name, description))
+
+ write_bed_header(outstream, name, description)
for line in instream:
fields = line.split()
os.linesep
))
+
+def make_bed_from_multi_eland_stream(
+ instream,
+ outstream,
+ name,
+ description,
+ chr_prefix='chr',
+ max_reads=255
+ ):
+ """
+ read a multi eland stream and write a bedfile
+ """
+ write_bed_header(outstream, name, description)
+ parse_multi_eland(instream, outstream, chr_prefix, max_reads)
+
+def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
+
+ loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
+ other_pattern = '(?P<chr>[^:,]+)'
+ split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
+
+ for line in instream:
+ rec = line.split()
+ if len(rec) > 3:
+ # colony_id = rec[0]
+ seq = rec[1]
+ # number of matches for 0, 1, and 2 mismatches
+ # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
+ compressed_reads = rec[3]
+ cur_chr = ""
+ reads = {0: [], 1: [], 2:[]}
+
+ for token in split_re.finditer(compressed_reads):
+ if token.group('chr') is not None:
+ cur_chr = token.group('chr')[:-3] # strip off .fa
+ elif token.group('fullloc') is not None:
+ matches = int(token.group('count'))
+ # only emit a bed line if
+ # our current chromosome starts with chromosome pattern
+ if chr_prefix is None or cur_chr.startswith(chr_prefix):
+ start = int(token.group('start'))
+ stop = start + len(seq)
+ orientation = token.group('dir')
+ strand = sense_map[orientation]
+ color = sense_color[orientation]
+ # build up list of reads for this record
+ reads[matches].append((cur_chr, start, stop, strand, color))
+
+ # report up to our max_read threshold reporting the fewer-mismatch
+ # matches first
+ reported_reads = 0
+ keys = [0,1,2]
+ for mismatch, read_list in ((k, reads[k]) for k in keys):
+ reported_reads += len(read_list)
+ if reported_reads <= max_reads:
+ for cur_chr, start, stop, strand, color in read_list:
+ reported_reads += 1
+ outstream.write('%s %d %d read 0 %s - - %s%s' % (
+ cur_chr,
+ start,
+ stop,
+ sense_map[orientation],
+ sense_color[orientation],
+ os.linesep
+ ))
+
def make_description(database, flowcell_id, lane):
"""
compute a bedfile name and description from the fctracker database
--- /dev/null
+import os
+from StringIO import StringIO
+import unittest
+
+from gaworkflow.util import makebed
+
+class testMakeBed(unittest.TestCase):
+ def test_multi_1_0_0_limit_1(self):
+ instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
+ out = StringIO()
+
+ makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+ self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+ def test_multi_1_0_0_limit_255(self):
+ instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
+ out = StringIO()
+
+ makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+ self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+
+ def test_multi_2_0_0_limit_1(self):
+ instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+ out = StringIO()
+
+ makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+ self.failUnlessEqual(out.len, 0)
+
+ def test_multi_2_0_0_limit_255(self):
+ instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+ out = StringIO()
+
+ makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+ self.failUnlessEqual(out.len, 98)
+
+ def test_multi_0_2_0_limit_1(self):
+ instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1')
+ out = StringIO()
+
+ makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+ print out.getvalue()
+ self.failUnlessEqual(out.len, 0)
+
+def suite():
+ return unittest.makeSuite(testMakeBed, 'test')
+
+if __name__ == "__main__":
+ unittest.main(defaultTest='suite')
+
+
import sys
import os
-from gaworkflow.util.makebed import make_bed_from_eland_stream, make_description
+from gaworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description
def make_parser():
parser = optparse.OptionParser()
parser.add_option("--lane", dest='lane',
help='specify which lane to use when retrieving description from database',
default=None)
+
+ multi = optparse.OptionGroup(parser, 'Multi-read ELAND support')
+
+ multi.add_option('-m', '--multi', action='store_true',
+ help='Enable parsing multi-read eland files',
+ default=False)
+ multi.add_option('--reads', type='int',
+ help='limit reporting multi reads to this many reads'
+ '(most usefully --reads=1 will turn a multi-read '
+ 'file into a single read file)',
+ default=255)
+ parser.add_option_group(multi)
+
return parser
def main(command_line=None):
parser.error("Need eland input file name")
return 1
- if options.outname is None:
- options.outname = os.path.splitext(options.inname)[0]+'.bed'
- print >>sys.stderr, "defaulting to outputname", options.outname
-
- if os.path.exists(options.inname):
+ if options.inname == '-':
+ instream = sys.stdin
+ elif os.path.exists(options.inname):
instream = open(options.inname, 'r')
else:
parser.error('%s was not found' % (options.inname))
return 1
- if os.path.exists(options.outname):
+ if options.outname is None:
+ # if outname wasn't defined, and we're reading from stdout
+ if instream is sys.stdin:
+ # write to stdout
+ outstream = sys.stdout
+ else:
+ # if there's a name write to name.bde
+ options.outname = os.path.splitext(options.inname)[0]+'.bed'
+ print >>sys.stderr, "defaulting to outputname", options.outname
+ elif options.outname == '-':
+ outstream = sys.stdout
+ elif os.path.exists(options.outname):
parser.error("not overwriting %s" % (options.outname))
return 1
else:
name = options.name
description = options.description
- make_bed_from_eland_stream(instream, outstream, name, description, options.prefix)
+ if options.multi:
+ make_bed_from_multi_eland_stream(instream, outstream,
+ name, description,
+ options.prefix,
+ options.reads)
+
+ else:
+ make_bed_from_eland_stream(instream, outstream,
+ name, description,
+ options.prefix)
return 0
if __name__ == "__main__":