+
+def make_bed_from_multi_eland_stream(
+ instream,
+ outstream,
+ name,
+ description,
+ chr_prefix='chr',
+ max_reads=255
+ ):
+ """
+ read a multi eland stream and write a bedfile
+ """
+ write_bed_header(outstream, name, description)
+ parse_multi_eland(instream, outstream, chr_prefix, max_reads)
+
+def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
+
+ loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
+ other_pattern = '(?P<chr>[^:,]+)'
+ split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
+
+ for line in instream:
+ rec = line.split()
+ if len(rec) > 3:
+ # colony_id = rec[0]
+ seq = rec[1]
+ # number of matches for 0, 1, and 2 mismatches
+ # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
+ compressed_reads = rec[3]
+ cur_chr = ""
+ reads = {0: [], 1: [], 2:[]}
+
+ for token in split_re.finditer(compressed_reads):
+ if token.group('chr') is not None:
+ cur_chr = token.group('chr')[:-3] # strip off .fa
+ elif token.group('fullloc') is not None:
+ matches = int(token.group('count'))
+ # only emit a bed line if
+ # our current chromosome starts with chromosome pattern
+ if chr_prefix is None or cur_chr.startswith(chr_prefix):
+ start = int(token.group('start'))
+ stop = start + len(seq)
+ orientation = token.group('dir')
+ strand = sense_map[orientation]
+ color = sense_color[orientation]
+ # build up list of reads for this record
+ reads[matches].append((cur_chr, start, stop, strand, color))
+
+ # report up to our max_read threshold reporting the fewer-mismatch
+ # matches first
+ reported_reads = 0
+ keys = [0,1,2]
+ for mismatch, read_list in ((k, reads[k]) for k in keys):
+ reported_reads += len(read_list)
+ if reported_reads <= max_reads:
+ for cur_chr, start, stop, strand, color in read_list:
+ reported_reads += 1
+ outstream.write('%s %d %d read 0 %s - - %s%s' % (
+ cur_chr,
+ start,
+ stop,
+ sense_map[orientation],
+ sense_color[orientation],
+ os.linesep
+ ))
+