2 Utility functions to make bedfiles.
7 # map eland_result.txt sense
8 sense_map = { 'F': '+', 'R': '-'}
9 sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
11 def write_bed_header(outstream, name, description):
13 Produce the headerline for a bedfile
15 # provide default track names
16 if name is None: name = "track"
17 if description is None: description = "eland result file"
18 bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
19 bed_header += os.linesep
20 outstream.write(bed_header % (name, description))
22 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
24 read an eland result file from instream and write a bedfile to outstream
26 # indexes into fields in eland_result.txt file
32 write_bed_header(outstream, name, description)
33 prefix_len = len(chromosome_prefix)
37 # we need more than the CHR field, and it needs to match a chromosome
38 if len(fields) <= CHR or fields[CHR][:prefix_len] != chromosome_prefix:
41 stop = int(start) + len(fields[SEQ])
42 # strip off filename extension
43 chromosome = fields[CHR].split('.')[0]
45 outstream.write('%s %s %d read 0 %s - - %s%s' % (
49 sense_map[fields[SENSE]],
50 sense_color[fields[SENSE]],
55 def make_bed_from_multi_eland_stream(
64 read a multi eland stream and write a bedfile
66 write_bed_header(outstream, name, description)
67 parse_multi_eland(instream, outstream, chr_prefix, max_reads)
69 def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
71 loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
72 other_pattern = '(?P<chr>[^:,]+)'
73 split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
80 # number of matches for 0, 1, and 2 mismatches
81 # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
82 compressed_reads = rec[3]
84 reads = {0: [], 1: [], 2:[]}
86 for token in split_re.finditer(compressed_reads):
87 if token.group('chr') is not None:
88 cur_chr = token.group('chr')
89 # strip off extension if present
90 cur_chr = os.path.splitext(cur_chr)[0]
91 elif token.group('fullloc') is not None:
92 matches = int(token.group('count'))
93 # only emit a bed line if
94 # our current chromosome starts with chromosome pattern
95 if chr_prefix is None or cur_chr.startswith(chr_prefix):
96 start = int(token.group('start'))
97 stop = start + len(seq)
98 orientation = token.group('dir')
99 strand = sense_map[orientation]
100 color = sense_color[orientation]
101 # build up list of reads for this record
102 reads[matches].append((cur_chr, start, stop, strand, color))
104 # report up to our max_read threshold reporting the fewer-mismatch
108 for mismatch, read_list in ((k, reads[k]) for k in keys):
109 reported_reads += len(read_list)
110 if reported_reads <= max_reads:
111 for cur_chr, start, stop, strand, color in read_list:
113 outstream.write('%s %d %d read 0 %s - - %s%s' % (
117 sense_map[orientation],
118 sense_color[orientation],
122 def make_description(database, flowcell_id, lane):
124 compute a bedfile name and description from the fctracker database
126 from htsworkflow.util.fctracker import fctracker
128 fc = fctracker(database)
129 cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id))
131 raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id))
133 if lane < 1 or lane > 8:
134 raise RuntimeError("flowcells only have lanes 1-8")
136 name = "%s-%s" % (flowcell_id, lane)
138 cell_id, cell = cells.items()[0]
139 assert cell_id == flowcell_id
141 cell_library_id = cell['lane_%d_library_id' %(lane,)]
142 cell_library = cell['lane_%d_library' %(lane,)]
143 description = "%s-%s" % (cell_library['library_name'], cell_library_id)
144 return name, description