2 Utility functions to make bedfiles.
7 __docformat__ = "restructredtext en"
9 # map eland_result.txt sense
10 sense_map = { 'F': '+', 'R': '-'}
11 sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
13 def create_bed_header(name, description):
15 Produce the headerline for a bedfile
17 # provide default track names
18 if name is None: name = "track"
19 if description is None: description = "eland result file"
20 bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' % (name, description)
21 bed_header += os.linesep
24 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
26 read an eland result file from instream and write a bedfile to outstream
29 - `instream`: stream containing the output from eland
30 - `outstream`: stream to write the bed file too
31 - `name`: name of bed-file (must be unique)
32 - `description`: longer description of the bed file
33 - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
35 for line in make_bed_from_eland_generator(instream, name, description, chromosome_prefix):
38 def make_bed_from_eland_generator(instream, name, description, chromosome_prefix='chr'):
40 read an eland result file from instream and write a bedfile to outstream
43 - `instream`: stream containing the output from eland
44 - `name`: name of bed-file (must be unique)
45 - `description`: longer description of the bed file
46 - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
48 :Return: generator which yields lines of bedfile
50 # indexes into fields in eland_result.txt file
56 yield create_bed_header(name, description)
57 prefix_len = len(chromosome_prefix)
61 # we need more than the CHR field, and it needs to match a chromosome
62 if len(fields) <= CHR or fields[CHR][:prefix_len] != chromosome_prefix:
65 stop = int(start) + len(fields[SEQ])
66 # strip off filename extension
67 chromosome = fields[CHR].split('.')[0]
69 yield '%s %s %d read 0 %s - - %s%s' % (
73 sense_map[fields[SENSE]],
74 sense_color[fields[SENSE]],
78 def make_bed_from_multi_eland_stream(
87 read a multi eland result file from instream and write the bedfile to outstream
90 - `instream`: stream containing the output from eland
91 - `outstream`: stream to write the bed file too
92 - `name`: name of bed-file (must be unique)
93 - `description`: longer description of the bed file
94 - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
95 - `max_reads`: maximum number of reads to write to bed stream
97 for lane in make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads):
100 def make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads=255):
101 loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9AGCT]+))'
102 other_pattern = '(?P<chr>[^:,]+)'
103 split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
105 yield create_bed_header(name, description)
106 for line in instream:
111 # number of matches for 0, 1, and 2 mismatches
112 # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
113 compressed_reads = rec[3]
115 reads = {0: [], 1: [], 2:[]}
117 for token in split_re.finditer(compressed_reads):
118 if token.group('chr') is not None:
119 cur_chr = token.group('chr')
120 # strip off extension if present
121 cur_chr = os.path.splitext(cur_chr)[0]
122 elif token.group('fullloc') is not None:
123 matches = int(token.group('count'))
124 # only emit a bed line if
125 # our current chromosome starts with chromosome pattern
126 if chr_prefix is None or cur_chr.startswith(chr_prefix):
127 start = int(token.group('start'))
128 stop = start + len(seq)
129 orientation = token.group('dir')
130 strand = sense_map[orientation]
131 color = sense_color[orientation]
132 # build up list of reads for this record
133 reads[matches].append((cur_chr, start, stop, strand, color))
135 # report up to our max_read threshold reporting the fewer-mismatch
139 for mismatch, read_list in ((k, reads[k]) for k in keys):
140 reported_reads += len(read_list)
141 if reported_reads <= max_reads:
142 for cur_chr, start, stop, strand, color in read_list:
144 yield '%s %d %d read 0 %s - - %s%s' % (
148 sense_map[orientation],
149 sense_color[orientation],
153 def make_description(flowcell_id, lane):
155 compute a bedfile name and description from the django database
157 from htsworkflow.frontend.experiments import models as experiments
160 if lane < 1 or lane > 8:
161 raise RuntimeError("flowcells only have lanes 1-8")
163 cell = experiments.FlowCell.objects.get(flowcell_id=flowcell_id)
165 name = "%s-%s" % (flowcell_id, lane)
167 cell_library = getattr(cell, 'lane_%d_library' %(lane,))
168 cell_library_id = cell_library.library_id
169 description = "%s-%s" % (cell_library.library_name, cell_library_id)
170 return name, description