11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
15 class ElandLane(object):
17 Process an eland result file
21 SAMPLE_NAME = 'SampleName'
24 GENOME_MAP = 'GenomeMap'
25 GENOME_ITEM = 'GenomeItem'
26 MAPPED_READS = 'MappedReads'
27 MAPPED_ITEM = 'MappedItem'
28 MATCH_CODES = 'MatchCodes'
37 def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
38 self.pathname = pathname
39 self._sample_name = None
40 self.lane_id = lane_id
43 self._mapped_reads = None
44 self._match_codes = None
45 if genome_map is None:
47 self.genome_map = genome_map
48 self.eland_type = None
51 self.set_elements(xml)
53 def _guess_eland_type(self, pathname):
54 if self.eland_type is None:
55 # attempt autodetect eland file type
56 pathn, name = os.path.split(pathname)
57 if re.search('result', name):
58 self.eland_type = ElandLane.ELAND_SINGLE
59 elif re.search('multi', name):
60 self.eland_type = ElandLane.ELAND_MULTI
61 elif re.search('extended', name):
62 self.eland_type = ElandLane.ELAND_EXTENDED
63 elif re.search('export', name):
64 self.eland_type = ElandLane.ELAND_EXPORT
66 self.eland_type = ElandLane.ELAND_SINGLE
70 Actually read the file and actually count the reads
72 # can't do anything if we don't have a file to process
73 if self.pathname is None:
75 self._guess_eland_type(self.pathname)
77 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
78 raise RuntimeError("Eland isn't done, try again later.")
80 logging.info("summarizing results for %s" % (self.pathname))
82 if self.eland_type == ElandLane.ELAND_SINGLE:
83 result = self._update_eland_result(self.pathname)
84 elif self.eland_type == ElandLane.ELAND_MULTI or \
85 self.eland_type == ElandLane.ELAND_EXTENDED:
86 result = self._update_eland_multi(self.pathname)
88 raise NotImplementedError("Only support single/multi/extended eland files")
89 self._match_codes, self._mapped_reads, self._reads = result
91 def _update_eland_result(self, pathname):
95 match_codes = {'NM':0, 'QC':0, 'RM':0,
96 'U0':0, 'U1':0, 'U2':0,
97 'R0':0, 'R1':0, 'R2':0,
99 for line in autoopen(pathname,'r'):
101 fields = line.split()
103 # match_codes[code] = match_codes.setdefault(code, 0) + 1
104 # the QC/NM etc codes are in the 3rd field and always present
105 match_codes[fields[2]] += 1
106 # ignore lines that don't have a fasta filename
109 fasta = self.genome_map.get(fields[6], fields[6])
110 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
111 return match_codes, mapped_reads, reads
113 def _update_eland_multi(self, pathname):
117 match_codes = {'NM':0, 'QC':0, 'RM':0,
118 'U0':0, 'U1':0, 'U2':0,
119 'R0':0, 'R1':0, 'R2':0,
121 match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
122 for line in autoopen(pathname,'r'):
124 fields = line.split()
125 # fields[2] = QC/NM/or number of matches
126 groups = match_counts_re.match(fields[2])
128 match_codes[fields[2]] += 1
130 # when there are too many hit, eland writes a - where
131 # it would have put the list of hits.
132 # or in a different version of eland, it just leaves
133 # that column blank, and only outputs 3 fields.
134 if len(fields) < 4 or fields[3] == '-':
136 zero_mismatches = int(groups.group(1))
137 if zero_mismatches == 1:
138 match_codes['U0'] += 1
139 elif zero_mismatches < 255:
140 match_codes['R0'] += zero_mismatches
142 one_mismatches = int(groups.group(2))
143 if one_mismatches == 1:
144 match_codes['U1'] += 1
145 elif one_mismatches < 255:
146 match_codes['R1'] += one_mismatches
148 two_mismatches = int(groups.group(3))
149 if two_mismatches == 1:
150 match_codes['U2'] += 1
151 elif two_mismatches < 255:
152 match_codes['R2'] += two_mismatches
155 for match in fields[3].split(','):
156 match_fragment = match.split(':')
157 if len(match_fragment) == 2:
158 chromo = match_fragment[0]
159 pos = match_fragment[1]
161 fasta = self.genome_map.get(chromo, chromo)
162 assert fasta is not None
163 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
164 return match_codes, mapped_reads, reads
166 def _update_name(self):
167 # extract the sample name
168 if self.pathname is None:
171 path, name = os.path.split(self.pathname)
172 split_name = name.split('_')
173 self._sample_name = split_name[0]
175 def _get_sample_name(self):
176 if self._sample_name is None:
178 return self._sample_name
179 sample_name = property(_get_sample_name)
181 def _get_reads(self):
182 if self._reads is None:
185 reads = property(_get_reads)
187 def _get_mapped_reads(self):
188 if self._mapped_reads is None:
190 return self._mapped_reads
191 mapped_reads = property(_get_mapped_reads)
193 def _get_match_codes(self):
194 if self._match_codes is None:
196 return self._match_codes
197 match_codes = property(_get_match_codes)
199 def _get_no_match(self):
200 if self._mapped_reads is None:
202 return self._match_codes['NM']
203 no_match = property(_get_no_match,
204 doc="total reads that didn't match the target genome.")
206 def _get_no_match_percent(self):
207 return float(self.no_match)/self.reads * 100
208 no_match_percent = property(_get_no_match_percent,
209 doc="no match reads as percent of total")
211 def _get_qc_failed(self):
212 if self._mapped_reads is None:
214 return self._match_codes['QC']
215 qc_failed = property(_get_qc_failed,
216 doc="total reads that didn't match the target genome.")
218 def _get_qc_failed_percent(self):
219 return float(self.qc_failed)/self.reads * 100
220 qc_failed_percent = property(_get_qc_failed_percent,
221 doc="QC failed reads as percent of total")
223 def _get_unique_reads(self):
224 if self._mapped_reads is None:
227 for code in ['U0','U1','U2']:
228 sum += self._match_codes[code]
230 unique_reads = property(_get_unique_reads,
231 doc="total unique reads")
233 def _get_repeat_reads(self):
234 if self._mapped_reads is None:
237 for code in ['R0','R1','R2']:
238 sum += self._match_codes[code]
240 repeat_reads = property(_get_repeat_reads,
241 doc="total repeat reads")
243 def get_elements(self):
244 lane = ElementTree.Element(ElandLane.LANE,
246 unicode(ElandLane.XML_VERSION)})
247 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
248 sample_tag.text = self.sample_name
249 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
250 lane_tag.text = str(self.lane_id)
251 if self.end is not None:
252 end_tag = ElementTree.SubElement(lane, ElandLane.END)
253 end_tag.text = str(self.end)
254 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
255 for k, v in self.genome_map.items():
256 item = ElementTree.SubElement(
257 genome_map, ElandLane.GENOME_ITEM,
258 {'name':k, 'value':unicode(v)})
259 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
260 for k, v in self.mapped_reads.items():
261 item = ElementTree.SubElement(
262 mapped_reads, ElandLane.MAPPED_ITEM,
263 {'name':k, 'value':unicode(v)})
264 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
265 for k, v in self.match_codes.items():
266 item = ElementTree.SubElement(
267 match_codes, ElandLane.MATCH_ITEM,
268 {'name':k, 'value':unicode(v)})
269 reads = ElementTree.SubElement(lane, ElandLane.READS)
270 reads.text = unicode(self.reads)
274 def set_elements(self, tree):
275 if tree.tag != ElandLane.LANE:
276 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
279 self._mapped_reads = {}
280 self._match_codes = {}
283 tag = element.tag.lower()
284 if tag == ElandLane.SAMPLE_NAME.lower():
285 self._sample_name = element.text
286 elif tag == ElandLane.LANE_ID.lower():
287 self.lane_id = int(element.text)
288 elif tag == ElandLane.END.lower():
289 self.end = int(element.text)
290 elif tag == ElandLane.GENOME_MAP.lower():
291 for child in element:
292 name = child.attrib['name']
293 value = child.attrib['value']
294 self.genome_map[name] = value
295 elif tag == ElandLane.MAPPED_READS.lower():
296 for child in element:
297 name = child.attrib['name']
298 value = child.attrib['value']
299 self._mapped_reads[name] = int(value)
300 elif tag == ElandLane.MATCH_CODES.lower():
301 for child in element:
302 name = child.attrib['name']
303 value = int(child.attrib['value'])
304 self._match_codes[name] = value
305 elif tag == ElandLane.READS.lower():
306 self._reads = int(element.text)
308 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
312 Summarize information from eland files
316 ELAND = 'ElandCollection'
321 def __init__(self, xml=None):
322 # we need information from the gerald config.xml
323 self.results = [{},{}]
326 self.set_elements(xml)
328 def get_elements(self):
329 root = ElementTree.Element(ELAND.ELAND,
330 {'version': unicode(ELAND.XML_VERSION)})
331 for end in range(len(self.results)):
332 end_results = self.results[end]
333 for lane_id, lane in end_results.items():
334 eland_lane = lane.get_elements()
335 eland_lane.attrib[ELAND.END] = unicode (end)
336 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
337 root.append(eland_lane)
340 def set_elements(self, tree):
341 if tree.tag.lower() != ELAND.ELAND.lower():
342 raise ValueError('Expecting %s', ELAND.ELAND)
343 for element in list(tree):
344 lane_id = int(element.attrib[ELAND.LANE_ID])
345 end = int(element.attrib.get(ELAND.END, 0))
346 lane = ElandLane(xml=element)
347 self.results[end][lane_id] = lane
349 def check_for_eland_file(basedir, pattern, lane_id, end):
351 full_lane_id = lane_id
353 full_lane_id = "%d_%d" % ( lane_id, end )
355 basename = pattern % (full_lane_id,)
356 pathname = os.path.join(basedir, basename)
357 if os.path.exists(pathname):
358 logging.info('found eland file in %s' % (pathname,))
363 def eland(gerald_dir, gerald=None, genome_maps=None):
366 lane_ids = range(1,9)
369 basedirs = [gerald_dir]
371 # if there is a basedir/Temp change basedir to point to the temp
372 # directory, as 1.1rc1 moves most of the files we've historically
373 # cared about to that subdirectory.
374 # we should look into what the official 'result' files are.
375 # and 1.3 moves them back
376 basedir_temp = os.path.join(gerald_dir, 'Temp')
377 if os.path.isdir(basedir_temp):
378 basedirs.append(basedir_temp)
381 # the order in patterns determines the preference for what
383 patterns = ['s_%s_eland_result.txt',
384 's_%s_eland_result.txt.bz2',
385 's_%s_eland_result.txt.gz',
386 's_%s_eland_extended.txt',
387 's_%s_eland_extended.txt.bz2',
388 's_%s_eland_extended.txt.gz',
389 's_%s_eland_multi.txt',
390 's_%s_eland_multi.txt.bz2',
391 's_%s_eland_multi.txt.gz',]
393 for basedir in basedirs:
395 for lane_id in lane_ids:
397 pathname = check_for_eland_file(basedir, p, lane_id, end)
398 if pathname is not None:
401 logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
404 # yes the lane_id is also being computed in ElandLane._update
405 # I didn't want to clutter up my constructor
406 # but I needed to persist the sample_name/lane_id for
407 # runfolder summary_report
408 path, name = os.path.split(pathname)
409 logging.info("Adding eland file %s" %(name,))
410 # split_name = name.split('_')
411 # lane_id = int(split_name[1])
413 if genome_maps is not None:
414 genome_map = genome_maps[lane_id]
415 elif gerald is not None:
416 genome_dir = gerald.lanes[lane_id].eland_genome
417 genome_map = build_genome_fasta_map(genome_dir)
421 eland_result = ElandLane(pathname, lane_id, end, genome_map)
425 effective_end = end - 1
426 e.results[effective_end][lane_id] = eland_result
429 def build_genome_fasta_map(genome_dir):
430 # build fasta to fasta file map
431 logging.info("Building genome map")
432 genome = genome_dir.split(os.path.sep)[-1]
434 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
436 if os.path.islink(vld_file):
438 vld_file = os.path.realpath(vld_file)
439 path, vld_name = os.path.split(vld_file)
440 name, ext = os.path.splitext(vld_name)
442 fasta_map[name] = name
444 fasta_map[name] = os.path.join(genome, name)
448 def extract_eland_sequence(instream, outstream, start, end):
450 Extract a chunk of sequence out of an eland file
452 for line in instream:
453 record = line.split()
455 result = [record[0], record[1][start:end]]
457 result = [record[0][start:end]]
458 outstream.write("\t".join(result))
459 outstream.write(os.linesep)