11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
15 SAMPLE_NAME = 'SampleName'
20 GENOME_MAP = 'GenomeMap'
21 GENOME_ITEM = 'GenomeItem'
22 MAPPED_READS = 'MappedReads'
23 MAPPED_ITEM = 'MappedItem'
24 MATCH_CODES = 'MatchCodes'
34 class ResultLane(object):
36 Base class for result lanes
41 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42 self.pathname = pathname
43 self._sample_name = None
44 self.lane_id = lane_id
49 self.set_elements(xml)
53 Actually read the file and actually count the reads
55 raise NotImplementedError("Can't count abstract classes")
57 def _update_name(self):
58 # extract the sample name
59 if self.pathname is None:
62 path, name = os.path.split(self.pathname)
63 split_name = name.split('_')
64 self._sample_name = split_name[0]
66 def _get_sample_name(self):
67 if self._sample_name is None:
69 return self._sample_name
70 sample_name = property(_get_sample_name)
73 if self._reads is None:
76 reads = property(_get_reads)
79 class ElandLane(ResultLane):
81 Process an eland result file
86 def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
87 super(ElandLane, self).__init__(pathname, lane_id, end)
89 self._mapped_reads = None
90 self._match_codes = None
91 if genome_map is None:
93 self.genome_map = genome_map
94 self.eland_type = None
97 self.set_elements(xml)
99 def _guess_eland_type(self, pathname):
100 if self.eland_type is None:
101 # attempt autodetect eland file type
102 pathn, name = os.path.split(pathname)
103 if re.search('result', name):
104 self.eland_type = ELAND_SINGLE
105 elif re.search('multi', name):
106 self.eland_type = ELAND_MULTI
107 elif re.search('extended', name):
108 self.eland_type = ELAND_EXTENDED
109 elif re.search('export', name):
110 self.eland_type = ELAND_EXPORT
112 self.eland_type = ELAND_SINGLE
116 Actually read the file and actually count the reads
118 # can't do anything if we don't have a file to process
119 if self.pathname is None:
121 self._guess_eland_type(self.pathname)
123 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
124 raise RuntimeError("Eland isn't done, try again later.")
126 logging.info("summarizing results for %s" % (self.pathname))
128 if self.eland_type == ELAND_SINGLE:
129 result = self._update_eland_result(self.pathname)
130 elif self.eland_type == ELAND_MULTI or \
131 self.eland_type == ELAND_EXTENDED:
132 result = self._update_eland_multi(self.pathname)
134 raise NotImplementedError("Only support single/multi/extended eland files")
135 self._match_codes, self._mapped_reads, self._reads = result
137 def _update_eland_result(self, pathname):
141 match_codes = {'NM':0, 'QC':0, 'RM':0,
142 'U0':0, 'U1':0, 'U2':0,
143 'R0':0, 'R1':0, 'R2':0,
145 for line in autoopen(pathname,'r'):
147 fields = line.split()
149 # match_codes[code] = match_codes.setdefault(code, 0) + 1
150 # the QC/NM etc codes are in the 3rd field and always present
151 match_codes[fields[2]] += 1
152 # ignore lines that don't have a fasta filename
155 fasta = self.genome_map.get(fields[6], fields[6])
156 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
157 return match_codes, mapped_reads, reads
159 def _update_eland_multi(self, pathname):
163 match_codes = {'NM':0, 'QC':0, 'RM':0,
164 'U0':0, 'U1':0, 'U2':0,
165 'R0':0, 'R1':0, 'R2':0,
167 match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
168 for line in autoopen(pathname,'r'):
170 fields = line.split()
171 # fields[2] = QC/NM/or number of matches
172 groups = match_counts_re.match(fields[2])
174 match_codes[fields[2]] += 1
176 # when there are too many hit, eland writes a - where
177 # it would have put the list of hits.
178 # or in a different version of eland, it just leaves
179 # that column blank, and only outputs 3 fields.
180 if len(fields) < 4 or fields[3] == '-':
182 zero_mismatches = int(groups.group(1))
183 if zero_mismatches == 1:
184 match_codes['U0'] += 1
185 elif zero_mismatches < 255:
186 match_codes['R0'] += zero_mismatches
188 one_mismatches = int(groups.group(2))
189 if one_mismatches == 1:
190 match_codes['U1'] += 1
191 elif one_mismatches < 255:
192 match_codes['R1'] += one_mismatches
194 two_mismatches = int(groups.group(3))
195 if two_mismatches == 1:
196 match_codes['U2'] += 1
197 elif two_mismatches < 255:
198 match_codes['R2'] += two_mismatches
201 for match in fields[3].split(','):
202 match_fragment = match.split(':')
203 if len(match_fragment) == 2:
204 chromo = match_fragment[0]
205 pos = match_fragment[1]
207 fasta = self.genome_map.get(chromo, chromo)
208 assert fasta is not None
209 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
210 return match_codes, mapped_reads, reads
212 def _get_mapped_reads(self):
213 if self._mapped_reads is None:
215 return self._mapped_reads
216 mapped_reads = property(_get_mapped_reads)
218 def _get_match_codes(self):
219 if self._match_codes is None:
221 return self._match_codes
222 match_codes = property(_get_match_codes)
224 def _get_no_match(self):
225 if self._mapped_reads is None:
227 return self._match_codes['NM']
228 no_match = property(_get_no_match,
229 doc="total reads that didn't match the target genome.")
231 def _get_no_match_percent(self):
232 return float(self.no_match)/self.reads * 100
233 no_match_percent = property(_get_no_match_percent,
234 doc="no match reads as percent of total")
236 def _get_qc_failed(self):
237 if self._mapped_reads is None:
239 return self._match_codes['QC']
240 qc_failed = property(_get_qc_failed,
241 doc="total reads that didn't match the target genome.")
243 def _get_qc_failed_percent(self):
244 return float(self.qc_failed)/self.reads * 100
245 qc_failed_percent = property(_get_qc_failed_percent,
246 doc="QC failed reads as percent of total")
248 def _get_unique_reads(self):
249 if self._mapped_reads is None:
252 for code in ['U0','U1','U2']:
253 sum += self._match_codes[code]
255 unique_reads = property(_get_unique_reads,
256 doc="total unique reads")
258 def _get_repeat_reads(self):
259 if self._mapped_reads is None:
262 for code in ['R0','R1','R2']:
263 sum += self._match_codes[code]
265 repeat_reads = property(_get_repeat_reads,
266 doc="total repeat reads")
268 def get_elements(self):
269 lane = ElementTree.Element(ElandLane.LANE,
271 unicode(ElandLane.XML_VERSION)})
272 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
273 sample_tag.text = self.sample_name
274 lane_tag = ElementTree.SubElement(lane, LANE_ID)
275 lane_tag.text = str(self.lane_id)
276 if self.end is not None:
277 end_tag = ElementTree.SubElement(lane, END)
278 end_tag.text = str(self.end)
279 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
280 for k, v in self.genome_map.items():
281 item = ElementTree.SubElement(
282 genome_map, GENOME_ITEM,
283 {'name':k, 'value':unicode(v)})
284 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
285 for k, v in self.mapped_reads.items():
286 item = ElementTree.SubElement(
287 mapped_reads, MAPPED_ITEM,
288 {'name':k, 'value':unicode(v)})
289 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
290 for k, v in self.match_codes.items():
291 item = ElementTree.SubElement(
292 match_codes, MATCH_ITEM,
293 {'name':k, 'value':unicode(v)})
294 reads = ElementTree.SubElement(lane, READS)
295 reads.text = unicode(self.reads)
299 def set_elements(self, tree):
300 if tree.tag != ElandLane.LANE:
301 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
304 self._mapped_reads = {}
305 self._match_codes = {}
308 tag = element.tag.lower()
309 if tag == SAMPLE_NAME.lower():
310 self._sample_name = element.text
311 elif tag == LANE_ID.lower():
312 self.lane_id = int(element.text)
313 elif tag == END.lower():
314 self.end = int(element.text)
315 elif tag == GENOME_MAP.lower():
316 for child in element:
317 name = child.attrib['name']
318 value = child.attrib['value']
319 self.genome_map[name] = value
320 elif tag == MAPPED_READS.lower():
321 for child in element:
322 name = child.attrib['name']
323 value = child.attrib['value']
324 self._mapped_reads[name] = int(value)
325 elif tag == MATCH_CODES.lower():
326 for child in element:
327 name = child.attrib['name']
328 value = int(child.attrib['value'])
329 self._match_codes[name] = value
330 elif tag == READS.lower():
331 self._reads = int(element.text)
333 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
335 class SequenceLane(ResultLane):
337 LANE = 'SequenceLane'
338 SEQUENCE_TYPE = 'SequenceType'
343 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
345 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
346 self.sequence_type = None
347 super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
349 def _guess_sequence_type(self, pathname):
351 Determine if we have a scarf or fastq sequence file
353 f = open(pathname,'r')
358 # fastq starts with a @
359 self.sequence_type = SequenceLane.FASTQ_TYPE
361 self.sequence_type = SequenceLane.SCARF_TYPE
362 return self.sequence_type
366 Actually read the file and actually count the reads
368 # can't do anything if we don't have a file to process
369 if self.pathname is None:
372 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
373 raise RuntimeError("Sequencing isn't done, try again later.")
375 self._guess_sequence_type(self.pathname)
377 logging.info("summarizing results for %s" % (self.pathname))
379 f = open(self.pathname)
380 for l in f.xreadlines():
384 if self.sequence_type == SequenceLane.SCARF_TYPE:
386 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
387 self._reads = lines / 4
389 raise NotImplementedError("This only supports scarf or fastq squence files")
391 def get_elements(self):
392 lane = ElementTree.Element(SequenceLane.LANE,
394 unicode(SequenceLane.XML_VERSION)})
395 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
396 sample_tag.text = self.sample_name
397 lane_tag = ElementTree.SubElement(lane, LANE_ID)
398 lane_tag.text = str(self.lane_id)
399 if self.end is not None:
400 end_tag = ElementTree.SubElement(lane, END)
401 end_tag.text = str(self.end)
402 reads = ElementTree.SubElement(lane, READS)
403 reads.text = unicode(self.reads)
404 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
405 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
409 def set_elements(self, tree):
410 if tree.tag != SequenceLane.LANE:
411 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
412 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
415 tag = element.tag.lower()
416 if tag == SAMPLE_NAME.lower():
417 self._sample_name = element.text
418 elif tag == LANE_ID.lower():
419 self.lane_id = int(element.text)
420 elif tag == END.lower():
421 self.end = int(element.text)
422 elif tag == READS.lower():
423 self._reads = int(element.text)
424 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
425 self.sequence_type = lookup_sequence_type.get(element.text, None)
426 print self.sequence_type
428 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
432 Summarize information from eland files
436 ELAND = 'ElandCollection'
441 def __init__(self, xml=None):
442 # we need information from the gerald config.xml
443 self.results = [{},{}]
446 self.set_elements(xml)
448 def get_elements(self):
449 root = ElementTree.Element(ELAND.ELAND,
450 {'version': unicode(ELAND.XML_VERSION)})
451 for end in range(len(self.results)):
452 end_results = self.results[end]
453 for lane_id, lane in end_results.items():
454 eland_lane = lane.get_elements()
455 eland_lane.attrib[ELAND.END] = unicode (end)
456 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
457 root.append(eland_lane)
460 def set_elements(self, tree):
461 if tree.tag.lower() != ELAND.ELAND.lower():
462 raise ValueError('Expecting %s', ELAND.ELAND)
463 for element in list(tree):
464 lane_id = int(element.attrib[ELAND.LANE_ID])
465 end = int(element.attrib.get(ELAND.END, 0))
466 if element.tag.lower() == ElandLane.LANE.lower():
467 lane = ElandLane(xml=element)
468 elif element.tag.lower() == SequenceLane.LANE.lower():
469 lane = SequenceLane(xml=element)
471 self.results[end][lane_id] = lane
473 def check_for_eland_file(basedir, pattern, lane_id, end):
475 full_lane_id = lane_id
477 full_lane_id = "%d_%d" % ( lane_id, end )
479 basename = pattern % (full_lane_id,)
480 pathname = os.path.join(basedir, basename)
481 if os.path.exists(pathname):
482 logging.info('found eland file in %s' % (pathname,))
487 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
488 # yes the lane_id is also being computed in ElandLane._update
489 # I didn't want to clutter up my constructor
490 # but I needed to persist the sample_name/lane_id for
491 # runfolder summary_report
492 path, name = os.path.split(pathname)
493 logging.info("Adding eland file %s" %(name,))
494 # split_name = name.split('_')
495 # lane_id = int(split_name[1])
497 if genome_maps is not None:
498 genome_map = genome_maps[lane_id]
499 elif gerald is not None:
500 genome_dir = gerald.lanes[lane_id].eland_genome
501 genome_map = build_genome_fasta_map(genome_dir)
505 lane = ElandLane(pathname, lane_id, end, genome_map)
510 effective_end = end - 1
512 results[effective_end][lane_id] = lane
514 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
515 result = SequenceLane(pathname, lane_id, end)
520 effective_end = end - 1
522 results[effective_end][lane_id] = result
525 def eland(gerald_dir, gerald=None, genome_maps=None):
528 lane_ids = range(1,9)
531 basedirs = [gerald_dir]
533 # if there is a basedir/Temp change basedir to point to the temp
534 # directory, as 1.1rc1 moves most of the files we've historically
535 # cared about to that subdirectory.
536 # we should look into what the official 'result' files are.
537 # and 1.3 moves them back
538 basedir_temp = os.path.join(gerald_dir, 'Temp')
539 if os.path.isdir(basedir_temp):
540 basedirs.append(basedir_temp)
543 # the order in patterns determines the preference for what
547 patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
548 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
549 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
550 ('s_%s_eland_extended.txt', MAPPED_ELAND),
551 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
552 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
553 ('s_%s_eland_multi.txt', MAPPED_ELAND),
554 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
555 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
556 ('s_%s_sequence.txt', SEQUENCE),]
558 for basedir in basedirs:
560 for lane_id in lane_ids:
562 pathname = check_for_eland_file(basedir, p[0], lane_id, end)
563 if pathname is not None:
564 if p[1] == MAPPED_ELAND:
565 update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
566 elif p[1] == SEQUENCE:
567 update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
570 logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
575 def build_genome_fasta_map(genome_dir):
576 # build fasta to fasta file map
577 logging.info("Building genome map")
578 genome = genome_dir.split(os.path.sep)[-1]
580 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
582 if os.path.islink(vld_file):
584 vld_file = os.path.realpath(vld_file)
585 path, vld_name = os.path.split(vld_file)
586 name, ext = os.path.splitext(vld_name)
588 fasta_map[name] = name
590 fasta_map[name] = os.path.join(genome, name)
594 def extract_eland_sequence(instream, outstream, start, end):
596 Extract a chunk of sequence out of an eland file
598 for line in instream:
599 record = line.split()
601 result = [record[0], record[1][start:end]]
603 result = [record[0][start:end]]
604 outstream.write("\t".join(result))
605 outstream.write(os.linesep)