12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
16 SAMPLE_NAME = 'SampleName'
21 GENOME_MAP = 'GenomeMap'
22 GENOME_ITEM = 'GenomeItem'
23 MAPPED_READS = 'MappedReads'
24 MAPPED_ITEM = 'MappedItem'
25 MATCH_CODES = 'MatchCodes'
34 class ResultLane(object):
36 Base class for result lanes
41 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42 self.pathname = pathname
43 self._sample_name = None
44 self.lane_id = lane_id
49 self.set_elements(xml)
53 Actually read the file and actually count the reads
57 def _update_name(self):
58 # extract the sample name
59 if self.pathname is None:
62 path, name = os.path.split(self.pathname)
63 split_name = name.split('_')
64 self._sample_name = split_name[0]
66 def _get_sample_name(self):
67 if self._sample_name is None:
69 return self._sample_name
70 sample_name = property(_get_sample_name)
73 if self._reads is None:
76 reads = property(_get_reads)
79 class ElandLane(ResultLane):
81 Process an eland result file
85 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
86 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
87 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
88 SCORE_UNRECOGNIZED = 0
92 def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
93 super(ElandLane, self).__init__(pathname, lane_id, end)
95 self._mapped_reads = None
96 self._match_codes = None
97 if genome_map is None:
99 self.genome_map = genome_map
100 self.eland_type = None
103 self.set_elements(xml)
105 def _guess_eland_type(self, pathname):
106 if self.eland_type is None:
107 # attempt autodetect eland file type
108 pathn, name = os.path.split(pathname)
109 if re.search('result', name):
110 self.eland_type = ELAND_SINGLE
111 elif re.search('multi', name):
112 self.eland_type = ELAND_MULTI
113 elif re.search('extended', name):
114 self.eland_type = ELAND_EXTENDED
115 elif re.search('export', name):
116 self.eland_type = ELAND_EXPORT
118 self.eland_type = ELAND_SINGLE
122 Actually read the file and actually count the reads
124 # can't do anything if we don't have a file to process
125 if self.pathname is None:
127 self._guess_eland_type(self.pathname)
129 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
130 raise RuntimeError("Eland isn't done, try again later.")
132 logging.info("summarizing results for %s" % (self.pathname))
134 stream = autoopen(self.pathname, 'r')
135 if self.eland_type == ELAND_SINGLE:
136 result = self._update_eland_result(stream)
137 elif self.eland_type == ELAND_MULTI or \
138 self.eland_type == ELAND_EXTENDED:
139 result = self._update_eland_multi(stream)
140 elif self.eland_type == ELAND_EXPORT:
141 result = self._update_eland_export(stream)
143 raise NotImplementedError("Only support single/multi/extended eland files")
144 self._match_codes, self._mapped_reads, self._reads = result
146 def _update_eland_result(self, instream):
150 match_codes = {'NM':0, 'QC':0, 'RM':0,
151 'U0':0, 'U1':0, 'U2':0,
152 'R0':0, 'R1':0, 'R2':0,
154 for line in instream:
156 fields = line.split()
158 # match_codes[code] = match_codes.setdefault(code, 0) + 1
159 # the QC/NM etc codes are in the 3rd field and always present
160 match_codes[fields[2]] += 1
161 # ignore lines that don't have a fasta filename
164 fasta = self.genome_map.get(fields[6], fields[6])
165 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
166 return match_codes, mapped_reads, reads
168 def _update_eland_multi(self, instream):
169 """Summarize an eland_extend."""
175 match_codes = {'NM':0, 'QC':0, 'RM':0,
176 'U0':0, 'U1':0, 'U2':0,
177 'R0':0, 'R1':0, 'R2':0,
179 for line in instream:
181 fields = line.split()
182 # fields[2] = QC/NM/or number of matches
183 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
185 if score_type == ElandLane.SCORE_READ:
186 # when there are too many hits, eland writes a - where
187 # it would have put the list of hits.
188 # or in a different version of eland, it just leaves
189 # that column blank, and only outputs 3 fields.
190 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
193 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
195 return match_codes, mapped_reads, reads
197 def _update_eland_export(self, instream):
198 """Summarize a gerald export file."""
205 match_codes = {'NM':0, 'QC':0, 'RM':0,
206 'U0':0, 'U1':0, 'U2':0,
207 'R0':0, 'R1':0, 'R2':0,
210 for line in instream:
212 fields = line.split()
213 # fields[2] = QC/NM/or number of matches
214 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
216 if score_type == ElandLane.SCORE_UNRECOGNIZED:
217 # export files have three states for the match field
218 # QC code, count of multi-reads, or a single
219 # read location. The score_mapped_mismatches function
220 # only understands the first two types.
221 # if we get unrecognized, that implies the field is probably
223 code = self._count_mapped_export(mapped_reads,
224 fields[LOCATION_INDEX],
225 fields[DESCRIPTOR_INDEX])
226 match_codes[code] += 1
228 return match_codes, mapped_reads, reads
231 def _score_mapped_mismatches(self, match, match_codes):
232 """Update match_codes with eland map counts, or failure code.
234 Returns True if the read mapped, false if it was an error code.
236 groups = ElandLane.MATCH_COUNTS_RE.match(match)
238 # match is not of the form [\d]+:[\d]+:[\d]+
239 if match_codes.has_key(match):
240 # match is one quality control codes QC/NM etc
241 match_codes[match] += 1
242 return ElandLane.SCORE_QC
244 return ElandLane.SCORE_UNRECOGNIZED
246 # match is of the form [\d]+:[\d]+:[\d]+
248 zero_mismatches = int(groups.group(1))
249 one_mismatches = int(groups.group(2))
250 two_mismatches = int(groups.group(3))
252 if zero_mismatches == 1:
253 match_codes['U0'] += 1
254 elif zero_mismatches < 255:
255 match_codes['R0'] += zero_mismatches
257 if one_mismatches == 1:
258 match_codes['U1'] += 1
259 elif one_mismatches < 255:
260 match_codes['R1'] += one_mismatches
262 if two_mismatches == 1:
263 match_codes['U2'] += 1
264 elif two_mismatches < 255:
265 match_codes['R2'] += two_mismatches
267 return ElandLane.SCORE_READ
270 def _count_mapped_multireads(self, mapped_reads, match_string):
272 for match in match_string.split(','):
273 match_fragment = match.split(':')
274 if len(match_fragment) == 2:
275 chromo = match_fragment[0]
276 pos = match_fragment[1]
278 fasta = self.genome_map.get(chromo, chromo)
279 assert fasta is not None
280 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
283 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
284 """Count a read as defined in an export file
286 match_string contains the chromosome
287 descriptor contains the an ecoding of bases that match, mismatch,
289 returns the "best" match code
291 Currently "best" match code is ignoring the possibility of in-dels
293 chromo = match_string
294 fasta = self.genome_map.get(chromo, chromo)
295 assert fasta is not None
296 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
298 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
299 if len(mismatch_bases) == 0:
301 elif len(mismatch_bases) == 1:
307 def _get_mapped_reads(self):
308 if self._mapped_reads is None:
310 return self._mapped_reads
311 mapped_reads = property(_get_mapped_reads)
313 def _get_match_codes(self):
314 if self._match_codes is None:
316 return self._match_codes
317 match_codes = property(_get_match_codes)
319 def _get_no_match(self):
320 if self._mapped_reads is None:
322 return self._match_codes['NM']
323 no_match = property(_get_no_match,
324 doc="total reads that didn't match the target genome.")
326 def _get_no_match_percent(self):
327 return float(self.no_match)/self.reads * 100
328 no_match_percent = property(_get_no_match_percent,
329 doc="no match reads as percent of total")
331 def _get_qc_failed(self):
332 if self._mapped_reads is None:
334 return self._match_codes['QC']
335 qc_failed = property(_get_qc_failed,
336 doc="total reads that didn't match the target genome.")
338 def _get_qc_failed_percent(self):
339 return float(self.qc_failed)/self.reads * 100
340 qc_failed_percent = property(_get_qc_failed_percent,
341 doc="QC failed reads as percent of total")
343 def _get_unique_reads(self):
344 if self._mapped_reads is None:
347 for code in ['U0','U1','U2']:
348 sum += self._match_codes[code]
350 unique_reads = property(_get_unique_reads,
351 doc="total unique reads")
353 def _get_repeat_reads(self):
354 if self._mapped_reads is None:
357 for code in ['R0','R1','R2']:
358 sum += self._match_codes[code]
360 repeat_reads = property(_get_repeat_reads,
361 doc="total repeat reads")
363 def get_elements(self):
364 lane = ElementTree.Element(ElandLane.LANE,
366 unicode(ElandLane.XML_VERSION)})
367 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
368 sample_tag.text = self.sample_name
369 lane_tag = ElementTree.SubElement(lane, LANE_ID)
370 lane_tag.text = str(self.lane_id)
371 if self.end is not None:
372 end_tag = ElementTree.SubElement(lane, END)
373 end_tag.text = str(self.end)
374 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
375 for k, v in self.genome_map.items():
376 item = ElementTree.SubElement(
377 genome_map, GENOME_ITEM,
378 {'name':k, 'value':unicode(v)})
379 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
380 for k, v in self.mapped_reads.items():
381 item = ElementTree.SubElement(
382 mapped_reads, MAPPED_ITEM,
383 {'name':k, 'value':unicode(v)})
384 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
385 for k, v in self.match_codes.items():
386 item = ElementTree.SubElement(
387 match_codes, MATCH_ITEM,
388 {'name':k, 'value':unicode(v)})
389 reads = ElementTree.SubElement(lane, READS)
390 reads.text = unicode(self.reads)
394 def set_elements(self, tree):
395 if tree.tag != ElandLane.LANE:
396 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
399 self._mapped_reads = {}
400 self._match_codes = {}
403 tag = element.tag.lower()
404 if tag == SAMPLE_NAME.lower():
405 self._sample_name = element.text
406 elif tag == LANE_ID.lower():
407 self.lane_id = int(element.text)
408 elif tag == END.lower():
409 self.end = int(element.text)
410 elif tag == GENOME_MAP.lower():
411 for child in element:
412 name = child.attrib['name']
413 value = child.attrib['value']
414 self.genome_map[name] = value
415 elif tag == MAPPED_READS.lower():
416 for child in element:
417 name = child.attrib['name']
418 value = child.attrib['value']
419 self._mapped_reads[name] = int(value)
420 elif tag == MATCH_CODES.lower():
421 for child in element:
422 name = child.attrib['name']
423 value = int(child.attrib['value'])
424 self._match_codes[name] = value
425 elif tag == READS.lower():
426 self._reads = int(element.text)
428 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
430 class SequenceLane(ResultLane):
432 LANE = 'SequenceLane'
433 SEQUENCE_TYPE = 'SequenceType'
438 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
440 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
441 self.sequence_type = None
442 super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
444 def _guess_sequence_type(self, pathname):
446 Determine if we have a scarf or fastq sequence file
448 f = open(pathname,'r')
453 # fastq starts with a @
454 self.sequence_type = SequenceLane.FASTQ_TYPE
456 self.sequence_type = SequenceLane.SCARF_TYPE
457 return self.sequence_type
461 Actually read the file and actually count the reads
463 # can't do anything if we don't have a file to process
464 if self.pathname is None:
467 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
468 raise RuntimeError("Sequencing isn't done, try again later.")
470 self._guess_sequence_type(self.pathname)
472 logging.info("summarizing results for %s" % (self.pathname))
474 f = open(self.pathname)
475 for l in f.xreadlines():
479 if self.sequence_type == SequenceLane.SCARF_TYPE:
481 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
482 self._reads = lines / 4
484 raise NotImplementedError("This only supports scarf or fastq squence files")
486 def get_elements(self):
487 lane = ElementTree.Element(SequenceLane.LANE,
489 unicode(SequenceLane.XML_VERSION)})
490 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
491 sample_tag.text = self.sample_name
492 lane_tag = ElementTree.SubElement(lane, LANE_ID)
493 lane_tag.text = str(self.lane_id)
494 if self.end is not None:
495 end_tag = ElementTree.SubElement(lane, END)
496 end_tag.text = str(self.end)
497 reads = ElementTree.SubElement(lane, READS)
498 reads.text = unicode(self.reads)
499 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
500 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
504 def set_elements(self, tree):
505 if tree.tag != SequenceLane.LANE:
506 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
507 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
510 tag = element.tag.lower()
511 if tag == SAMPLE_NAME.lower():
512 self._sample_name = element.text
513 elif tag == LANE_ID.lower():
514 self.lane_id = int(element.text)
515 elif tag == END.lower():
516 self.end = int(element.text)
517 elif tag == READS.lower():
518 self._reads = int(element.text)
519 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
520 self.sequence_type = lookup_sequence_type.get(element.text, None)
522 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
526 Summarize information from eland files
530 ELAND = 'ElandCollection'
535 def __init__(self, xml=None):
536 # we need information from the gerald config.xml
537 self.results = [{},{}]
540 self.set_elements(xml)
542 if len(self.results[0]) == 0:
543 # Initialize our eland object with meaningless junk
545 self.results[0][l] = ResultLane(lane_id=l, end=0)
548 def get_elements(self):
549 root = ElementTree.Element(ELAND.ELAND,
550 {'version': unicode(ELAND.XML_VERSION)})
551 for end in range(len(self.results)):
552 end_results = self.results[end]
553 for lane_id, lane in end_results.items():
554 eland_lane = lane.get_elements()
555 eland_lane.attrib[ELAND.END] = unicode (end)
556 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
557 root.append(eland_lane)
560 def set_elements(self, tree):
561 if tree.tag.lower() != ELAND.ELAND.lower():
562 raise ValueError('Expecting %s', ELAND.ELAND)
563 for element in list(tree):
564 lane_id = int(element.attrib[ELAND.LANE_ID])
565 end = int(element.attrib.get(ELAND.END, 0))
566 if element.tag.lower() == ElandLane.LANE.lower():
567 lane = ElandLane(xml=element)
568 elif element.tag.lower() == SequenceLane.LANE.lower():
569 lane = SequenceLane(xml=element)
571 self.results[end][lane_id] = lane
573 def check_for_eland_file(basedir, pattern, lane_id, end):
575 full_lane_id = lane_id
577 full_lane_id = "%d_%d" % ( lane_id, end )
579 basename = pattern % (full_lane_id,)
580 logging.info("Eland pattern: %s" %(basename,))
581 pathname = os.path.join(basedir, basename)
582 if os.path.exists(pathname):
583 logging.info('found eland file in %s' % (pathname,))
588 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
589 # yes the lane_id is also being computed in ElandLane._update
590 # I didn't want to clutter up my constructor
591 # but I needed to persist the sample_name/lane_id for
592 # runfolder summary_report
593 path, name = os.path.split(pathname)
594 logging.info("Adding eland file %s" %(name,))
595 # split_name = name.split('_')
596 # lane_id = int(split_name[1])
598 if genome_maps is not None:
599 genome_map = genome_maps[lane_id]
600 elif gerald is not None:
601 genome_dir = gerald.lanes[lane_id].eland_genome
602 genome_map = build_genome_fasta_map(genome_dir)
606 lane = ElandLane(pathname, lane_id, end, genome_map)
611 effective_end = end - 1
613 results[effective_end][lane_id] = lane
615 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
616 result = SequenceLane(pathname, lane_id, end)
621 effective_end = end - 1
623 results[effective_end][lane_id] = result
626 def eland(gerald_dir, gerald=None, genome_maps=None):
629 lane_ids = range(1,9)
632 basedirs = [gerald_dir]
634 # if there is a basedir/Temp change basedir to point to the temp
635 # directory, as 1.1rc1 moves most of the files we've historically
636 # cared about to that subdirectory.
637 # we should look into what the official 'result' files are.
638 # and 1.3 moves them back
639 basedir_temp = os.path.join(gerald_dir, 'Temp')
640 if os.path.isdir(basedir_temp):
641 basedirs.append(basedir_temp)
644 # the order in patterns determines the preference for what
648 patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
649 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
650 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
651 ('s_%s_eland_extended.txt', MAPPED_ELAND),
652 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
653 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
654 ('s_%s_eland_multi.txt', MAPPED_ELAND),
655 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
656 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
657 ('s_%s_export.txt', MAPPED_ELAND),
658 ('s_%s_export.txt.bz2', MAPPED_ELAND),
659 ('s_%s_export.txt.gz', MAPPED_ELAND),
660 ('s_%s_sequence.txt', SEQUENCE),]
662 for basedir in basedirs:
664 for lane_id in lane_ids:
666 pathname = check_for_eland_file(basedir, p[0], lane_id, end)
667 if pathname is not None:
668 if p[1] == MAPPED_ELAND:
669 update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
670 elif p[1] == SEQUENCE:
671 update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
674 logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
679 def build_genome_fasta_map(genome_dir):
680 # build fasta to fasta file map
681 logging.info("Building genome map")
682 genome = genome_dir.split(os.path.sep)[-1]
684 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
686 if os.path.islink(vld_file):
688 vld_file = os.path.realpath(vld_file)
689 path, vld_name = os.path.split(vld_file)
690 name, ext = os.path.splitext(vld_name)
692 fasta_map[name] = name
694 fasta_map[name] = os.path.join(genome, name)
698 def extract_eland_sequence(instream, outstream, start, end):
700 Extract a chunk of sequence out of an eland file
702 for line in instream:
703 record = line.split()
705 result = [record[0], record[1][start:end]]
707 result = [record[0][start:end]]
708 outstream.write("\t".join(result))
709 outstream.write(os.linesep)
712 def main(cmdline=None):
713 """Run eland extraction against the specified gerald directory"""
714 from optparse import OptionParser
715 parser = OptionParser("%prog: <gerald dir>+")
716 opts, args = parser.parse_args(cmdline)
717 logging.basicConfig(level=logging.DEBUG)
719 logging.info("Starting scan of %s" % (a,))
721 print e.get_elements()
726 if __name__ == "__main__":