12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
16 SAMPLE_NAME = 'SampleName'
21 GENOME_MAP = 'GenomeMap'
22 GENOME_ITEM = 'GenomeItem'
23 MAPPED_READS = 'MappedReads'
24 MAPPED_ITEM = 'MappedItem'
25 MATCH_CODES = 'MatchCodes'
34 class ResultLane(object):
36 Base class for result lanes
41 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42 self.pathname = pathname
43 self._sample_name = None
44 self.lane_id = lane_id
49 self.set_elements(xml)
53 Actually read the file and actually count the reads
57 def _update_name(self):
58 # extract the sample name
59 if self.pathname is None:
62 path, name = os.path.split(self.pathname)
63 split_name = name.split('_')
64 self._sample_name = split_name[0]
66 def _get_sample_name(self):
67 if self._sample_name is None:
69 return self._sample_name
70 sample_name = property(_get_sample_name)
73 if self._reads is None:
76 reads = property(_get_reads)
78 def get_elements(self):
81 class ElandLane(ResultLane):
83 Process an eland result file
87 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
88 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
89 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
90 SCORE_UNRECOGNIZED = 0
94 def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
95 super(ElandLane, self).__init__(pathname, lane_id, end)
97 self._mapped_reads = None
98 self._match_codes = None
99 if genome_map is None:
101 self.genome_map = genome_map
102 self.eland_type = None
105 self.set_elements(xml)
107 def _guess_eland_type(self, pathname):
108 if self.eland_type is None:
109 # attempt autodetect eland file type
110 pathn, name = os.path.split(pathname)
111 if re.search('result', name):
112 self.eland_type = ELAND_SINGLE
113 elif re.search('multi', name):
114 self.eland_type = ELAND_MULTI
115 elif re.search('extended', name):
116 self.eland_type = ELAND_EXTENDED
117 elif re.search('export', name):
118 self.eland_type = ELAND_EXPORT
120 self.eland_type = ELAND_SINGLE
124 Actually read the file and actually count the reads
126 # can't do anything if we don't have a file to process
127 if self.pathname is None:
129 self._guess_eland_type(self.pathname)
131 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
132 raise RuntimeError("Eland isn't done, try again later.")
134 logging.info("summarizing results for %s" % (self.pathname))
136 stream = autoopen(self.pathname, 'r')
137 if self.eland_type == ELAND_SINGLE:
138 result = self._update_eland_result(stream)
139 elif self.eland_type == ELAND_MULTI or \
140 self.eland_type == ELAND_EXTENDED:
141 result = self._update_eland_multi(stream)
142 elif self.eland_type == ELAND_EXPORT:
143 result = self._update_eland_export(stream)
145 raise NotImplementedError("Only support single/multi/extended eland files")
146 self._match_codes, self._mapped_reads, self._reads = result
148 def _update_eland_result(self, instream):
152 match_codes = {'NM':0, 'QC':0, 'RM':0,
153 'U0':0, 'U1':0, 'U2':0,
154 'R0':0, 'R1':0, 'R2':0,
156 for line in instream:
158 fields = line.split()
160 # match_codes[code] = match_codes.setdefault(code, 0) + 1
161 # the QC/NM etc codes are in the 3rd field and always present
162 match_codes[fields[2]] += 1
163 # ignore lines that don't have a fasta filename
166 fasta = self.genome_map.get(fields[6], fields[6])
167 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
168 return match_codes, mapped_reads, reads
170 def _update_eland_multi(self, instream):
171 """Summarize an eland_extend."""
177 match_codes = {'NM':0, 'QC':0, 'RM':0,
178 'U0':0, 'U1':0, 'U2':0,
179 'R0':0, 'R1':0, 'R2':0,
181 for line in instream:
183 fields = line.split()
184 # fields[2] = QC/NM/or number of matches
185 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
187 if score_type == ElandLane.SCORE_READ:
188 # when there are too many hits, eland writes a - where
189 # it would have put the list of hits.
190 # or in a different version of eland, it just leaves
191 # that column blank, and only outputs 3 fields.
192 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
195 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
197 return match_codes, mapped_reads, reads
199 def _update_eland_export(self, instream):
200 """Summarize a gerald export file."""
207 match_codes = {'NM':0, 'QC':0, 'RM':0,
208 'U0':0, 'U1':0, 'U2':0,
209 'R0':0, 'R1':0, 'R2':0,
212 for line in instream:
214 fields = line.split()
215 # fields[2] = QC/NM/or number of matches
216 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
218 if score_type == ElandLane.SCORE_UNRECOGNIZED:
219 # export files have three states for the match field
220 # QC code, count of multi-reads, or a single
221 # read location. The score_mapped_mismatches function
222 # only understands the first two types.
223 # if we get unrecognized, that implies the field is probably
225 code = self._count_mapped_export(mapped_reads,
226 fields[LOCATION_INDEX],
227 fields[DESCRIPTOR_INDEX])
228 match_codes[code] += 1
230 return match_codes, mapped_reads, reads
233 def _score_mapped_mismatches(self, match, match_codes):
234 """Update match_codes with eland map counts, or failure code.
236 Returns True if the read mapped, false if it was an error code.
238 groups = ElandLane.MATCH_COUNTS_RE.match(match)
240 # match is not of the form [\d]+:[\d]+:[\d]+
241 if match_codes.has_key(match):
242 # match is one quality control codes QC/NM etc
243 match_codes[match] += 1
244 return ElandLane.SCORE_QC
246 return ElandLane.SCORE_UNRECOGNIZED
248 # match is of the form [\d]+:[\d]+:[\d]+
250 zero_mismatches = int(groups.group(1))
251 one_mismatches = int(groups.group(2))
252 two_mismatches = int(groups.group(3))
254 if zero_mismatches == 1:
255 match_codes['U0'] += 1
256 elif zero_mismatches < 255:
257 match_codes['R0'] += zero_mismatches
259 if one_mismatches == 1:
260 match_codes['U1'] += 1
261 elif one_mismatches < 255:
262 match_codes['R1'] += one_mismatches
264 if two_mismatches == 1:
265 match_codes['U2'] += 1
266 elif two_mismatches < 255:
267 match_codes['R2'] += two_mismatches
269 return ElandLane.SCORE_READ
272 def _count_mapped_multireads(self, mapped_reads, match_string):
274 for match in match_string.split(','):
275 match_fragment = match.split(':')
276 if len(match_fragment) == 2:
277 chromo = match_fragment[0]
278 pos = match_fragment[1]
280 fasta = self.genome_map.get(chromo, chromo)
281 assert fasta is not None
282 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
285 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
286 """Count a read as defined in an export file
288 match_string contains the chromosome
289 descriptor contains the an ecoding of bases that match, mismatch,
291 returns the "best" match code
293 Currently "best" match code is ignoring the possibility of in-dels
295 chromo = match_string
296 fasta = self.genome_map.get(chromo, chromo)
297 assert fasta is not None
298 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
300 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
301 if len(mismatch_bases) == 0:
303 elif len(mismatch_bases) == 1:
309 def _get_mapped_reads(self):
310 if self._mapped_reads is None:
312 return self._mapped_reads
313 mapped_reads = property(_get_mapped_reads)
315 def _get_match_codes(self):
316 if self._match_codes is None:
318 return self._match_codes
319 match_codes = property(_get_match_codes)
321 def _get_no_match(self):
322 if self._mapped_reads is None:
324 return self._match_codes['NM']
325 no_match = property(_get_no_match,
326 doc="total reads that didn't match the target genome.")
328 def _get_no_match_percent(self):
329 return float(self.no_match)/self.reads * 100
330 no_match_percent = property(_get_no_match_percent,
331 doc="no match reads as percent of total")
333 def _get_qc_failed(self):
334 if self._mapped_reads is None:
336 return self._match_codes['QC']
337 qc_failed = property(_get_qc_failed,
338 doc="total reads that didn't match the target genome.")
340 def _get_qc_failed_percent(self):
341 return float(self.qc_failed)/self.reads * 100
342 qc_failed_percent = property(_get_qc_failed_percent,
343 doc="QC failed reads as percent of total")
345 def _get_unique_reads(self):
346 if self._mapped_reads is None:
349 for code in ['U0','U1','U2']:
350 sum += self._match_codes[code]
352 unique_reads = property(_get_unique_reads,
353 doc="total unique reads")
355 def _get_repeat_reads(self):
356 if self._mapped_reads is None:
359 for code in ['R0','R1','R2']:
360 sum += self._match_codes[code]
362 repeat_reads = property(_get_repeat_reads,
363 doc="total repeat reads")
365 def get_elements(self):
366 lane = ElementTree.Element(ElandLane.LANE,
368 unicode(ElandLane.XML_VERSION)})
369 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
370 sample_tag.text = self.sample_name
371 lane_tag = ElementTree.SubElement(lane, LANE_ID)
372 lane_tag.text = str(self.lane_id)
373 if self.end is not None:
374 end_tag = ElementTree.SubElement(lane, END)
375 end_tag.text = str(self.end)
376 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
377 for k, v in self.genome_map.items():
378 item = ElementTree.SubElement(
379 genome_map, GENOME_ITEM,
380 {'name':k, 'value':unicode(v)})
381 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
382 for k, v in self.mapped_reads.items():
383 item = ElementTree.SubElement(
384 mapped_reads, MAPPED_ITEM,
385 {'name':k, 'value':unicode(v)})
386 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
387 for k, v in self.match_codes.items():
388 item = ElementTree.SubElement(
389 match_codes, MATCH_ITEM,
390 {'name':k, 'value':unicode(v)})
391 reads = ElementTree.SubElement(lane, READS)
392 reads.text = unicode(self.reads)
396 def set_elements(self, tree):
397 if tree.tag != ElandLane.LANE:
398 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
401 self._mapped_reads = {}
402 self._match_codes = {}
405 tag = element.tag.lower()
406 if tag == SAMPLE_NAME.lower():
407 self._sample_name = element.text
408 elif tag == LANE_ID.lower():
409 self.lane_id = int(element.text)
410 elif tag == END.lower():
411 self.end = int(element.text)
412 elif tag == GENOME_MAP.lower():
413 for child in element:
414 name = child.attrib['name']
415 value = child.attrib['value']
416 self.genome_map[name] = value
417 elif tag == MAPPED_READS.lower():
418 for child in element:
419 name = child.attrib['name']
420 value = child.attrib['value']
421 self._mapped_reads[name] = int(value)
422 elif tag == MATCH_CODES.lower():
423 for child in element:
424 name = child.attrib['name']
425 value = int(child.attrib['value'])
426 self._match_codes[name] = value
427 elif tag == READS.lower():
428 self._reads = int(element.text)
430 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
432 class SequenceLane(ResultLane):
434 LANE = 'SequenceLane'
435 SEQUENCE_TYPE = 'SequenceType'
440 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
442 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
443 self.sequence_type = None
444 super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
446 def _guess_sequence_type(self, pathname):
448 Determine if we have a scarf or fastq sequence file
450 f = open(pathname,'r')
455 # fastq starts with a @
456 self.sequence_type = SequenceLane.FASTQ_TYPE
458 self.sequence_type = SequenceLane.SCARF_TYPE
459 return self.sequence_type
463 Actually read the file and actually count the reads
465 # can't do anything if we don't have a file to process
466 if self.pathname is None:
469 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
470 raise RuntimeError("Sequencing isn't done, try again later.")
472 self._guess_sequence_type(self.pathname)
474 logging.info("summarizing results for %s" % (self.pathname))
476 f = open(self.pathname)
477 for l in f.xreadlines():
481 if self.sequence_type == SequenceLane.SCARF_TYPE:
483 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
484 self._reads = lines / 4
486 raise NotImplementedError("This only supports scarf or fastq squence files")
488 def get_elements(self):
489 lane = ElementTree.Element(SequenceLane.LANE,
491 unicode(SequenceLane.XML_VERSION)})
492 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
493 sample_tag.text = self.sample_name
494 lane_tag = ElementTree.SubElement(lane, LANE_ID)
495 lane_tag.text = str(self.lane_id)
496 if self.end is not None:
497 end_tag = ElementTree.SubElement(lane, END)
498 end_tag.text = str(self.end)
499 reads = ElementTree.SubElement(lane, READS)
500 reads.text = unicode(self.reads)
501 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
502 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
506 def set_elements(self, tree):
507 if tree.tag != SequenceLane.LANE:
508 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
509 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
512 tag = element.tag.lower()
513 if tag == SAMPLE_NAME.lower():
514 self._sample_name = element.text
515 elif tag == LANE_ID.lower():
516 self.lane_id = int(element.text)
517 elif tag == END.lower():
518 self.end = int(element.text)
519 elif tag == READS.lower():
520 self._reads = int(element.text)
521 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
522 self.sequence_type = lookup_sequence_type.get(element.text, None)
524 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
528 Summarize information from eland files
532 ELAND = 'ElandCollection'
537 def __init__(self, xml=None):
538 # we need information from the gerald config.xml
539 self.results = [{},{}]
542 self.set_elements(xml)
544 if len(self.results[0]) == 0:
545 # Initialize our eland object with meaningless junk
547 self.results[0][l] = ResultLane(lane_id=l, end=0)
550 def get_elements(self):
551 root = ElementTree.Element(ELAND.ELAND,
552 {'version': unicode(ELAND.XML_VERSION)})
553 for end in range(len(self.results)):
554 end_results = self.results[end]
555 for lane_id, lane in end_results.items():
556 eland_lane = lane.get_elements()
557 if eland_lane is not None:
558 eland_lane.attrib[ELAND.END] = unicode (end)
559 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
560 root.append(eland_lane)
563 def set_elements(self, tree):
564 if tree.tag.lower() != ELAND.ELAND.lower():
565 raise ValueError('Expecting %s', ELAND.ELAND)
566 for element in list(tree):
567 lane_id = int(element.attrib[ELAND.LANE_ID])
568 end = int(element.attrib.get(ELAND.END, 0))
569 if element.tag.lower() == ElandLane.LANE.lower():
570 lane = ElandLane(xml=element)
571 elif element.tag.lower() == SequenceLane.LANE.lower():
572 lane = SequenceLane(xml=element)
574 self.results[end][lane_id] = lane
576 def check_for_eland_file(basedir, pattern, lane_id, end):
578 full_lane_id = lane_id
580 full_lane_id = "%d_%d" % ( lane_id, end )
582 basename = pattern % (full_lane_id,)
583 logging.debug("Eland pattern: %s" %(basename,))
584 pathname = os.path.join(basedir, basename)
585 if os.path.exists(pathname):
586 logging.info('found eland file in %s' % (pathname,))
591 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
592 # yes the lane_id is also being computed in ElandLane._update
593 # I didn't want to clutter up my constructor
594 # but I needed to persist the sample_name/lane_id for
595 # runfolder summary_report
596 path, name = os.path.split(pathname)
597 logging.info("Adding eland file %s" %(name,))
598 # split_name = name.split('_')
599 # lane_id = int(split_name[1])
602 if genome_maps is not None:
603 genome_map = genome_maps[lane_id]
604 elif gerald is not None:
605 genome_dir = gerald.lanes[lane_id].eland_genome
606 if genome_dir is not None:
607 genome_map = build_genome_fasta_map(genome_dir)
609 lane = ElandLane(pathname, lane_id, end, genome_map)
614 effective_end = end - 1
616 results[effective_end][lane_id] = lane
618 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
619 result = SequenceLane(pathname, lane_id, end)
624 effective_end = end - 1
626 results[effective_end][lane_id] = result
629 def eland(gerald_dir, gerald=None, genome_maps=None):
632 lane_ids = range(1,9)
635 basedirs = [gerald_dir]
637 # if there is a basedir/Temp change basedir to point to the temp
638 # directory, as 1.1rc1 moves most of the files we've historically
639 # cared about to that subdirectory.
640 # we should look into what the official 'result' files are.
641 # and 1.3 moves them back
642 basedir_temp = os.path.join(gerald_dir, 'Temp')
643 if os.path.isdir(basedir_temp):
644 basedirs.append(basedir_temp)
647 # the order in patterns determines the preference for what
651 patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
652 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
653 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
654 ('s_%s_eland_extended.txt', MAPPED_ELAND),
655 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
656 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
657 ('s_%s_eland_multi.txt', MAPPED_ELAND),
658 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
659 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
660 ('s_%s_export.txt', MAPPED_ELAND),
661 ('s_%s_export.txt.bz2', MAPPED_ELAND),
662 ('s_%s_export.txt.gz', MAPPED_ELAND),
663 ('s_%s_sequence.txt', SEQUENCE),]
665 for basedir in basedirs:
667 for lane_id in lane_ids:
669 pathname = check_for_eland_file(basedir, p[0], lane_id, end)
670 if pathname is not None:
671 if p[1] == MAPPED_ELAND:
672 update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
673 elif p[1] == SEQUENCE:
674 update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
677 logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
682 def build_genome_fasta_map(genome_dir):
683 # build fasta to fasta file map
684 logging.info("Building genome map")
685 genome = genome_dir.split(os.path.sep)[-1]
687 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
689 if os.path.islink(vld_file):
691 vld_file = os.path.realpath(vld_file)
692 path, vld_name = os.path.split(vld_file)
693 name, ext = os.path.splitext(vld_name)
695 fasta_map[name] = name
697 fasta_map[name] = os.path.join(genome, name)
701 def extract_eland_sequence(instream, outstream, start, end):
703 Extract a chunk of sequence out of an eland file
705 for line in instream:
706 record = line.split()
708 result = [record[0], record[1][start:end]]
710 result = [record[0][start:end]]
711 outstream.write("\t".join(result))
712 outstream.write(os.linesep)
715 def main(cmdline=None):
716 """Run eland extraction against the specified gerald directory"""
717 from optparse import OptionParser
718 parser = OptionParser("%prog: <gerald dir>+")
719 opts, args = parser.parse_args(cmdline)
720 logging.basicConfig(level=logging.DEBUG)
722 logging.info("Starting scan of %s" % (a,))
724 print e.get_elements()
729 if __name__ == "__main__":