12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
16 LOGGER = logging.getLogger(__name__)
18 SAMPLE_NAME = 'SampleName'
23 GENOME_MAP = 'GenomeMap'
24 GENOME_ITEM = 'GenomeItem'
25 MAPPED_READS = 'MappedReads'
26 MAPPED_ITEM = 'MappedItem'
27 MATCH_CODES = 'MatchCodes'
36 class ResultLane(object):
38 Base class for result lanes
43 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
44 self.pathname = pathname
45 self._sample_name = None
46 self.lane_id = lane_id
51 self.set_elements(xml)
55 Actually read the file and actually count the reads
59 def _update_name(self):
60 # extract the sample name
61 if self.pathname is None:
64 path, name = os.path.split(self.pathname)
65 split_name = name.split('_')
66 self._sample_name = split_name[0]
68 def _get_sample_name(self):
69 if self._sample_name is None:
71 return self._sample_name
72 sample_name = property(_get_sample_name)
75 if self._reads is None:
78 reads = property(_get_reads)
80 def get_elements(self):
83 class ElandLane(ResultLane):
85 Process an eland result file
89 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92 SCORE_UNRECOGNIZED = 0
96 def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
97 super(ElandLane, self).__init__(pathname, lane_id, end)
99 self._mapped_reads = None
100 self._match_codes = None
101 if genome_map is None:
103 self.genome_map = genome_map
104 self.eland_type = None
107 self.set_elements(xml)
109 def _guess_eland_type(self, pathname):
110 if self.eland_type is None:
111 # attempt autodetect eland file type
112 pathn, name = os.path.split(pathname)
113 if re.search('result', name):
114 self.eland_type = ELAND_SINGLE
115 elif re.search('multi', name):
116 self.eland_type = ELAND_MULTI
117 elif re.search('extended', name):
118 self.eland_type = ELAND_EXTENDED
119 elif re.search('export', name):
120 self.eland_type = ELAND_EXPORT
122 self.eland_type = ELAND_SINGLE
126 Actually read the file and actually count the reads
128 # can't do anything if we don't have a file to process
129 if self.pathname is None:
131 self._guess_eland_type(self.pathname)
133 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
134 raise RuntimeError("Eland isn't done, try again later.")
136 LOGGER.info("summarizing results for %s" % (self.pathname))
138 stream = autoopen(self.pathname, 'r')
139 if self.eland_type == ELAND_SINGLE:
140 result = self._update_eland_result(stream)
141 elif self.eland_type == ELAND_MULTI or \
142 self.eland_type == ELAND_EXTENDED:
143 result = self._update_eland_multi(stream)
144 elif self.eland_type == ELAND_EXPORT:
145 result = self._update_eland_export(stream)
147 raise NotImplementedError("Only support single/multi/extended eland files")
148 self._match_codes, self._mapped_reads, self._reads = result
150 def _update_eland_result(self, instream):
154 match_codes = {'NM':0, 'QC':0, 'RM':0,
155 'U0':0, 'U1':0, 'U2':0,
156 'R0':0, 'R1':0, 'R2':0,
158 for line in instream:
160 fields = line.split()
162 # match_codes[code] = match_codes.setdefault(code, 0) + 1
163 # the QC/NM etc codes are in the 3rd field and always present
164 match_codes[fields[2]] += 1
165 # ignore lines that don't have a fasta filename
168 fasta = self.genome_map.get(fields[6], fields[6])
169 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
170 return match_codes, mapped_reads, reads
172 def _update_eland_multi(self, instream):
173 """Summarize an eland_extend."""
179 match_codes = {'NM':0, 'QC':0, 'RM':0,
180 'U0':0, 'U1':0, 'U2':0,
181 'R0':0, 'R1':0, 'R2':0,
183 for line in instream:
185 fields = line.split()
186 # fields[2] = QC/NM/or number of matches
187 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
189 if score_type == ElandLane.SCORE_READ:
190 # when there are too many hits, eland writes a - where
191 # it would have put the list of hits.
192 # or in a different version of eland, it just leaves
193 # that column blank, and only outputs 3 fields.
194 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
197 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
199 return match_codes, mapped_reads, reads
201 def _update_eland_export(self, instream):
202 """Summarize a gerald export file."""
209 match_codes = {'NM':0, 'QC':0, 'RM':0,
210 'U0':0, 'U1':0, 'U2':0,
211 'R0':0, 'R1':0, 'R2':0,
214 for line in instream:
216 fields = line.split()
217 # fields[2] = QC/NM/or number of matches
218 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
220 if score_type == ElandLane.SCORE_UNRECOGNIZED:
221 # export files have three states for the match field
222 # QC code, count of multi-reads, or a single
223 # read location. The score_mapped_mismatches function
224 # only understands the first two types.
225 # if we get unrecognized, that implies the field is probably
227 code = self._count_mapped_export(mapped_reads,
228 fields[LOCATION_INDEX],
229 fields[DESCRIPTOR_INDEX])
230 match_codes[code] += 1
232 return match_codes, mapped_reads, reads
235 def _score_mapped_mismatches(self, match, match_codes):
236 """Update match_codes with eland map counts, or failure code.
238 Returns True if the read mapped, false if it was an error code.
240 groups = ElandLane.MATCH_COUNTS_RE.match(match)
242 # match is not of the form [\d]+:[\d]+:[\d]+
243 if match_codes.has_key(match):
244 # match is one quality control codes QC/NM etc
245 match_codes[match] += 1
246 return ElandLane.SCORE_QC
248 return ElandLane.SCORE_UNRECOGNIZED
250 # match is of the form [\d]+:[\d]+:[\d]+
252 zero_mismatches = int(groups.group(1))
253 one_mismatches = int(groups.group(2))
254 two_mismatches = int(groups.group(3))
256 if zero_mismatches == 1:
257 match_codes['U0'] += 1
258 elif zero_mismatches < 255:
259 match_codes['R0'] += zero_mismatches
261 if one_mismatches == 1:
262 match_codes['U1'] += 1
263 elif one_mismatches < 255:
264 match_codes['R1'] += one_mismatches
266 if two_mismatches == 1:
267 match_codes['U2'] += 1
268 elif two_mismatches < 255:
269 match_codes['R2'] += two_mismatches
271 return ElandLane.SCORE_READ
274 def _count_mapped_multireads(self, mapped_reads, match_string):
276 for match in match_string.split(','):
277 match_fragment = match.split(':')
278 if len(match_fragment) == 2:
279 chromo = match_fragment[0]
280 pos = match_fragment[1]
282 fasta = self.genome_map.get(chromo, chromo)
283 assert fasta is not None
284 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
287 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
288 """Count a read as defined in an export file
290 match_string contains the chromosome
291 descriptor contains the an ecoding of bases that match, mismatch,
293 returns the "best" match code
295 Currently "best" match code is ignoring the possibility of in-dels
297 chromo = match_string
298 fasta = self.genome_map.get(chromo, chromo)
299 assert fasta is not None
300 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
302 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
303 if len(mismatch_bases) == 0:
305 elif len(mismatch_bases) == 1:
311 def _get_mapped_reads(self):
312 if self._mapped_reads is None:
314 return self._mapped_reads
315 mapped_reads = property(_get_mapped_reads)
317 def _get_match_codes(self):
318 if self._match_codes is None:
320 return self._match_codes
321 match_codes = property(_get_match_codes)
323 def _get_no_match(self):
324 if self._mapped_reads is None:
326 return self._match_codes['NM']
327 no_match = property(_get_no_match,
328 doc="total reads that didn't match the target genome.")
330 def _get_no_match_percent(self):
331 return float(self.no_match)/self.reads * 100
332 no_match_percent = property(_get_no_match_percent,
333 doc="no match reads as percent of total")
335 def _get_qc_failed(self):
336 if self._mapped_reads is None:
338 return self._match_codes['QC']
339 qc_failed = property(_get_qc_failed,
340 doc="total reads that didn't match the target genome.")
342 def _get_qc_failed_percent(self):
343 return float(self.qc_failed)/self.reads * 100
344 qc_failed_percent = property(_get_qc_failed_percent,
345 doc="QC failed reads as percent of total")
347 def _get_unique_reads(self):
348 if self._mapped_reads is None:
351 for code in ['U0','U1','U2']:
352 sum += self._match_codes[code]
354 unique_reads = property(_get_unique_reads,
355 doc="total unique reads")
357 def _get_repeat_reads(self):
358 if self._mapped_reads is None:
361 for code in ['R0','R1','R2']:
362 sum += self._match_codes[code]
364 repeat_reads = property(_get_repeat_reads,
365 doc="total repeat reads")
367 def get_elements(self):
368 lane = ElementTree.Element(ElandLane.LANE,
370 unicode(ElandLane.XML_VERSION)})
371 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
372 sample_tag.text = self.sample_name
373 lane_tag = ElementTree.SubElement(lane, LANE_ID)
374 lane_tag.text = str(self.lane_id)
375 if self.end is not None:
376 end_tag = ElementTree.SubElement(lane, END)
377 end_tag.text = str(self.end)
378 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
379 for k, v in self.genome_map.items():
380 item = ElementTree.SubElement(
381 genome_map, GENOME_ITEM,
382 {'name':k, 'value':unicode(v)})
383 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
384 for k, v in self.mapped_reads.items():
385 item = ElementTree.SubElement(
386 mapped_reads, MAPPED_ITEM,
387 {'name':k, 'value':unicode(v)})
388 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
389 for k, v in self.match_codes.items():
390 item = ElementTree.SubElement(
391 match_codes, MATCH_ITEM,
392 {'name':k, 'value':unicode(v)})
393 reads = ElementTree.SubElement(lane, READS)
394 reads.text = unicode(self.reads)
398 def set_elements(self, tree):
399 if tree.tag != ElandLane.LANE:
400 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
403 self._mapped_reads = {}
404 self._match_codes = {}
407 tag = element.tag.lower()
408 if tag == SAMPLE_NAME.lower():
409 self._sample_name = element.text
410 elif tag == LANE_ID.lower():
411 self.lane_id = int(element.text)
412 elif tag == END.lower():
413 self.end = int(element.text)
414 elif tag == GENOME_MAP.lower():
415 for child in element:
416 name = child.attrib['name']
417 value = child.attrib['value']
418 self.genome_map[name] = value
419 elif tag == MAPPED_READS.lower():
420 for child in element:
421 name = child.attrib['name']
422 value = child.attrib['value']
423 self._mapped_reads[name] = int(value)
424 elif tag == MATCH_CODES.lower():
425 for child in element:
426 name = child.attrib['name']
427 value = int(child.attrib['value'])
428 self._match_codes[name] = value
429 elif tag == READS.lower():
430 self._reads = int(element.text)
432 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
434 class SequenceLane(ResultLane):
436 LANE = 'SequenceLane'
437 SEQUENCE_TYPE = 'SequenceType'
442 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
444 def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
445 self.sequence_type = None
446 super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
448 def _guess_sequence_type(self, pathname):
450 Determine if we have a scarf or fastq sequence file
452 f = open(pathname,'r')
457 # fastq starts with a @
458 self.sequence_type = SequenceLane.FASTQ_TYPE
460 self.sequence_type = SequenceLane.SCARF_TYPE
461 return self.sequence_type
465 Actually read the file and actually count the reads
467 # can't do anything if we don't have a file to process
468 if self.pathname is None:
471 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
472 raise RuntimeError("Sequencing isn't done, try again later.")
474 self._guess_sequence_type(self.pathname)
476 LOGGER.info("summarizing results for %s" % (self.pathname))
478 f = open(self.pathname)
479 for l in f.xreadlines():
483 if self.sequence_type == SequenceLane.SCARF_TYPE:
485 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
486 self._reads = lines / 4
488 raise NotImplementedError("This only supports scarf or fastq squence files")
490 def get_elements(self):
491 lane = ElementTree.Element(SequenceLane.LANE,
493 unicode(SequenceLane.XML_VERSION)})
494 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
495 sample_tag.text = self.sample_name
496 lane_tag = ElementTree.SubElement(lane, LANE_ID)
497 lane_tag.text = str(self.lane_id)
498 if self.end is not None:
499 end_tag = ElementTree.SubElement(lane, END)
500 end_tag.text = str(self.end)
501 reads = ElementTree.SubElement(lane, READS)
502 reads.text = unicode(self.reads)
503 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
504 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
508 def set_elements(self, tree):
509 if tree.tag != SequenceLane.LANE:
510 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
511 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
514 tag = element.tag.lower()
515 if tag == SAMPLE_NAME.lower():
516 self._sample_name = element.text
517 elif tag == LANE_ID.lower():
518 self.lane_id = int(element.text)
519 elif tag == END.lower():
520 self.end = int(element.text)
521 elif tag == READS.lower():
522 self._reads = int(element.text)
523 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
524 self.sequence_type = lookup_sequence_type.get(element.text, None)
526 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
530 Summarize information from eland files
534 ELAND = 'ElandCollection'
539 def __init__(self, xml=None):
540 # we need information from the gerald config.xml
541 self.results = [{},{}]
544 self.set_elements(xml)
546 if len(self.results[0]) == 0:
547 # Initialize our eland object with meaningless junk
549 self.results[0][l] = ResultLane(lane_id=l, end=0)
552 def get_elements(self):
553 root = ElementTree.Element(ELAND.ELAND,
554 {'version': unicode(ELAND.XML_VERSION)})
555 for end in range(len(self.results)):
556 end_results = self.results[end]
557 for lane_id, lane in end_results.items():
558 eland_lane = lane.get_elements()
559 if eland_lane is not None:
560 eland_lane.attrib[ELAND.END] = unicode (end)
561 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
562 root.append(eland_lane)
565 def set_elements(self, tree):
566 if tree.tag.lower() != ELAND.ELAND.lower():
567 raise ValueError('Expecting %s', ELAND.ELAND)
568 for element in list(tree):
569 lane_id = int(element.attrib[ELAND.LANE_ID])
570 end = int(element.attrib.get(ELAND.END, 0))
571 if element.tag.lower() == ElandLane.LANE.lower():
572 lane = ElandLane(xml=element)
573 elif element.tag.lower() == SequenceLane.LANE.lower():
574 lane = SequenceLane(xml=element)
576 self.results[end][lane_id] = lane
578 def check_for_eland_file(basedir, pattern, lane_id, end):
580 full_lane_id = lane_id
582 full_lane_id = "%d_%d" % ( lane_id, end )
584 basename = pattern % (full_lane_id,)
585 LOGGER.info("Eland pattern: %s" %(basename,))
586 pathname = os.path.join(basedir, basename)
587 if os.path.exists(pathname):
588 LOGGER.info('found eland file in %s' % (pathname,))
593 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
594 # yes the lane_id is also being computed in ElandLane._update
595 # I didn't want to clutter up my constructor
596 # but I needed to persist the sample_name/lane_id for
597 # runfolder summary_report
598 path, name = os.path.split(pathname)
599 LOGGER.info("Adding eland file %s" %(name,))
600 # split_name = name.split('_')
601 # lane_id = int(split_name[1])
604 if genome_maps is not None:
605 genome_map = genome_maps[lane_id]
606 elif gerald is not None:
607 genome_dir = gerald.lanes[lane_id].eland_genome
608 if genome_dir is not None:
609 genome_map = build_genome_fasta_map(genome_dir)
611 lane = ElandLane(pathname, lane_id, end, genome_map)
616 effective_end = end - 1
618 results[effective_end][lane_id] = lane
620 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
621 result = SequenceLane(pathname, lane_id, end)
626 effective_end = end - 1
628 results[effective_end][lane_id] = result
631 def eland(gerald_dir, gerald=None, genome_maps=None):
634 lane_ids = range(1,9)
637 basedirs = [gerald_dir]
639 # if there is a basedir/Temp change basedir to point to the temp
640 # directory, as 1.1rc1 moves most of the files we've historically
641 # cared about to that subdirectory.
642 # we should look into what the official 'result' files are.
643 # and 1.3 moves them back
644 basedir_temp = os.path.join(gerald_dir, 'Temp')
645 if os.path.isdir(basedir_temp):
646 basedirs.append(basedir_temp)
649 # the order in patterns determines the preference for what
653 patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
654 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
655 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
656 ('s_%s_eland_extended.txt', MAPPED_ELAND),
657 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
658 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
659 ('s_%s_eland_multi.txt', MAPPED_ELAND),
660 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
661 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
662 ('s_%s_export.txt', MAPPED_ELAND),
663 ('s_%s_export.txt.bz2', MAPPED_ELAND),
664 ('s_%s_export.txt.gz', MAPPED_ELAND),
665 ('s_%s_sequence.txt', SEQUENCE),]
667 for basedir in basedirs:
669 for lane_id in lane_ids:
671 pathname = check_for_eland_file(basedir, p[0], lane_id, end)
672 if pathname is not None:
673 if p[1] == MAPPED_ELAND:
674 update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
675 elif p[1] == SEQUENCE:
676 update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
679 LOGGER.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
684 def build_genome_fasta_map(genome_dir):
685 # build fasta to fasta file map
686 LOGGER.info("Building genome map")
687 genome = genome_dir.split(os.path.sep)[-1]
689 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
691 if os.path.islink(vld_file):
693 vld_file = os.path.realpath(vld_file)
694 path, vld_name = os.path.split(vld_file)
695 name, ext = os.path.splitext(vld_name)
697 fasta_map[name] = name
699 fasta_map[name] = os.path.join(genome, name)
703 def extract_eland_sequence(instream, outstream, start, end):
705 Extract a chunk of sequence out of an eland file
707 for line in instream:
708 record = line.split()
710 result = [record[0], record[1][start:end]]
712 result = [record[0][start:end]]
713 outstream.write("\t".join(result))
714 outstream.write(os.linesep)
717 def main(cmdline=None):
718 """Run eland extraction against the specified gerald directory"""
719 from optparse import OptionParser
720 parser = OptionParser("%prog: <gerald dir>+")
721 opts, args = parser.parse_args(cmdline)
722 LOGGER.basicConfig(level=logging.DEBUG)
724 LOGGER.info("Starting scan of %s" % (a,))
726 print e.get_elements()
731 if __name__ == "__main__":