2 Provide access to information stored in the GERALD directory.
4 from datetime import datetime, date
12 from htsworkflow.pipelines.runfolder import \
17 from htsworkflow.util.ethelp import indent, flatten
18 from htsworkflow.util.opener import autoopen
22 Capture meaning out of the GERALD directory
26 RUN_PARAMETERS='RunParameters'
29 class LaneParameters(object):
31 Make it easy to access elements of LaneSpecificRunParameters from python
33 def __init__(self, gerald, key):
37 def __get_attribute(self, xml_tag):
38 subtree = self._gerald.tree.find('LaneSpecificRunParameters')
39 container = subtree.find(xml_tag)
42 if len(container.getchildren()) > LANES_PER_FLOWCELL:
43 raise RuntimeError('GERALD config.xml file changed')
44 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
45 index = lanes.index(self._key)
46 element = container[index]
48 def _get_analysis(self):
49 return self.__get_attribute('ANALYSIS')
50 analysis = property(_get_analysis)
52 def _get_eland_genome(self):
53 genome = self.__get_attribute('ELAND_GENOME')
54 # default to the chipwide parameters if there isn't an
55 # entry in the lane specific paramaters
57 subtree = self._gerald.tree.find('ChipWideRunParameters')
58 container = subtree.find('ELAND_GENOME')
59 genome = container.text
61 eland_genome = property(_get_eland_genome)
63 def _get_read_length(self):
64 return self.__get_attribute('READ_LENGTH')
65 read_length = property(_get_read_length)
67 def _get_use_bases(self):
68 return self.__get_attribute('USE_BASES')
69 use_bases = property(_get_use_bases)
71 class LaneSpecificRunParameters(object):
73 Provide access to LaneSpecificRunParameters
75 def __init__(self, gerald):
78 def __getitem__(self, key):
79 return Gerald.LaneParameters(self._gerald, key)
81 if self._keys is None:
82 tree = self._gerald.tree
83 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
84 # according to the pipeline specs I think their fields
85 # are sampleName_laneID, with sampleName defaulting to s
86 # since laneIDs are constant lets just try using
88 self._keys = [ x.tag.split('_')[1] for x in analysis]
91 return [ self[x] for x in self.keys() ]
93 return zip(self.keys(), self.values())
95 return len(self.keys())
97 def __init__(self, xml=None):
101 # parse lane parameters out of the config.xml file
102 self.lanes = Gerald.LaneSpecificRunParameters(self)
105 self.eland_results = None
108 self.set_elements(xml)
111 if self.tree is None:
112 return datetime.today()
113 timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
114 epochstamp = time.mktime(time.strptime(timestamp, '%c'))
115 return datetime.fromtimestamp(epochstamp)
116 date = property(_get_date)
119 return time.mktime(self.date.timetuple())
120 time = property(_get_time, doc='return run time as seconds since epoch')
122 def _get_version(self):
123 if self.tree is None:
125 return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
126 version = property(_get_version)
130 Debugging function, report current object
132 print 'Gerald version:', self.version
133 print 'Gerald run date:', self.date
134 print 'Gerald config.xml:', self.tree
137 def get_elements(self):
138 if self.tree is None or self.summary is None:
141 gerald = ElementTree.Element(Gerald.GERALD,
142 {'version': unicode(Gerald.XML_VERSION)})
143 gerald.append(self.tree)
144 gerald.append(self.summary.get_elements())
145 if self.eland_results:
146 gerald.append(self.eland_results.get_elements())
149 def set_elements(self, tree):
150 if tree.tag != Gerald.GERALD:
151 raise ValueError('exptected GERALD')
152 xml_version = int(tree.attrib.get('version', 0))
153 if xml_version > Gerald.XML_VERSION:
154 logging.warn('XML tree is a higher version than this class')
155 for element in list(tree):
156 tag = element.tag.lower()
157 if tag == Gerald.RUN_PARAMETERS.lower():
159 elif tag == Gerald.SUMMARY.lower():
160 self.summary = Summary(xml=element)
161 elif tag == ELAND.ELAND.lower():
162 self.eland_results = ELAND(xml=element)
164 logging.warn("Unrecognized tag %s" % (element.tag,))
167 def gerald(pathname):
169 g.pathname = pathname
170 path, name = os.path.split(pathname)
171 logging.info("Parsing gerald config.xml")
172 config_pathname = os.path.join(pathname, 'config.xml')
173 g.tree = ElementTree.parse(config_pathname).getroot()
175 # parse Summary.htm file
176 logging.info("Parsing Summary.htm")
177 summary_pathname = os.path.join(pathname, 'Summary.htm')
178 g.summary = Summary(summary_pathname)
180 g.eland_results = eland(g.pathname, g)
185 Convert a value to int if its an int otherwise a float.
189 except ValueError, e:
193 def parse_mean_range(value):
195 Parse values like 123 +/- 4.5
197 if value.strip() == 'unknown':
200 average, pm, deviation = value.split()
202 raise RuntimeError("Summary.htm file format changed")
203 return tonumber(average), tonumber(deviation)
205 def make_mean_range_element(parent, name, mean, deviation):
207 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
209 element = ElementTree.SubElement(parent, name,
210 { 'mean': unicode(mean),
211 'deviation': unicode(deviation)})
214 def parse_mean_range_element(element):
216 Grab mean/deviation out of element
218 return (tonumber(element.attrib['mean']),
219 tonumber(element.attrib['deviation']))
221 def parse_summary_element(element):
223 Determine if we have a simple element or a mean/deviation element
225 if len(element.attrib) > 0:
226 return parse_mean_range_element(element)
230 class Summary(object):
232 Extract some useful information from the Summary.htm file
237 class LaneResultSummary(object):
239 Parse the LaneResultSummary table out of Summary.htm
240 Mostly for the cluster number
242 LANE_RESULT_SUMMARY = 'LaneResultSummary'
244 'LaneYield': 'lane_yield',
245 'Cluster': 'cluster', # Raw
246 'ClusterPF': 'cluster_pass_filter',
247 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
248 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
249 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
250 'PercentPassFilterAlign': 'percent_pass_filter_align',
251 'AverageAlignmentScore': 'average_alignment_score',
252 'PercentErrorRate': 'percent_error_rate'
255 def __init__(self, html=None, xml=None):
257 self.lane_yield = None
259 self.cluster_pass_filter = None
260 self.average_first_cycle_intensity = None
261 self.percent_intensity_after_20_cycles = None
262 self.percent_pass_filter_clusters = None
263 self.percent_pass_filter_align = None
264 self.average_alignment_score = None
265 self.percent_error_rate = None
268 self.set_elements_from_html(html)
270 self.set_elements(xml)
272 def set_elements_from_html(self, data):
273 if not len(data) in (8,10):
274 raise RuntimeError("Summary.htm file format changed")
276 # same in pre-0.3.0 Summary file and 0.3 summary file
280 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
281 # this is the < 0.3 Pipeline version
282 self.cluster = parsed_data[0]
283 self.average_first_cycle_intensity = parsed_data[1]
284 self.percent_intensity_after_20_cycles = parsed_data[2]
285 self.percent_pass_filter_clusters = parsed_data[3]
286 self.percent_pass_filter_align = parsed_data[4]
287 self.average_alignment_score = parsed_data[5]
288 self.percent_error_rate = parsed_data[6]
289 elif len(data) == 10:
290 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
291 # this is the >= 0.3 summary file
292 self.lane_yield = data[1]
293 self.cluster = parsed_data[0]
294 self.cluster_pass_filter = parsed_data[1]
295 self.average_first_cycle_intensity = parsed_data[2]
296 self.percent_intensity_after_20_cycles = parsed_data[3]
297 self.percent_pass_filter_clusters = parsed_data[4]
298 self.percent_pass_filter_align = parsed_data[5]
299 self.average_alignment_score = parsed_data[6]
300 self.percent_error_rate = parsed_data[7]
302 def get_elements(self):
303 lane_result = ElementTree.Element(
304 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
306 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
307 value = getattr(self, variable_name)
310 # it looks like a sequence
311 elif type(value) in (types.TupleType, types.ListType):
312 element = make_mean_range_element(
318 element = ElementTree.SubElement(lane_result, tag)
322 def set_elements(self, tree):
323 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
324 raise ValueError('Expected %s' % (
325 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
326 self.lane = tree.attrib['lane']
327 tags = Summary.LaneResultSummary.TAGS
328 for element in list(tree):
330 variable_name = tags[element.tag]
331 setattr(self, variable_name,
332 parse_summary_element(element))
334 logging.warn('Unrecognized tag %s' % (element.tag,))
336 def __init__(self, filename=None, xml=None):
337 self.lane_results = {}
339 if filename is not None:
340 self._extract_lane_results(filename)
342 self.set_elements(xml)
344 def __getitem__(self, key):
345 return self.lane_results[key]
348 return len(self.lane_results)
351 return self.lane_results.keys()
354 return self.lane_results.values()
357 return self.lane_results.items()
359 def _flattened_row(self, row):
361 flatten the children of a <tr>...</tr>
363 return [flatten(x) for x in row.getchildren() ]
365 def _parse_table(self, table):
367 assumes the first line is the header of a table,
368 and that the remaining rows are data
370 rows = table.getchildren()
373 data.append(self._flattened_row(r))
376 def _extract_named_tables(self, pathname):
378 extract all the 'named' tables from a Summary.htm file
379 and return as a dictionary
381 Named tables are <h2>...</h2><table>...</table> pairs
382 The contents of the h2 tag is considered to the name
385 tree = ElementTree.parse(pathname).getroot()
386 body = tree.find('body')
388 for i in range(len(body)):
389 if body[i].tag == 'h2' and body[i+1].tag == 'table':
390 # we have an interesting table
391 name = flatten(body[i])
393 data = self._parse_table(table)
397 def _extract_lane_results(self, pathname):
399 extract the Lane Results Summary table
402 tables = self._extract_named_tables(pathname)
404 # parse lane result summary
405 lane_summary = tables['Lane Results Summary']
406 # this is version 1 of the summary file
407 if len(lane_summary[-1]) == 8:
409 headers = lane_summary[0]
410 # grab the lane by lane data
411 lane_summary = lane_summary[1:]
413 # this is version 2 of the summary file
414 if len(lane_summary[-1]) == 10:
415 # lane_summary[0] is a different less specific header row
416 headers = lane_summary[1]
417 lane_summary = lane_summary[2:10]
418 # after the last lane, there's a set of chip wide averages
420 for r in lane_summary:
421 lrs = Summary.LaneResultSummary(html=r)
422 self.lane_results[lrs.lane] = lrs
424 def get_elements(self):
425 summary = ElementTree.Element(Summary.SUMMARY,
426 {'version': unicode(Summary.XML_VERSION)})
427 for lane in self.lane_results.values():
428 summary.append(lane.get_elements())
431 def set_elements(self, tree):
432 if tree.tag != Summary.SUMMARY:
433 return ValueError("Expected %s" % (Summary.SUMMARY,))
434 xml_version = int(tree.attrib.get('version', 0))
435 if xml_version > Summary.XML_VERSION:
436 logging.warn('Summary XML tree is a higher version than this class')
437 for element in list(tree):
438 lrs = Summary.LaneResultSummary()
439 lrs.set_elements(element)
440 self.lane_results[lrs.lane] = lrs
444 Debugging function, report current object
449 def build_genome_fasta_map(genome_dir):
450 # build fasta to fasta file map
451 logging.info("Building genome map")
452 genome = genome_dir.split(os.path.sep)[-1]
454 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
456 if os.path.islink(vld_file):
458 vld_file = os.path.realpath(vld_file)
459 path, vld_name = os.path.split(vld_file)
460 name, ext = os.path.splitext(vld_name)
462 fasta_map[name] = name
464 fasta_map[name] = os.path.join(genome, name)
467 class ElandLane(object):
469 Process an eland result file
473 SAMPLE_NAME = 'SampleName'
475 GENOME_MAP = 'GenomeMap'
476 GENOME_ITEM = 'GenomeItem'
477 MAPPED_READS = 'MappedReads'
478 MAPPED_ITEM = 'MappedItem'
479 MATCH_CODES = 'MatchCodes'
483 def __init__(self, pathname=None, genome_map=None, xml=None):
484 self.pathname = pathname
485 self._sample_name = None
488 self._mapped_reads = None
489 self._match_codes = None
490 if genome_map is None:
492 self.genome_map = genome_map
495 self.set_elements(xml)
499 Actually read the file and actually count the reads
501 # can't do anything if we don't have a file to process
502 if self.pathname is None:
505 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
506 raise RuntimeError("Eland isn't done, try again later.")
508 logging.info("summarizing results for %s" % (self.pathname))
512 match_codes = {'NM':0, 'QC':0, 'RM':0,
513 'U0':0, 'U1':0, 'U2':0,
514 'R0':0, 'R1':0, 'R2':0,
516 for line in autoopen(self.pathname,'r'):
518 fields = line.split()
520 # match_codes[code] = match_codes.setdefault(code, 0) + 1
521 # the QC/NM etc codes are in the 3rd field and always present
522 match_codes[fields[2]] += 1
523 # ignore lines that don't have a fasta filename
526 fasta = self.genome_map.get(fields[6], fields[6])
527 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
528 self._match_codes = match_codes
529 self._mapped_reads = mapped_reads
532 def _update_name(self):
533 # extract the sample name
534 if self.pathname is None:
537 path, name = os.path.split(self.pathname)
538 split_name = name.split('_')
539 self._sample_name = split_name[0]
540 self._lane_id = split_name[1]
542 def _get_sample_name(self):
543 if self._sample_name is None:
545 return self._sample_name
546 sample_name = property(_get_sample_name)
548 def _get_lane_id(self):
549 if self._lane_id is None:
552 lane_id = property(_get_lane_id)
554 def _get_reads(self):
555 if self._reads is None:
558 reads = property(_get_reads)
560 def _get_mapped_reads(self):
561 if self._mapped_reads is None:
563 return self._mapped_reads
564 mapped_reads = property(_get_mapped_reads)
566 def _get_match_codes(self):
567 if self._match_codes is None:
569 return self._match_codes
570 match_codes = property(_get_match_codes)
572 def get_elements(self):
573 lane = ElementTree.Element(ElandLane.LANE,
575 unicode(ElandLane.XML_VERSION)})
576 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
577 sample_tag.text = self.sample_name
578 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
579 lane_tag.text = self.lane_id
580 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
581 for k, v in self.genome_map.items():
582 item = ElementTree.SubElement(
583 genome_map, ElandLane.GENOME_ITEM,
584 {'name':k, 'value':unicode(v)})
585 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
586 for k, v in self.mapped_reads.items():
587 item = ElementTree.SubElement(
588 mapped_reads, ElandLane.MAPPED_ITEM,
589 {'name':k, 'value':unicode(v)})
590 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
591 for k, v in self.match_codes.items():
592 item = ElementTree.SubElement(
593 match_codes, ElandLane.MATCH_ITEM,
594 {'name':k, 'value':unicode(v)})
595 reads = ElementTree.SubElement(lane, ElandLane.READS)
596 reads.text = unicode(self.reads)
600 def set_elements(self, tree):
601 if tree.tag != ElandLane.LANE:
602 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
605 self._mapped_reads = {}
606 self._match_codes = {}
609 tag = element.tag.lower()
610 if tag == ElandLane.SAMPLE_NAME.lower():
611 self._sample_name = element.text
612 elif tag == ElandLane.LANE_ID.lower():
613 self._lane_id = element.text
614 elif tag == ElandLane.GENOME_MAP.lower():
615 for child in element:
616 name = child.attrib['name']
617 value = child.attrib['value']
618 self.genome_map[name] = value
619 elif tag == ElandLane.MAPPED_READS.lower():
620 for child in element:
621 name = child.attrib['name']
622 value = child.attrib['value']
623 self._mapped_reads[name] = int(value)
624 elif tag == ElandLane.MATCH_CODES.lower():
625 for child in element:
626 name = child.attrib['name']
627 value = int(child.attrib['value'])
628 self._match_codes[name] = value
629 elif tag == ElandLane.READS.lower():
630 self._reads = int(element.text)
632 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
634 def extract_eland_sequence(instream, outstream, start, end):
636 Extract a chunk of sequence out of an eland file
638 for line in instream:
639 record = line.split()
641 result = [record[0], record[1][start:end]]
643 result = [record[0][start:end]]
644 outstream.write("\t".join(result))
645 outstream.write(os.linesep)
649 Summarize information from eland files
653 ELAND = 'ElandCollection'
657 def __init__(self, xml=None):
658 # we need information from the gerald config.xml
662 self.set_elements(xml)
665 return len(self.results)
668 return self.results.keys()
671 return self.results.values()
674 return self.results.items()
676 def __getitem__(self, key):
677 return self.results[key]
679 def get_elements(self):
680 root = ElementTree.Element(ELAND.ELAND,
681 {'version': unicode(ELAND.XML_VERSION)})
682 for lane_id, lane in self.results.items():
683 eland_lane = lane.get_elements()
684 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
685 root.append(eland_lane)
688 def set_elements(self, tree):
689 if tree.tag.lower() != ELAND.ELAND.lower():
690 raise ValueError('Expecting %s', ELAND.ELAND)
691 for element in list(tree):
692 lane_id = element.attrib[ELAND.LANE_ID]
693 lane = ElandLane(xml=element)
694 self.results[lane_id] = lane
696 def eland(basedir, gerald=None, genome_maps=None):
699 file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
700 if len(file_list) == 0:
701 # lets handle compressed eland files too
702 file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
704 for pathname in file_list:
705 # yes the lane_id is also being computed in ElandLane._update
706 # I didn't want to clutter up my constructor
707 # but I needed to persist the sample_name/lane_id for
708 # runfolder summary_report
709 path, name = os.path.split(pathname)
710 logging.info("Adding eland file %s" %(name,))
711 split_name = name.split('_')
712 lane_id = split_name[1]
714 if genome_maps is not None:
715 genome_map = genome_maps[lane_id]
716 elif gerald is not None:
717 genome_dir = gerald.lanes[lane_id].eland_genome
718 genome_map = build_genome_fasta_map(genome_dir)
722 eland_result = ElandLane(pathname, genome_map)
723 e.results[lane_id] = eland_result