2 Provide access to information stored in the GERALD directory.
4 from datetime import datetime, date
11 from gaworkflow.pipeline.runfolder import \
16 from gaworkflow.util.ethelp import indent, flatten
17 from gaworkflow.util.opener import autoopen
21 Capture meaning out of the GERALD directory
25 RUN_PARAMETERS='RunParameters'
28 class LaneParameters(object):
30 Make it easy to access elements of LaneSpecificRunParameters from python
32 def __init__(self, gerald, key):
36 def __get_attribute(self, xml_tag):
37 subtree = self._gerald.tree.find('LaneSpecificRunParameters')
38 container = subtree.find(xml_tag)
41 if len(container.getchildren()) != LANES_PER_FLOWCELL:
42 raise RuntimeError('GERALD config.xml file changed')
43 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
44 index = lanes.index(self._key)
45 element = container[index]
47 def _get_analysis(self):
48 return self.__get_attribute('ANALYSIS')
49 analysis = property(_get_analysis)
51 def _get_eland_genome(self):
52 genome = self.__get_attribute('ELAND_GENOME')
53 # default to the chipwide parameters if there isn't an
54 # entry in the lane specific paramaters
56 subtree = self._gerald.tree.find('ChipWideRunParameters')
57 container = subtree.find('ELAND_GENOME')
58 genome = container.text
60 eland_genome = property(_get_eland_genome)
62 def _get_read_length(self):
63 return self.__get_attribute('READ_LENGTH')
64 read_length = property(_get_read_length)
66 def _get_use_bases(self):
67 return self.__get_attribute('USE_BASES')
68 use_bases = property(_get_use_bases)
70 class LaneSpecificRunParameters(object):
72 Provide access to LaneSpecificRunParameters
74 def __init__(self, gerald):
77 def __getitem__(self, key):
78 return Gerald.LaneParameters(self._gerald, key)
80 if self._keys is None:
81 tree = self._gerald.tree
82 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
83 # according to the pipeline specs I think their fields
84 # are sampleName_laneID, with sampleName defaulting to s
85 # since laneIDs are constant lets just try using
87 self._keys = [ x.tag.split('_')[1] for x in analysis]
90 return [ self[x] for x in self.keys() ]
92 return zip(self.keys(), self.values())
94 return len(self.keys())
96 def __init__(self, xml=None):
100 # parse lane parameters out of the config.xml file
101 self.lanes = Gerald.LaneSpecificRunParameters(self)
104 self.eland_results = None
107 self.set_elements(xml)
110 if self.tree is None:
111 return datetime.today()
112 timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
113 epochstamp = time.mktime(time.strptime(timestamp, '%c'))
114 return datetime.fromtimestamp(epochstamp)
115 date = property(_get_date)
118 return time.mktime(self.date.timetuple())
119 time = property(_get_time, doc='return run time as seconds since epoch')
121 def _get_version(self):
122 if self.tree is None:
124 return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
125 version = property(_get_version)
129 Debugging function, report current object
131 print 'Gerald version:', self.version
132 print 'Gerald run date:', self.date
133 print 'Gerald config.xml:', self.tree
136 def get_elements(self):
137 if self.tree is None or self.summary is None:
140 gerald = ElementTree.Element(Gerald.GERALD,
141 {'version': unicode(Gerald.XML_VERSION)})
142 gerald.append(self.tree)
143 gerald.append(self.summary.get_elements())
144 if self.eland_results:
145 gerald.append(self.eland_results.get_elements())
148 def set_elements(self, tree):
149 if tree.tag != Gerald.GERALD:
150 raise ValueError('exptected GERALD')
151 xml_version = int(tree.attrib.get('version', 0))
152 if xml_version > Gerald.XML_VERSION:
153 logging.warn('XML tree is a higher version than this class')
154 for element in list(tree):
155 tag = element.tag.lower()
156 if tag == Gerald.RUN_PARAMETERS.lower():
158 elif tag == Gerald.SUMMARY.lower():
159 self.summary = Summary(xml=element)
160 elif tag == ELAND.ELAND.lower():
161 self.eland_results = ELAND(xml=element)
163 logging.warn("Unrecognized tag %s" % (element.tag,))
166 def gerald(pathname):
168 g.pathname = pathname
169 path, name = os.path.split(pathname)
170 config_pathname = os.path.join(pathname, 'config.xml')
171 g.tree = ElementTree.parse(config_pathname).getroot()
173 # parse Summary.htm file
174 summary_pathname = os.path.join(pathname, 'Summary.htm')
175 g.summary = Summary(summary_pathname)
177 g.eland_results = eland(g.pathname, g)
182 Convert a value to int if its an int otherwise a float.
186 except ValueError, e:
190 def parse_mean_range(value):
192 Parse values like 123 +/- 4.5
194 if value.strip() == 'unknown':
197 average, pm, deviation = value.split()
199 raise RuntimeError("Summary.htm file format changed")
200 return tonumber(average), tonumber(deviation)
202 def make_mean_range_element(parent, name, mean, deviation):
204 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
206 element = ElementTree.SubElement(parent, name,
207 { 'mean': unicode(mean),
208 'deviation': unicode(deviation)})
211 def parse_mean_range_element(element):
213 Grab mean/deviation out of element
215 return (tonumber(element.attrib['mean']),
216 tonumber(element.attrib['deviation']))
218 class Summary(object):
220 Extract some useful information from the Summary.htm file
225 class LaneResultSummary(object):
227 Parse the LaneResultSummary table out of Summary.htm
228 Mostly for the cluster number
230 LANE_RESULT_SUMMARY = 'LaneResultSummary'
232 'Cluster': 'cluster',
233 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
234 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
235 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
236 'PercentPassFilterAlign': 'percent_pass_filter_align',
237 'AverageAlignmentScore': 'average_alignment_score',
238 'PercentErrorRate': 'percent_error_rate'
241 def __init__(self, html=None, xml=None):
244 self.average_first_cycle_intensity = None
245 self.percent_intensity_after_20_cycles = None
246 self.percent_pass_filter_clusters = None
247 self.percent_pass_filter_align = None
248 self.average_alignment_score = None
249 self.percent_error_rate = None
252 self.set_elements_from_html(html)
254 self.set_elements(xml)
256 def set_elements_from_html(self, data):
257 if not len(data) in (8,10):
258 raise RuntimeError("Summary.htm file format changed")
260 # same in pre-0.3.0 Summary file and 0.3 summary file
264 # this is the < 0.3 Pipeline version
265 self.cluster = parse_mean_range(data[1])
266 self.average_first_cycle_intensity = parse_mean_range(data[2])
267 self.percent_intensity_after_20_cycles = \
268 parse_mean_range(data[3])
269 self.percent_pass_filter_clusters = parse_mean_range(data[4])
270 self.percent_pass_filter_align = parse_mean_range(data[5])
271 self.average_alignment_score = parse_mean_range(data[6])
272 self.percent_error_rate = parse_mean_range(data[7])
273 elif len(data) == 10:
274 # this is the >= 0.3 summary file
275 self.cluster_raw = data[1]
276 self.cluster = parse_mean_range(data[2])
277 # FIXME: think of generic way to capture the variable data
280 def get_elements(self):
281 lane_result = ElementTree.Element(
282 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
284 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
285 element = make_mean_range_element(
288 *getattr(self, variable_name)
292 def set_elements(self, tree):
293 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
294 raise ValueError('Expected %s' % (
295 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
296 self.lane = tree.attrib['lane']
297 tags = Summary.LaneResultSummary.TAGS
298 for element in list(tree):
300 variable_name = tags[element.tag]
301 setattr(self, variable_name,
302 parse_mean_range_element(element))
304 logging.warn('Unrecognized tag %s' % (element.tag,))
306 def __init__(self, filename=None, xml=None):
307 self.lane_results = {}
309 if filename is not None:
310 self._extract_lane_results(filename)
312 self.set_elements(xml)
314 def __getitem__(self, key):
315 return self.lane_results[key]
318 return len(self.lane_results)
321 return self.lane_results.keys()
324 return self.lane_results.values()
327 return self.lane_results.items()
329 def _flattened_row(self, row):
331 flatten the children of a <tr>...</tr>
333 return [flatten(x) for x in row.getchildren() ]
335 def _parse_table(self, table):
337 assumes the first line is the header of a table,
338 and that the remaining rows are data
340 rows = table.getchildren()
343 data.append(self._flattened_row(r))
346 def _extract_named_tables(self, pathname):
348 extract all the 'named' tables from a Summary.htm file
349 and return as a dictionary
351 Named tables are <h2>...</h2><table>...</table> pairs
352 The contents of the h2 tag is considered to the name
355 tree = ElementTree.parse(pathname).getroot()
356 body = tree.find('body')
358 for i in range(len(body)):
359 if body[i].tag == 'h2' and body[i+1].tag == 'table':
360 # we have an interesting table
361 name = flatten(body[i])
363 data = self._parse_table(table)
367 def _extract_lane_results(self, pathname):
369 extract the Lane Results Summary table
372 tables = self._extract_named_tables(pathname)
374 # parse lane result summary
375 lane_summary = tables['Lane Results Summary']
376 # this is version 1 of the summary file
377 if len(lane_summary[-1]) == 8:
379 headers = lane_summary[0]
380 # grab the lane by lane data
381 lane_summary = lane_summary[1:]
383 # this is version 2 of the summary file
384 if len(lane_summary[-1]) == 10:
385 # lane_summary[0] is a different less specific header row
386 headers = lane_summary[1]
387 lane_summary = lane_summary[2:10]
388 # after the last lane, there's a set of chip wide averages
390 for r in lane_summary:
391 lrs = Summary.LaneResultSummary(html=r)
392 self.lane_results[lrs.lane] = lrs
394 def get_elements(self):
395 summary = ElementTree.Element(Summary.SUMMARY,
396 {'version': unicode(Summary.XML_VERSION)})
397 for lane in self.lane_results.values():
398 summary.append(lane.get_elements())
401 def set_elements(self, tree):
402 if tree.tag != Summary.SUMMARY:
403 return ValueError("Expected %s" % (Summary.SUMMARY,))
404 xml_version = int(tree.attrib.get('version', 0))
405 if xml_version > Summary.XML_VERSION:
406 logging.warn('Summary XML tree is a higher version than this class')
407 for element in list(tree):
408 lrs = Summary.LaneResultSummary()
409 lrs.set_elements(element)
410 self.lane_results[lrs.lane] = lrs
414 Debugging function, report current object
419 def build_genome_fasta_map(genome_dir):
420 # build fasta to fasta file map
421 genome = genome_dir.split(os.path.sep)[-1]
423 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
425 if os.path.islink(vld_file):
427 vld_file = os.path.realpath(vld_file)
428 path, vld_name = os.path.split(vld_file)
429 name, ext = os.path.splitext(vld_name)
431 fasta_map[name] = name
433 fasta_map[name] = os.path.join(genome, name)
436 class ElandLane(object):
438 Process an eland result file
442 SAMPLE_NAME = 'SampleName'
444 GENOME_MAP = 'GenomeMap'
445 GENOME_ITEM = 'GenomeItem'
446 MAPPED_READS = 'MappedReads'
447 MAPPED_ITEM = 'MappedItem'
448 MATCH_CODES = 'MatchCodes'
452 def __init__(self, pathname=None, genome_map=None, xml=None):
453 self.pathname = pathname
454 self._sample_name = None
457 self._mapped_reads = None
458 self._match_codes = None
459 if genome_map is None:
461 self.genome_map = genome_map
464 self.set_elements(xml)
468 Actually read the file and actually count the reads
470 # can't do anything if we don't have a file to process
471 if self.pathname is None:
474 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
475 raise RuntimeError("Eland isn't done, try again later.")
480 match_codes = {'NM':0, 'QC':0, 'RM':0,
481 'U0':0, 'U1':0, 'U2':0,
482 'R0':0, 'R1':0, 'R2':0,
484 for line in autoopen(self.pathname,'r'):
486 fields = line.split()
488 # match_codes[code] = match_codes.setdefault(code, 0) + 1
489 # the QC/NM etc codes are in the 3rd field and always present
490 match_codes[fields[2]] += 1
491 # ignore lines that don't have a fasta filename
494 fasta = self.genome_map.get(fields[6], fields[6])
495 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
496 self._match_codes = match_codes
497 self._mapped_reads = mapped_reads
500 def _update_name(self):
501 # extract the sample name
502 if self.pathname is None:
505 path, name = os.path.split(self.pathname)
506 split_name = name.split('_')
507 self._sample_name = split_name[0]
508 self._lane_id = split_name[1]
510 def _get_sample_name(self):
511 if self._sample_name is None:
513 return self._sample_name
514 sample_name = property(_get_sample_name)
516 def _get_lane_id(self):
517 if self._lane_id is None:
520 lane_id = property(_get_lane_id)
522 def _get_reads(self):
523 if self._reads is None:
526 reads = property(_get_reads)
528 def _get_mapped_reads(self):
529 if self._mapped_reads is None:
531 return self._mapped_reads
532 mapped_reads = property(_get_mapped_reads)
534 def _get_match_codes(self):
535 if self._match_codes is None:
537 return self._match_codes
538 match_codes = property(_get_match_codes)
540 def get_elements(self):
541 lane = ElementTree.Element(ElandLane.LANE,
543 unicode(ElandLane.XML_VERSION)})
544 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
545 sample_tag.text = self.sample_name
546 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
547 lane_tag.text = self.lane_id
548 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
549 for k, v in self.genome_map.items():
550 item = ElementTree.SubElement(
551 genome_map, ElandLane.GENOME_ITEM,
552 {'name':k, 'value':unicode(v)})
553 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
554 for k, v in self.mapped_reads.items():
555 item = ElementTree.SubElement(
556 mapped_reads, ElandLane.MAPPED_ITEM,
557 {'name':k, 'value':unicode(v)})
558 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
559 for k, v in self.match_codes.items():
560 item = ElementTree.SubElement(
561 match_codes, ElandLane.MATCH_ITEM,
562 {'name':k, 'value':unicode(v)})
563 reads = ElementTree.SubElement(lane, ElandLane.READS)
564 reads.text = unicode(self.reads)
568 def set_elements(self, tree):
569 if tree.tag != ElandLane.LANE:
570 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
573 self._mapped_reads = {}
574 self._match_codes = {}
577 tag = element.tag.lower()
578 if tag == ElandLane.SAMPLE_NAME.lower():
579 self._sample_name = element.text
580 elif tag == ElandLane.LANE_ID.lower():
581 self._lane_id = element.text
582 elif tag == ElandLane.GENOME_MAP.lower():
583 for child in element:
584 name = child.attrib['name']
585 value = child.attrib['value']
586 self.genome_map[name] = value
587 elif tag == ElandLane.MAPPED_READS.lower():
588 for child in element:
589 name = child.attrib['name']
590 value = child.attrib['value']
591 self._mapped_reads[name] = int(value)
592 elif tag == ElandLane.MATCH_CODES.lower():
593 for child in element:
594 name = child.attrib['name']
595 value = int(child.attrib['value'])
596 self._match_codes[name] = value
597 elif tag == ElandLane.READS.lower():
598 self._reads = int(element.text)
600 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
602 def extract_eland_sequence(instream, outstream, start, end):
604 Extract a chunk of sequence out of an eland file
606 for line in instream:
607 record = line.split()
609 result = [record[0], record[1][start:end]]
611 result = [record[0][start:end]]
612 outstream.write("\t".join(result))
613 outstream.write(os.linesep)
617 Summarize information from eland files
621 ELAND = 'ElandCollection'
625 def __init__(self, xml=None):
626 # we need information from the gerald config.xml
630 self.set_elements(xml)
633 return len(self.results)
636 return self.results.keys()
639 return self.results.values()
642 return self.results.items()
644 def __getitem__(self, key):
645 return self.results[key]
647 def get_elements(self):
648 root = ElementTree.Element(ELAND.ELAND,
649 {'version': unicode(ELAND.XML_VERSION)})
650 for lane_id, lane in self.results.items():
651 eland_lane = lane.get_elements()
652 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
653 root.append(eland_lane)
656 def set_elements(self, tree):
657 if tree.tag.lower() != ELAND.ELAND.lower():
658 raise ValueError('Expecting %s', ELAND.ELAND)
659 for element in list(tree):
660 lane_id = element.attrib[ELAND.LANE_ID]
661 lane = ElandLane(xml=element)
662 self.results[lane_id] = lane
664 def eland(basedir, gerald=None, genome_maps=None):
667 file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
668 if len(file_list) == 0:
669 # lets handle compressed eland files too
670 file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
672 for pathname in file_list:
673 # yes the lane_id is also being computed in ElandLane._update
674 # I didn't want to clutter up my constructor
675 # but I needed to persist the sample_name/lane_id for
676 # runfolder summary_report
677 path, name = os.path.split(pathname)
678 split_name = name.split('_')
679 lane_id = split_name[1]
681 if genome_maps is not None:
682 genome_map = genome_maps[lane_id]
683 elif gerald is not None:
684 genome_dir = gerald.lanes[lane_id].eland_genome
685 genome_map = build_genome_fasta_map(genome_dir)
689 eland_result = ElandLane(pathname, genome_map)
690 e.results[lane_id] = eland_result