2 Provide access to information stored in the GERALD directory.
4 from datetime import datetime, date
11 from gaworkflow.pipeline.runfolder import \
16 from gaworkflow.util.ethelp import indent, flatten
20 Capture meaning out of the GERALD directory
24 RUN_PARAMETERS='RunParameters'
27 class LaneParameters(object):
29 Make it easy to access elements of LaneSpecificRunParameters from python
31 def __init__(self, gerald, key):
35 def __get_attribute(self, xml_tag):
36 subtree = self._gerald.tree.find('LaneSpecificRunParameters')
37 container = subtree.find(xml_tag)
38 if container is None or \
39 len(container.getchildren()) != LANES_PER_FLOWCELL:
40 raise RuntimeError('GERALD config.xml file changed')
41 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
42 index = lanes.index(self._key)
43 #element = container.find(self._key)
44 element = container[index]
46 def _get_analysis(self):
47 return self.__get_attribute('ANALYSIS')
48 analysis = property(_get_analysis)
50 def _get_eland_genome(self):
51 return self.__get_attribute('ELAND_GENOME')
52 eland_genome = property(_get_eland_genome)
54 def _get_read_length(self):
55 return self.__get_attribute('READ_LENGTH')
56 read_length = property(_get_read_length)
58 def _get_use_bases(self):
59 return self.__get_attribute('USE_BASES')
60 use_bases = property(_get_use_bases)
62 class LaneSpecificRunParameters(object):
64 Provide access to LaneSpecificRunParameters
66 def __init__(self, gerald):
69 def __getitem__(self, key):
70 return Gerald.LaneParameters(self._gerald, key)
72 if self._keys is None:
73 tree = self._gerald.tree
74 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
75 # according to the pipeline specs I think their fields
76 # are sampleName_laneID, with sampleName defaulting to s
77 # since laneIDs are constant lets just try using
79 self._keys = [ x.tag.split('_')[1] for x in analysis]
82 return [ self[x] for x in self.keys() ]
84 return zip(self.keys(), self.values())
86 return len(self.keys())
88 def __init__(self, xml=None):
92 # parse lane parameters out of the config.xml file
93 self.lanes = Gerald.LaneSpecificRunParameters(self)
96 self.eland_results = None
99 self.set_elements(xml)
102 if self.tree is None:
103 return datetime.today()
104 timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
105 epochstamp = time.mktime(time.strptime(timestamp, '%c'))
106 return datetime.fromtimestamp(epochstamp)
107 date = property(_get_date)
110 return time.mktime(self.date.timetuple())
111 time = property(_get_time, doc='return run time as seconds since epoch')
113 def _get_version(self):
114 if self.tree is None:
116 return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
117 version = property(_get_version)
121 Debugging function, report current object
123 print 'Gerald version:', self.version
124 print 'Gerald run date:', self.date
125 print 'Gerald config.xml:', self.tree
128 def get_elements(self):
129 if self.tree is None or self.summary is None:
132 gerald = ElementTree.Element(Gerald.GERALD,
133 {'version': unicode(Gerald.XML_VERSION)})
134 gerald.append(self.tree)
135 gerald.append(self.summary.get_elements())
136 if self.eland_results:
137 gerald.append(self.eland_results.get_elements())
140 def set_elements(self, tree):
141 if tree.tag != Gerald.GERALD:
142 raise ValueError('exptected GERALD')
143 xml_version = int(tree.attrib.get('version', 0))
144 if xml_version > Gerald.XML_VERSION:
145 logging.warn('XML tree is a higher version than this class')
146 for element in list(tree):
147 tag = element.tag.lower()
148 if tag == Gerald.RUN_PARAMETERS.lower():
150 elif tag == Gerald.SUMMARY.lower():
151 self.summary = Summary(xml=element)
152 elif tag == ELAND.ELAND.lower():
153 self.eland_results = ELAND(xml=element)
155 logging.warn("Unrecognized tag %s" % (element.tag,))
158 def gerald(pathname):
160 g.pathname = pathname
161 path, name = os.path.split(pathname)
162 config_pathname = os.path.join(pathname, 'config.xml')
163 g.tree = ElementTree.parse(config_pathname).getroot()
165 # parse Summary.htm file
166 summary_pathname = os.path.join(pathname, 'Summary.htm')
167 g.summary = Summary(summary_pathname)
169 g.eland_results = eland(g.pathname, g)
174 Convert a value to int if its an int otherwise a float.
178 except ValueError, e:
182 def parse_mean_range(value):
184 Parse values like 123 +/- 4.5
186 if value.strip() == 'unknown':
189 average, pm, deviation = value.split()
191 raise RuntimeError("Summary.htm file format changed")
192 return tonumber(average), tonumber(deviation)
194 def make_mean_range_element(parent, name, mean, deviation):
196 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
198 element = ElementTree.SubElement(parent, name,
199 { 'mean': unicode(mean),
200 'deviation': unicode(deviation)})
203 def parse_mean_range_element(element):
205 Grab mean/deviation out of element
207 return (tonumber(element.attrib['mean']),
208 tonumber(element.attrib['deviation']))
210 class Summary(object):
212 Extract some useful information from the Summary.htm file
217 class LaneResultSummary(object):
219 Parse the LaneResultSummary table out of Summary.htm
220 Mostly for the cluster number
222 LANE_RESULT_SUMMARY = 'LaneResultSummary'
224 'Cluster': 'cluster',
225 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
226 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
227 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
228 'PercentPassFilterAlign': 'percent_pass_filter_align',
229 'AverageAlignmentScore': 'average_alignment_score',
230 'PercentErrorRate': 'percent_error_rate'
233 def __init__(self, html=None, xml=None):
236 self.average_first_cycle_intensity = None
237 self.percent_intensity_after_20_cycles = None
238 self.percent_pass_filter_clusters = None
239 self.percent_pass_filter_align = None
240 self.average_alignment_score = None
241 self.percent_error_rate = None
244 self.set_elements_from_html(html)
246 self.set_elements(xml)
248 def set_elements_from_html(self, row_element):
249 data = [ flatten(x) for x in row_element ]
251 raise RuntimeError("Summary.htm file format changed")
254 self.cluster = parse_mean_range(data[1])
255 self.average_first_cycle_intensity = parse_mean_range(data[2])
256 self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
257 self.percent_pass_filter_clusters = parse_mean_range(data[4])
258 self.percent_pass_filter_align = parse_mean_range(data[5])
259 self.average_alignment_score = parse_mean_range(data[6])
260 self.percent_error_rate = parse_mean_range(data[7])
262 def get_elements(self):
263 lane_result = ElementTree.Element(
264 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
266 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
267 element = make_mean_range_element(
270 *getattr(self, variable_name)
274 def set_elements(self, tree):
275 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
276 raise ValueError('Expected %s' % (
277 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
278 self.lane = tree.attrib['lane']
279 tags = Summary.LaneResultSummary.TAGS
280 for element in list(tree):
282 variable_name = tags[element.tag]
283 setattr(self, variable_name,
284 parse_mean_range_element(element))
286 logging.warn('Unrecognized tag %s' % (element.tag,))
288 def __init__(self, filename=None, xml=None):
289 self.lane_results = {}
291 if filename is not None:
292 self._extract_lane_results(filename)
294 self.set_elements(xml)
296 def __getitem__(self, key):
297 return self.lane_results[key]
300 return len(self.lane_results)
303 return self.lane_results.keys()
306 return self.lane_results.values()
309 return self.lane_results.items()
311 def _extract_lane_results(self, pathname):
313 extract the Lane Results Summary table
315 tree = ElementTree.parse(pathname).getroot()
316 if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
317 raise RuntimeError("Summary.htm file format changed")
319 tables = tree.findall('*//table')
321 # parse lane result summary
322 lane_summary = tables[2]
323 rows = lane_summary.getchildren()
324 headers = rows[0].getchildren()
325 if flatten(headers[2]) != 'Av 1st Cycle Int ':
326 raise RuntimeError("Summary.htm file format changed")
329 lrs = Summary.LaneResultSummary(html=r)
330 self.lane_results[lrs.lane] = lrs
332 def get_elements(self):
333 summary = ElementTree.Element(Summary.SUMMARY,
334 {'version': unicode(Summary.XML_VERSION)})
335 for lane in self.lane_results.values():
336 summary.append(lane.get_elements())
339 def set_elements(self, tree):
340 if tree.tag != Summary.SUMMARY:
341 return ValueError("Expected %s" % (Summary.SUMMARY,))
342 xml_version = int(tree.attrib.get('version', 0))
343 if xml_version > Summary.XML_VERSION:
344 logging.warn('Summary XML tree is a higher version than this class')
345 for element in list(tree):
346 lrs = Summary.LaneResultSummary()
347 lrs.set_elements(element)
348 self.lane_results[lrs.lane] = lrs
352 Debugging function, report current object
357 def build_genome_fasta_map(genome_dir):
358 # build fasta to fasta file map
359 genome = genome_dir.split(os.path.sep)[-1]
361 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
363 if os.path.islink(vld_file):
365 vld_file = os.path.realpath(vld_file)
366 path, vld_name = os.path.split(vld_file)
367 name, ext = os.path.splitext(vld_name)
369 fasta_map[name] = name
371 fasta_map[name] = os.path.join(genome, name)
374 class ElandLane(object):
376 Process an eland result file
380 SAMPLE_NAME = 'SampleName'
382 GENOME_MAP = 'GenomeMap'
383 GENOME_ITEM = 'GenomeItem'
384 MAPPED_READS = 'MappedReads'
385 MAPPED_ITEM = 'MappedItem'
386 MATCH_CODES = 'MatchCodes'
390 def __init__(self, pathname=None, genome_map=None, xml=None):
391 self.pathname = pathname
392 self._sample_name = None
395 self._mapped_reads = None
396 self._match_codes = None
397 if genome_map is None:
399 self.genome_map = genome_map
402 self.set_elements(xml)
406 Actually read the file and actually count the reads
408 # can't do anything if we don't have a file to process
409 if self.pathname is None:
412 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
413 raise RuntimeError("Eland isn't done, try again later.")
418 match_codes = {'NM':0, 'QC':0, 'RM':0,
419 'U0':0, 'U1':0, 'U2':0,
420 'R0':0, 'R1':0, 'R2':0,
422 for line in open(self.pathname):
424 fields = line.split()
426 # match_codes[code] = match_codes.setdefault(code, 0) + 1
427 # the QC/NM etc codes are in the 3rd field and always present
428 match_codes[fields[2]] += 1
429 # ignore lines that don't have a fasta filename
432 fasta = self.genome_map.get(fields[6], fields[6])
433 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
434 self._match_codes = match_codes
435 self._mapped_reads = mapped_reads
438 def _update_name(self):
439 # extract the sample name
440 if self.pathname is None:
443 path, name = os.path.split(self.pathname)
444 split_name = name.split('_')
445 self._sample_name = split_name[0]
446 self._lane_id = split_name[1]
448 def _get_sample_name(self):
449 if self._sample_name is None:
451 return self._sample_name
452 sample_name = property(_get_sample_name)
454 def _get_lane_id(self):
455 if self._lane_id is None:
458 lane_id = property(_get_lane_id)
460 def _get_reads(self):
461 if self._reads is None:
464 reads = property(_get_reads)
466 def _get_mapped_reads(self):
467 if self._mapped_reads is None:
469 return self._mapped_reads
470 mapped_reads = property(_get_mapped_reads)
472 def _get_match_codes(self):
473 if self._match_codes is None:
475 return self._match_codes
476 match_codes = property(_get_match_codes)
478 def get_elements(self):
479 lane = ElementTree.Element(ElandLane.LANE,
481 unicode(ElandLane.XML_VERSION)})
482 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
483 sample_tag.text = self.sample_name
484 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
485 lane_tag.text = self.lane_id
486 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
487 for k, v in self.genome_map.items():
488 item = ElementTree.SubElement(
489 genome_map, ElandLane.GENOME_ITEM,
490 {'name':k, 'value':unicode(v)})
491 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
492 for k, v in self.mapped_reads.items():
493 item = ElementTree.SubElement(
494 mapped_reads, ElandLane.MAPPED_ITEM,
495 {'name':k, 'value':unicode(v)})
496 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
497 for k, v in self.match_codes.items():
498 item = ElementTree.SubElement(
499 match_codes, ElandLane.MATCH_ITEM,
500 {'name':k, 'value':unicode(v)})
501 reads = ElementTree.SubElement(lane, ElandLane.READS)
502 reads.text = unicode(self.reads)
506 def set_elements(self, tree):
507 if tree.tag != ElandLane.LANE:
508 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
511 self._mapped_reads = {}
512 self._match_codes = {}
515 tag = element.tag.lower()
516 if tag == ElandLane.SAMPLE_NAME.lower():
517 self._sample_name = element.text
518 elif tag == ElandLane.LANE_ID.lower():
519 self._lane_id = element.text
520 elif tag == ElandLane.GENOME_MAP.lower():
521 for child in element:
522 name = child.attrib['name']
523 value = child.attrib['value']
524 self.genome_map[name] = value
525 elif tag == ElandLane.MAPPED_READS.lower():
526 for child in element:
527 name = child.attrib['name']
528 value = child.attrib['value']
529 self._mapped_reads[name] = int(value)
530 elif tag == ElandLane.MATCH_CODES.lower():
531 for child in element:
532 name = child.attrib['name']
533 value = int(child.attrib['value'])
534 self._match_codes[name] = value
535 elif tag == ElandLane.READS.lower():
536 self._reads = int(element.text)
538 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
540 def extract_eland_sequence(instream, outstream, start, end):
542 Extract a chunk of sequence out of an eland file
544 for line in instream:
545 record = line.split()
547 result = [record[0], record[1][start:end]]
549 result = [record[0][start:end]]
550 outstream.write("\t".join(result))
551 outstream.write(os.linesep)
555 Summarize information from eland files
559 ELAND = 'ElandCollection'
563 def __init__(self, xml=None):
564 # we need information from the gerald config.xml
568 self.set_elements(xml)
571 return len(self.results)
574 return self.results.keys()
577 return self.results.values()
580 return self.results.items()
582 def __getitem__(self, key):
583 return self.results[key]
585 def get_elements(self):
586 root = ElementTree.Element(ELAND.ELAND,
587 {'version': unicode(ELAND.XML_VERSION)})
588 for lane_id, lane in self.results.items():
589 eland_lane = lane.get_elements()
590 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
591 root.append(eland_lane)
594 def set_elements(self, tree):
595 if tree.tag.lower() != ELAND.ELAND.lower():
596 raise ValueError('Expecting %s', ELAND.ELAND)
597 for element in list(tree):
598 lane_id = element.attrib[ELAND.LANE_ID]
599 lane = ElandLane(xml=element)
600 self.results[lane_id] = lane
602 def eland(basedir, gerald=None, genome_maps=None):
604 for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
605 # yes the lane_id is also being computed in ElandLane._update
606 # I didn't want to clutter up my constructor
607 # but I needed to persist the sample_name/lane_id for
608 # runfolder summary_report
609 path, name = os.path.split(pathname)
610 split_name = name.split('_')
611 lane_id = split_name[1]
613 if genome_maps is not None:
614 genome_map = genome_maps[lane_id]
615 elif gerald is not None:
616 genome_dir = gerald.lanes[lane_id].eland_genome
617 genome_map = build_genome_fasta_map(genome_dir)
621 eland_result = ElandLane(pathname, genome_map)
622 e.results[lane_id] = eland_result