2 Provide access to information stored in the GERALD directory.
4 from datetime import datetime, date
11 from gaworkflow.pipeline.runfolder import \
16 from gaworkflow.util.ethelp import indent, flatten
17 from gaworkflow.util.opener import autoopen
21 Capture meaning out of the GERALD directory
25 RUN_PARAMETERS='RunParameters'
28 class LaneParameters(object):
30 Make it easy to access elements of LaneSpecificRunParameters from python
32 def __init__(self, gerald, key):
36 def __get_attribute(self, xml_tag):
37 subtree = self._gerald.tree.find('LaneSpecificRunParameters')
38 container = subtree.find(xml_tag)
39 if container is None or \
40 len(container.getchildren()) != LANES_PER_FLOWCELL:
41 raise RuntimeError('GERALD config.xml file changed')
42 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
43 index = lanes.index(self._key)
44 #element = container.find(self._key)
45 element = container[index]
47 def _get_analysis(self):
48 return self.__get_attribute('ANALYSIS')
49 analysis = property(_get_analysis)
51 def _get_eland_genome(self):
52 return self.__get_attribute('ELAND_GENOME')
53 eland_genome = property(_get_eland_genome)
55 def _get_read_length(self):
56 return self.__get_attribute('READ_LENGTH')
57 read_length = property(_get_read_length)
59 def _get_use_bases(self):
60 return self.__get_attribute('USE_BASES')
61 use_bases = property(_get_use_bases)
63 class LaneSpecificRunParameters(object):
65 Provide access to LaneSpecificRunParameters
67 def __init__(self, gerald):
70 def __getitem__(self, key):
71 return Gerald.LaneParameters(self._gerald, key)
73 if self._keys is None:
74 tree = self._gerald.tree
75 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
76 # according to the pipeline specs I think their fields
77 # are sampleName_laneID, with sampleName defaulting to s
78 # since laneIDs are constant lets just try using
80 self._keys = [ x.tag.split('_')[1] for x in analysis]
83 return [ self[x] for x in self.keys() ]
85 return zip(self.keys(), self.values())
87 return len(self.keys())
89 def __init__(self, xml=None):
93 # parse lane parameters out of the config.xml file
94 self.lanes = Gerald.LaneSpecificRunParameters(self)
97 self.eland_results = None
100 self.set_elements(xml)
103 if self.tree is None:
104 return datetime.today()
105 timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
106 epochstamp = time.mktime(time.strptime(timestamp, '%c'))
107 return datetime.fromtimestamp(epochstamp)
108 date = property(_get_date)
111 return time.mktime(self.date.timetuple())
112 time = property(_get_time, doc='return run time as seconds since epoch')
114 def _get_version(self):
115 if self.tree is None:
117 return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
118 version = property(_get_version)
122 Debugging function, report current object
124 print 'Gerald version:', self.version
125 print 'Gerald run date:', self.date
126 print 'Gerald config.xml:', self.tree
129 def get_elements(self):
130 if self.tree is None or self.summary is None:
133 gerald = ElementTree.Element(Gerald.GERALD,
134 {'version': unicode(Gerald.XML_VERSION)})
135 gerald.append(self.tree)
136 gerald.append(self.summary.get_elements())
137 if self.eland_results:
138 gerald.append(self.eland_results.get_elements())
141 def set_elements(self, tree):
142 if tree.tag != Gerald.GERALD:
143 raise ValueError('exptected GERALD')
144 xml_version = int(tree.attrib.get('version', 0))
145 if xml_version > Gerald.XML_VERSION:
146 logging.warn('XML tree is a higher version than this class')
147 for element in list(tree):
148 tag = element.tag.lower()
149 if tag == Gerald.RUN_PARAMETERS.lower():
151 elif tag == Gerald.SUMMARY.lower():
152 self.summary = Summary(xml=element)
153 elif tag == ELAND.ELAND.lower():
154 self.eland_results = ELAND(xml=element)
156 logging.warn("Unrecognized tag %s" % (element.tag,))
159 def gerald(pathname):
161 g.pathname = pathname
162 path, name = os.path.split(pathname)
163 config_pathname = os.path.join(pathname, 'config.xml')
164 g.tree = ElementTree.parse(config_pathname).getroot()
166 # parse Summary.htm file
167 summary_pathname = os.path.join(pathname, 'Summary.htm')
168 g.summary = Summary(summary_pathname)
170 g.eland_results = eland(g.pathname, g)
175 Convert a value to int if its an int otherwise a float.
179 except ValueError, e:
183 def parse_mean_range(value):
185 Parse values like 123 +/- 4.5
187 if value.strip() == 'unknown':
190 average, pm, deviation = value.split()
192 raise RuntimeError("Summary.htm file format changed")
193 return tonumber(average), tonumber(deviation)
195 def make_mean_range_element(parent, name, mean, deviation):
197 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
199 element = ElementTree.SubElement(parent, name,
200 { 'mean': unicode(mean),
201 'deviation': unicode(deviation)})
204 def parse_mean_range_element(element):
206 Grab mean/deviation out of element
208 return (tonumber(element.attrib['mean']),
209 tonumber(element.attrib['deviation']))
211 class Summary(object):
213 Extract some useful information from the Summary.htm file
218 class LaneResultSummary(object):
220 Parse the LaneResultSummary table out of Summary.htm
221 Mostly for the cluster number
223 LANE_RESULT_SUMMARY = 'LaneResultSummary'
225 'Cluster': 'cluster',
226 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
227 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
228 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
229 'PercentPassFilterAlign': 'percent_pass_filter_align',
230 'AverageAlignmentScore': 'average_alignment_score',
231 'PercentErrorRate': 'percent_error_rate'
234 def __init__(self, html=None, xml=None):
237 self.average_first_cycle_intensity = None
238 self.percent_intensity_after_20_cycles = None
239 self.percent_pass_filter_clusters = None
240 self.percent_pass_filter_align = None
241 self.average_alignment_score = None
242 self.percent_error_rate = None
245 self.set_elements_from_html(html)
247 self.set_elements(xml)
249 def set_elements_from_html(self, row_element):
250 data = [ flatten(x) for x in row_element ]
252 raise RuntimeError("Summary.htm file format changed")
255 self.cluster = parse_mean_range(data[1])
256 self.average_first_cycle_intensity = parse_mean_range(data[2])
257 self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
258 self.percent_pass_filter_clusters = parse_mean_range(data[4])
259 self.percent_pass_filter_align = parse_mean_range(data[5])
260 self.average_alignment_score = parse_mean_range(data[6])
261 self.percent_error_rate = parse_mean_range(data[7])
263 def get_elements(self):
264 lane_result = ElementTree.Element(
265 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
267 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
268 element = make_mean_range_element(
271 *getattr(self, variable_name)
275 def set_elements(self, tree):
276 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
277 raise ValueError('Expected %s' % (
278 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
279 self.lane = tree.attrib['lane']
280 tags = Summary.LaneResultSummary.TAGS
281 for element in list(tree):
283 variable_name = tags[element.tag]
284 setattr(self, variable_name,
285 parse_mean_range_element(element))
287 logging.warn('Unrecognized tag %s' % (element.tag,))
289 def __init__(self, filename=None, xml=None):
290 self.lane_results = {}
292 if filename is not None:
293 self._extract_lane_results(filename)
295 self.set_elements(xml)
297 def __getitem__(self, key):
298 return self.lane_results[key]
301 return len(self.lane_results)
304 return self.lane_results.keys()
307 return self.lane_results.values()
310 return self.lane_results.items()
312 def _extract_lane_results(self, pathname):
314 extract the Lane Results Summary table
316 tree = ElementTree.parse(pathname).getroot()
317 if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
318 raise RuntimeError("Summary.htm file format changed")
320 tables = tree.findall('*//table')
322 # parse lane result summary
323 lane_summary = tables[2]
324 rows = lane_summary.getchildren()
325 headers = rows[0].getchildren()
326 if flatten(headers[2]) != 'Av 1st Cycle Int ':
327 raise RuntimeError("Summary.htm file format changed")
330 lrs = Summary.LaneResultSummary(html=r)
331 self.lane_results[lrs.lane] = lrs
333 def get_elements(self):
334 summary = ElementTree.Element(Summary.SUMMARY,
335 {'version': unicode(Summary.XML_VERSION)})
336 for lane in self.lane_results.values():
337 summary.append(lane.get_elements())
340 def set_elements(self, tree):
341 if tree.tag != Summary.SUMMARY:
342 return ValueError("Expected %s" % (Summary.SUMMARY,))
343 xml_version = int(tree.attrib.get('version', 0))
344 if xml_version > Summary.XML_VERSION:
345 logging.warn('Summary XML tree is a higher version than this class')
346 for element in list(tree):
347 lrs = Summary.LaneResultSummary()
348 lrs.set_elements(element)
349 self.lane_results[lrs.lane] = lrs
353 Debugging function, report current object
358 def build_genome_fasta_map(genome_dir):
359 # build fasta to fasta file map
360 genome = genome_dir.split(os.path.sep)[-1]
362 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
364 if os.path.islink(vld_file):
366 vld_file = os.path.realpath(vld_file)
367 path, vld_name = os.path.split(vld_file)
368 name, ext = os.path.splitext(vld_name)
370 fasta_map[name] = name
372 fasta_map[name] = os.path.join(genome, name)
375 class ElandLane(object):
377 Process an eland result file
381 SAMPLE_NAME = 'SampleName'
383 GENOME_MAP = 'GenomeMap'
384 GENOME_ITEM = 'GenomeItem'
385 MAPPED_READS = 'MappedReads'
386 MAPPED_ITEM = 'MappedItem'
387 MATCH_CODES = 'MatchCodes'
391 def __init__(self, pathname=None, genome_map=None, xml=None):
392 self.pathname = pathname
393 self._sample_name = None
396 self._mapped_reads = None
397 self._match_codes = None
398 if genome_map is None:
400 self.genome_map = genome_map
403 self.set_elements(xml)
407 Actually read the file and actually count the reads
409 # can't do anything if we don't have a file to process
410 if self.pathname is None:
413 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
414 raise RuntimeError("Eland isn't done, try again later.")
419 match_codes = {'NM':0, 'QC':0, 'RM':0,
420 'U0':0, 'U1':0, 'U2':0,
421 'R0':0, 'R1':0, 'R2':0,
423 for line in autoopen(self.pathname,'r'):
425 fields = line.split()
427 # match_codes[code] = match_codes.setdefault(code, 0) + 1
428 # the QC/NM etc codes are in the 3rd field and always present
429 match_codes[fields[2]] += 1
430 # ignore lines that don't have a fasta filename
433 fasta = self.genome_map.get(fields[6], fields[6])
434 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
435 self._match_codes = match_codes
436 self._mapped_reads = mapped_reads
439 def _update_name(self):
440 # extract the sample name
441 if self.pathname is None:
444 path, name = os.path.split(self.pathname)
445 split_name = name.split('_')
446 self._sample_name = split_name[0]
447 self._lane_id = split_name[1]
449 def _get_sample_name(self):
450 if self._sample_name is None:
452 return self._sample_name
453 sample_name = property(_get_sample_name)
455 def _get_lane_id(self):
456 if self._lane_id is None:
459 lane_id = property(_get_lane_id)
461 def _get_reads(self):
462 if self._reads is None:
465 reads = property(_get_reads)
467 def _get_mapped_reads(self):
468 if self._mapped_reads is None:
470 return self._mapped_reads
471 mapped_reads = property(_get_mapped_reads)
473 def _get_match_codes(self):
474 if self._match_codes is None:
476 return self._match_codes
477 match_codes = property(_get_match_codes)
479 def get_elements(self):
480 lane = ElementTree.Element(ElandLane.LANE,
482 unicode(ElandLane.XML_VERSION)})
483 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
484 sample_tag.text = self.sample_name
485 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
486 lane_tag.text = self.lane_id
487 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
488 for k, v in self.genome_map.items():
489 item = ElementTree.SubElement(
490 genome_map, ElandLane.GENOME_ITEM,
491 {'name':k, 'value':unicode(v)})
492 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
493 for k, v in self.mapped_reads.items():
494 item = ElementTree.SubElement(
495 mapped_reads, ElandLane.MAPPED_ITEM,
496 {'name':k, 'value':unicode(v)})
497 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
498 for k, v in self.match_codes.items():
499 item = ElementTree.SubElement(
500 match_codes, ElandLane.MATCH_ITEM,
501 {'name':k, 'value':unicode(v)})
502 reads = ElementTree.SubElement(lane, ElandLane.READS)
503 reads.text = unicode(self.reads)
507 def set_elements(self, tree):
508 if tree.tag != ElandLane.LANE:
509 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
512 self._mapped_reads = {}
513 self._match_codes = {}
516 tag = element.tag.lower()
517 if tag == ElandLane.SAMPLE_NAME.lower():
518 self._sample_name = element.text
519 elif tag == ElandLane.LANE_ID.lower():
520 self._lane_id = element.text
521 elif tag == ElandLane.GENOME_MAP.lower():
522 for child in element:
523 name = child.attrib['name']
524 value = child.attrib['value']
525 self.genome_map[name] = value
526 elif tag == ElandLane.MAPPED_READS.lower():
527 for child in element:
528 name = child.attrib['name']
529 value = child.attrib['value']
530 self._mapped_reads[name] = int(value)
531 elif tag == ElandLane.MATCH_CODES.lower():
532 for child in element:
533 name = child.attrib['name']
534 value = int(child.attrib['value'])
535 self._match_codes[name] = value
536 elif tag == ElandLane.READS.lower():
537 self._reads = int(element.text)
539 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
541 def extract_eland_sequence(instream, outstream, start, end):
543 Extract a chunk of sequence out of an eland file
545 for line in instream:
546 record = line.split()
548 result = [record[0], record[1][start:end]]
550 result = [record[0][start:end]]
551 outstream.write("\t".join(result))
552 outstream.write(os.linesep)
556 Summarize information from eland files
560 ELAND = 'ElandCollection'
564 def __init__(self, xml=None):
565 # we need information from the gerald config.xml
569 self.set_elements(xml)
572 return len(self.results)
575 return self.results.keys()
578 return self.results.values()
581 return self.results.items()
583 def __getitem__(self, key):
584 return self.results[key]
586 def get_elements(self):
587 root = ElementTree.Element(ELAND.ELAND,
588 {'version': unicode(ELAND.XML_VERSION)})
589 for lane_id, lane in self.results.items():
590 eland_lane = lane.get_elements()
591 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
592 root.append(eland_lane)
595 def set_elements(self, tree):
596 if tree.tag.lower() != ELAND.ELAND.lower():
597 raise ValueError('Expecting %s', ELAND.ELAND)
598 for element in list(tree):
599 lane_id = element.attrib[ELAND.LANE_ID]
600 lane = ElandLane(xml=element)
601 self.results[lane_id] = lane
603 def eland(basedir, gerald=None, genome_maps=None):
606 file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
607 if len(file_list) == 0:
608 # lets handle compressed eland files too
609 file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
611 for pathname in file_list:
612 # yes the lane_id is also being computed in ElandLane._update
613 # I didn't want to clutter up my constructor
614 # but I needed to persist the sample_name/lane_id for
615 # runfolder summary_report
616 path, name = os.path.split(pathname)
617 split_name = name.split('_')
618 lane_id = split_name[1]
620 if genome_maps is not None:
621 genome_map = genome_maps[lane_id]
622 elif gerald is not None:
623 genome_dir = gerald.lanes[lane_id].eland_genome
624 genome_map = build_genome_fasta_map(genome_dir)
628 eland_result = ElandLane(pathname, genome_map)
629 e.results[lane_id] = eland_result