3 Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
7 * start/stop cycle numbers
8 * Firecrest, bustard, gerald version numbers
9 * Eland analysis types, and everything in the eland configuration file.
10 * cluster numbers and other values from the Summary.htm
11 LaneSpecificParameters table.
12 * How many reads mapped to a genome from an eland file
14 The ELAND "mapped reads" counter will also check for eland squashed file
15 that were symlinked from another directory. This is so I can track how
16 many reads landed on the genome of interest and on the spike ins.
18 Basically my subdirectories something like:
21 genomes/hg18/chr*.2bpb <- files for hg18 genome
23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
26 runfolder.py can also spit out a simple summary report (-s option)
27 that contains the per lane post filter cluster numbers and the mapped
28 read counts. (The report isn't currently very pretty)
37 from pprint import pprint
41 from xml.etree import ElementTree
42 except ImportError, e:
43 from elementtree import ElementTree
45 from gaworkflow.util.alphanum import alphanum
46 EUROPEAN_STRPTIME = "%d-%m-%Y"
47 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
48 VERSION_RE = "([0-9\.]+)"
49 USER_RE = "([a-zA-Z0-9]+)"
50 LANES_PER_FLOWCELL = 8
52 runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
54 def indent(elem, level=0):
56 reformat an element tree to be 'pretty' (indented)
60 if not elem.text or not elem.text.strip():
63 indent(child, level+1)
64 # we don't want the closing tag indented too far
66 if not elem.tail or not elem.tail.strip():
69 if level and (not elem.tail or not elem.tail.strip()):
72 def flatten(elem, include_tail=0):
74 Extract the text from an element tree
75 (AKA extract the text that not part of XML tags)
77 text = elem.text or ""
80 if include_tail and elem.tail: text += elem.tail
83 class Firecrest(object):
84 def __init__(self, pathname):
85 self.pathname = pathname
87 # parse firecrest directory name
88 path, name = os.path.split(pathname)
89 groups = name.split('_')
90 # grab the start/stop cycle information
91 cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
92 self.start = int(cycle.group(1))
93 self.stop = int(cycle.group(2))
95 version = re.search(VERSION_RE, groups[1])
96 self.version = (version.group(1))
98 self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
99 self.time = time.mktime(self.date)
101 self.user = groups[3]
103 # should I parse this deeper than just stashing the
104 # contents of the matrix file?
105 matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
106 self.matrix = open(matrix_pathname, 'r').read()
109 print "Starting cycle:", self.start
110 print "Ending cycle:", self.stop
111 print "Firecrest version:", self.version
112 print "Run date:", self.date
113 print "user:", self.user
115 def _get_elements(self):
116 firecrest = ElementTree.Element('Firecrest')
117 version = ElementTree.SubElement(firecrest, 'version')
118 version.text = self.version
119 start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
120 start_cycle.text = str(self.start)
121 stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
122 stop_cycle.text = str(self.stop)
123 run_date = ElementTree.SubElement(firecrest, 'run_time')
124 run_date.text = str(self.time)
125 matrix = ElementTree.SubElement(firecrest, 'matrix')
126 matrix.text = self.matrix
128 elements=property(_get_elements)
130 class Bustard(object):
131 def __init__(self, pathname):
132 self.pathname = pathname
134 path, name = os.path.split(pathname)
135 groups = name.split("_")
136 version = re.search(VERSION_RE, groups[0])
137 self.version = version.group(1)
138 self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
139 self.time = time.mktime(self.date)
140 self.user = groups[2]
144 def _load_params(self):
145 paramfiles = glob(os.path.join(self.pathname, "params*"))
146 for paramfile in paramfiles:
147 path, name = os.path.split(paramfile)
148 basename, ext = os.path.splitext(name)
149 # the last character of the param filename should be the
151 lane = int(basename[-1])
152 # we want the whole tree, not just the stuff under
154 param_tree = ElementTree.parse(paramfile).getroot()
155 self.phasing[lane] = param_tree
158 print "Bustard version:", self.version
159 print "Run date", self.date
160 print "user:", self.user
161 for lane, tree in self.phasing.items():
165 def _get_elements(self):
166 bustard = ElementTree.Element('Bustard')
167 version = ElementTree.SubElement(bustard, 'version')
168 version.text = self.version
169 run_date = ElementTree.SubElement(bustard, 'run_time')
170 run_date.text = str(self.time)
172 elements=property(_get_elements)
174 class GERALD(object):
176 Capture meaning out of the GERALD directory
178 class LaneParameters(object):
180 Make it easy to access elements of LaneSpecificRunParameters from python
182 def __init__(self, tree, key):
183 self._tree = tree.find('LaneSpecificRunParameters')
186 def __get_attribute(self, xml_tag):
187 container = self._tree.find(xml_tag)
188 if container is None or \
189 len(container.getchildren()) != LANES_PER_FLOWCELL:
190 raise RuntimeError('GERALD config.xml file changed')
191 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
192 index = lanes.index(self._key)
193 #element = container.find(self._key)
194 element = container[index]
196 def _get_analysis(self):
197 return self.__get_attribute('ANALYSIS')
198 analysis = property(_get_analysis)
200 def _get_eland_genome(self):
201 return self.__get_attribute('ELAND_GENOME')
202 eland_genome = property(_get_eland_genome)
204 def _get_read_length(self):
205 return self.__get_attribute('READ_LENGTH')
206 read_length = property(_get_read_length)
208 def _get_use_bases(self):
209 return self.__get_attribute('USE_BASES')
210 use_bases = property(_get_use_bases)
212 class LaneSpecificRunParameters(object):
214 Provide access to LaneSpecificRunParameters
216 def __init__(self, tree):
219 def __getitem__(self, key):
220 return GERALD.LaneParameters(self._tree, key)
222 if self._keys is None:
223 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
224 # according to the pipeline specs I think their fields
225 # are sampleName_laneID, with sampleName defaulting to s
226 # since laneIDs are constant lets just try using
227 # those consistently.
228 self._keys = [ x.tag.split('_')[1] for x in analysis]
231 return [ self[x] for x in self.keys() ]
233 return zip(self.keys(), self.values())
235 return len(self.keys)
237 def __init__(self, pathname):
238 self.pathname = pathname
239 path, name = os.path.split(pathname)
240 config_pathname = os.path.join(self.pathname, 'config.xml')
241 self.tree = ElementTree.parse(config_pathname).getroot()
242 self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
244 date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
245 self.date = time.strptime(date)
246 self.time = time.mktime(self.date)
248 # parse Summary.htm file
249 summary_pathname = os.path.join(self.pathname, 'Summary.htm')
250 self.summary = Summary(summary_pathname)
251 self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
252 self.eland_results = ELAND(self, self.pathname)
256 Debugging function, report current object
258 print 'Gerald version:', self.version
259 print 'Gerald run date:', self.date
260 print 'Gerald config.xml:', self.tree
263 def _get_elements(self):
264 gerald = ElementTree.Element('Gerald')
265 gerald.append(self.tree)
266 gerald.append(self.summary.elements)
268 elements = property(_get_elements)
272 Convert a value to int if its an int otherwise a float.
276 except ValueError, e:
280 def parse_mean_range(value):
282 Parse values like 123 +/- 4.5
284 if value.strip() == 'unknown':
287 average, pm, deviation = value.split()
289 raise RuntimeError("Summary.htm file format changed")
290 return tonumber(average), tonumber(deviation)
292 def mean_range_element(parent, name, mean, deviation):
294 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
296 element = ElementTree.SubElement(parent, name,
298 'deviation': str(deviation)})
301 class LaneResultSummary(object):
303 Parse the LaneResultSummary table out of Summary.htm
304 Mostly for the cluster number
306 def __init__(self, row_element):
307 data = [ flatten(x) for x in row_element ]
309 raise RuntimeError("Summary.htm file format changed")
312 self.cluster = parse_mean_range(data[1])
313 self.average_first_cycle_intensity = parse_mean_range(data[2])
314 self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
315 self.percent_pass_filter_clusters = parse_mean_range(data[4])
316 self.percent_pass_filter_align = parse_mean_range(data[5])
317 self.average_alignment_score = parse_mean_range(data[6])
318 self.percent_error_rate = parse_mean_range(data[7])
320 def _get_elements(self):
321 lane_result = ElementTree.Element('LaneResultSummary',
323 cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
324 first_cycle = mean_range_element(lane_result,
325 'AverageFirstCycleIntensity',
326 *self.average_first_cycle_intensity)
327 after_20 = mean_range_element(lane_result,
328 'PercentIntensityAfter20Cycles',
329 *self.percent_intensity_after_20_cycles)
330 pass_filter = mean_range_element(lane_result,
331 'PercentPassFilterClusters',
332 *self.percent_pass_filter_clusters)
333 alignment = mean_range_element(lane_result,
334 'AverageAlignmentScore',
335 *self.average_alignment_score)
336 error_rate = mean_range_element(lane_result,
338 *self.percent_error_rate)
340 elements = property(_get_elements)
342 class Summary(object):
344 Extract some useful information from the Summary.htm file
346 def __init__(self, pathname):
347 self.pathname = pathname
348 self.tree = ElementTree.parse(pathname).getroot()
349 self.lane_results = {}
351 self._extract_lane_results()
353 def _extract_lane_results(self):
355 extract the Lane Results Summary table
357 if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
358 raise RuntimeError("Summary.htm file format changed")
360 tables = self.tree.findall('*//table')
362 # parse lane result summary
363 lane_summary = tables[2]
364 rows = lane_summary.getchildren()
365 headers = rows[0].getchildren()
366 if flatten(headers[2]) != 'Av 1st Cycle Int ':
367 raise RuntimeError("Summary.htm file format changed")
370 lrs = LaneResultSummary(r)
371 self.lane_results[lrs.lane] = lrs
373 def _get_elements(self):
374 summary = ElementTree.Element('Summary')
375 for lane in self.lane_results.values():
376 summary.append(lane.elements)
378 elements = property(_get_elements)
382 Debugging function, report current object
389 Summarize information from eland files
391 class ElandResult(object):
393 Process an eland result file
395 def __init__(self, gerald, pathname):
397 self.pathname = pathname
398 # extract the sample name
399 path, name = os.path.split(self.pathname)
400 split_name = name.split('_')
401 self.sample_name = split_name[0]
402 self.lane_id = split_name[1]
404 self._mapped_reads = None
407 def _build_fasta_map(self, genome_dir):
408 # build fasta to fasta file map
409 genome = genome_dir.split(os.path.sep)[-1]
411 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
413 if os.path.islink(vld_file):
415 vld_file = os.path.realpath(vld_file)
416 path, vld_name = os.path.split(vld_file)
417 name, ext = os.path.splitext(vld_name)
419 fasta_map[name] = name
421 fasta_map[name] = os.path.join(genome, name)
422 self._fasta_map = fasta_map
426 Actually read the file and actually count the reads
428 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
429 raise RuntimeError("Eland isn't done, try again later.")
433 genome_dir = self.gerald.lanes[self.lane_id].eland_genome
434 self._build_fasta_map(genome_dir)
435 match_codes = {'NM':0, 'QC':0, 'RM':0,
436 'U0':0, 'U1':0, 'U2':0,
437 'R0':0, 'R1':0, 'R2':0,
439 for line in open(self.pathname):
441 fields = line.split()
443 # match_codes[code] = match_codes.setdefault(code, 0) + 1
444 # the QC/NM etc codes are in the 3rd field and always present
445 match_codes[fields[2]] += 1
446 # ignore lines that don't have a fasta filename
449 fasta = self._fasta_map.get(fields[6], fields[6])
450 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
451 self._match_codes = match_codes
452 self._mapped_reads = mapped_reads
455 def _get_reads(self):
456 if self._reads is None:
459 reads = property(_get_reads)
461 def _get_mapped_reads(self):
462 if self._mapped_reads is None:
464 return self._mapped_reads
465 mapped_reads = property(_get_mapped_reads)
467 def __init__(self, gerald, basedir):
468 # we need information from the gerald config.xml
471 for f in glob(os.path.join(basedir, "*_eland_result.txt")):
472 eland_result = ELAND.ElandResult(gerald, f)
473 self.results[eland_result.lane_id] = eland_result
475 class PipelineRun(object):
477 Capture "interesting" information about a pipeline run
479 def __init__(self, pathname, firecrest, bustard, gerald):
480 self.pathname = pathname
482 self._flowcell_id = None
483 self.firecrest = firecrest
484 self.bustard = bustard
487 def _get_flowcell_id(self):
488 # extract flowcell ID
489 if self._flowcell_id is None:
490 config_dir = os.path.join(self.pathname, 'Config')
491 flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
492 if os.path.exists(flowcell_id_path):
493 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
494 self._flowcell_id = flowcell_id_tree.findtext('Text')
497 "Unable to determine flowcell id as %s was not found" % (
499 self._flowcell_id = "unknown"
500 return self._flowcell_id
501 flowcell_id = property(_get_flowcell_id)
505 make one master xml file from all of our sub-components.
507 root = ElementTree.Element('PipelineRun')
508 flowcell = ElementTree.SubElement(root, 'FlowcellID')
509 flowcell.text = self.flowcell_id
510 root.append(self.firecrest.elements)
511 root.append(self.bustard.elements)
512 root.append(self.gerald.elements)
515 def _get_pretty_xml(self):
517 Generate indented xml file
519 root = self._get_xml()
522 xml = property(_get_pretty_xml)
524 def _get_run_name(self):
526 Given a run tuple, find the latest date and use that as our name
528 if self._name is None:
529 tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
530 timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
531 self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
533 name = property(_get_run_name)
536 logging.info("Saving run report "+ self.name)
537 ElementTree.ElementTree(self.xml).write(self.name)
539 def get_runs(runfolder):
541 Search through a run folder for all the various sub component runs
542 and then return a PipelineRun for each different combination.
544 For example if there are two different GERALD runs, this will
545 generate two different PipelineRun objects, that differ
546 in there gerald component.
548 datadir = os.path.join(runfolder, 'Data')
550 logging.info('Searching for runs in ' + datadir)
552 for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
553 f = Firecrest(firecrest_pathname)
554 bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
555 for bustard_pathname in glob(bustard_glob):
556 b = Bustard(bustard_pathname)
557 gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
558 for gerald_pathname in glob(gerald_glob):
560 g = GERALD(gerald_pathname)
561 runs.append(PipelineRun(runfolder, f, b, g))
563 print "Ignoring", str(e)
567 def extract_run_parameters(runs):
569 Search through runfolder_path for various runs and grab their parameters
574 def summarize_mapped_reads(mapped_reads):
576 Summarize per chromosome reads into a genome count
577 But handle spike-in/contamination symlinks seperately.
579 summarized_reads = {}
582 for k, v in mapped_reads.items():
583 path, k = os.path.split(k)
588 summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
589 summarized_reads[genome] = genome_reads
590 return summarized_reads
592 def summary_report(runs):
594 Summarize cluster numbers and mapped read counts for a runfolder
599 report.append('Summary for %s' % (run.name,))
601 eland_keys = run.gerald.eland_results.results.keys()
602 eland_keys.sort(alphanum)
604 lane_results = run.gerald.summary.lane_results
605 for lane_id in eland_keys:
606 result = run.gerald.eland_results.results[lane_id]
607 report.append("Sample name %s" % (result.sample_name))
608 report.append("Lane id %s" % (result.lane_id,))
609 cluster = lane_results[result.lane_id].cluster
610 report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
611 report.append("Total Reads: %d" % (result.reads))
612 mc = result._match_codes
613 report.append("No Match: %d" % (mc['NM']))
614 report.append("QC Failed: %d" % (mc['QC']))
615 report.append('Unique (0,1,2 mismatches) %d %d %d' % \
616 (mc['U0'], mc['U1'], mc['U2']))
617 report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
618 (mc['R0'], mc['R1'], mc['R2']))
619 report.append("Mapped Reads")
620 mapped_reads = summarize_mapped_reads(result.mapped_reads)
621 for name, counts in mapped_reads.items():
622 report.append(" %s: %d" % (name, counts))
625 return os.linesep.join(report)
628 usage = 'usage: %prog [options] runfolder_root_dir'
629 parser = optparse.OptionParser(usage)
630 parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
632 help='turn on verbose mode')
633 parser.add_option('-s', '--summary', dest='summary', action='store_true',
635 help='produce summary report')
636 parser.add_option('-a', '--archive', dest='archive', action='store_true',
638 help='generate run configuration archive')
641 def main(cmdlist=None):
642 parser = make_parser()
643 opt, args = parser.parse_args(cmdlist)
646 parser.error('need path to a runfolder')
648 logging.basicConfig()
650 root_log = logging.getLogger()
651 root_log.setLevel(logging.INFO)
653 for runfolder in args:
654 runs = get_runs(runfolder)
656 print summary_report(runs)
658 extract_run_parameters(runs)
662 if __name__ == "__main__":
663 sys.exit(main(sys.argv[1:]))