3 Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
7 * start/stop cycle numbers
8 * Firecrest, bustard, gerald version numbers
9 * Eland analysis types, and everything in the eland configuration file.
10 * cluster numbers and other values from the Summary.htm
11 LaneSpecificParameters table.
12 * How many reads mapped to a genome from an eland file
14 The ELAND "mapped reads" counter will also check for eland squashed file
15 that were symlinked from another directory. This is so I can track how
16 many reads landed on the genome of interest and on the spike ins.
18 Basically my subdirectories something like:
21 genomes/hg18/chr*.2bpb <- files for hg18 genome
23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
26 runfolder.py can also spit out a simple summary report (-s option)
27 that contains the per lane post filter cluster numbers and the mapped
28 read counts. (The report isn't currently very pretty)
37 from pprint import pprint
41 from xml.etree import ElementTree
42 except ImportError, e:
43 from elementtree import ElementTree
45 from gaworkflow.util.alphanum import alphanum
46 EUROPEAN_STRPTIME = "%d-%m-%Y"
47 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
48 VERSION_RE = "([0-9\.]+)"
49 USER_RE = "([a-zA-Z0-9]+)"
50 LANES_PER_FLOWCELL = 8
52 runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
54 def indent(elem, level=0):
56 reformat an element tree to be 'pretty' (indented)
60 if not elem.text or not elem.text.strip():
63 indent(child, level+1)
64 # we don't want the closing tag indented too far
66 if not elem.tail or not elem.tail.strip():
69 if level and (not elem.tail or not elem.tail.strip()):
72 def flatten(elem, include_tail=0):
74 Extract the text from an element tree
75 (AKA extract the text that not part of XML tags)
77 text = elem.text or ""
80 if include_tail and elem.tail: text += elem.tail
83 class Firecrest(object):
84 def __init__(self, pathname):
85 self.pathname = pathname
87 # parse firecrest directory name
88 path, name = os.path.split(pathname)
89 groups = name.split('_')
90 # grab the start/stop cycle information
91 cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
92 self.start = int(cycle.group(1))
93 self.stop = int(cycle.group(2))
95 version = re.search(VERSION_RE, groups[1])
96 self.version = (version.group(1))
98 self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
99 self.time = time.mktime(self.date)
101 self.user = groups[3]
103 # should I parse this deeper than just stashing the
104 # contents of the matrix file?
105 matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
106 self.matrix = open(matrix_pathname, 'r').read()
109 print "Starting cycle:", self.start
110 print "Ending cycle:", self.stop
111 print "Firecrest version:", self.version
112 print "Run date:", self.date
113 print "user:", self.user
115 def _get_elements(self):
116 firecrest = ElementTree.Element('Firecrest')
117 version = ElementTree.SubElement(firecrest, 'version')
118 version.text = self.version
119 start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
120 start_cycle.text = str(self.start)
121 stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
122 stop_cycle.text = str(self.stop)
123 run_date = ElementTree.SubElement(firecrest, 'run_time')
124 run_date.text = str(self.time)
125 matrix = ElementTree.SubElement(firecrest, 'matrix')
126 matrix.text = self.matrix
128 elements=property(_get_elements)
130 class Bustard(object):
131 def __init__(self, pathname):
132 self.pathname = pathname
134 path, name = os.path.split(pathname)
135 groups = name.split("_")
136 version = re.search(VERSION_RE, groups[0])
137 self.version = version.group(1)
138 self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
139 self.time = time.mktime(self.date)
140 self.user = groups[2]
144 def _load_params(self):
145 paramfiles = glob(os.path.join(self.pathname, "params*"))
146 for paramfile in paramfiles:
147 path, name = os.path.split(paramfile)
148 basename, ext = os.path.splitext(name)
149 # the last character of the param filename should be the
151 lane = int(basename[-1])
152 # we want the whole tree, not just the stuff under
154 param_tree = ElementTree.parse(paramfile).getroot()
155 self.phasing[lane] = param_tree
158 print "Bustard version:", self.version
159 print "Run date", self.date
160 print "user:", self.user
161 for lane, tree in self.phasing.items():
165 def _get_elements(self):
166 bustard = ElementTree.Element('Bustard')
167 version = ElementTree.SubElement(bustard, 'version')
168 version.text = self.version
169 run_date = ElementTree.SubElement(bustard, 'run_time')
170 run_date.text = str(self.time)
172 elements=property(_get_elements)
174 class GERALD(object):
176 Capture meaning out of the GERALD directory
178 class LaneParameters(object):
180 Make it easy to access elements of LaneSpecificRunParameters from python
182 def __init__(self, tree, key):
183 self._tree = tree.find('LaneSpecificRunParameters')
186 def __get_attribute(self, xml_tag):
187 container = self._tree.find(xml_tag)
188 if container is None or \
189 len(container.getchildren()) != LANES_PER_FLOWCELL:
190 raise RuntimeError('GERALD config.xml file changed')
191 element = container.find(self._key)
193 def _get_analysis(self):
194 return self.__get_attribute('ANALYSIS')
195 analysis = property(_get_analysis)
197 def _get_eland_genome(self):
198 return self.__get_attribute('ELAND_GENOME')
199 eland_genome = property(_get_eland_genome)
201 def _get_read_length(self):
202 return self.__get_attribute('READ_LENGTH')
203 read_length = property(_get_read_length)
205 def _get_use_bases(self):
206 return self.__get_attribute('USE_BASES')
207 use_bases = property(_get_use_bases)
209 class LaneSpecificRunParameters(object):
211 Provide access to LaneSpecificRunParameters
213 def __init__(self, tree):
216 def __getitem__(self, key):
217 return GERALD.LaneParameters(self._tree, key)
219 if self._keys is None:
220 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
221 self._keys = [ x.tag for x in analysis]
224 return [ self[x] for x in self.keys() ]
226 return zip(self.keys(), self.values())
228 return len(self.keys)
230 def __init__(self, pathname):
231 self.pathname = pathname
232 path, name = os.path.split(pathname)
233 config_pathname = os.path.join(self.pathname, 'config.xml')
234 self.tree = ElementTree.parse(config_pathname).getroot()
235 self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
237 date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
238 self.date = time.strptime(date)
239 self.time = time.mktime(self.date)
241 # parse Summary.htm file
242 summary_pathname = os.path.join(self.pathname, 'Summary.htm')
243 self.summary = Summary(summary_pathname)
244 self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
245 self.eland_results = ELAND(self, self.pathname)
249 Debugging function, report current object
251 print 'Gerald version:', self.version
252 print 'Gerald run date:', self.date
253 print 'Gerald config.xml:', self.tree
256 def _get_elements(self):
257 gerald = ElementTree.Element('Gerald')
258 gerald.append(self.tree)
259 gerald.append(self.summary.elements)
261 elements = property(_get_elements)
265 Convert a value to int if its an int otherwise a float.
269 except ValueError, e:
273 def parse_mean_range(value):
275 Parse values like 123 +/- 4.5
277 if value.strip() == 'unknown':
280 average, pm, deviation = value.split()
282 raise RuntimeError("Summary.htm file format changed")
283 return tonumber(average), tonumber(deviation)
285 def mean_range_element(parent, name, mean, deviation):
287 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
289 element = ElementTree.SubElement(parent, name,
291 'deviation': str(deviation)})
294 class LaneResultSummary(object):
296 Parse the LaneResultSummary table out of Summary.htm
297 Mostly for the cluster number
299 def __init__(self, row_element):
300 data = [ flatten(x) for x in row_element ]
302 raise RuntimeError("Summary.htm file format changed")
305 self.cluster = parse_mean_range(data[1])
306 self.average_first_cycle_intensity = parse_mean_range(data[2])
307 self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
308 self.percent_pass_filter_clusters = parse_mean_range(data[4])
309 self.percent_pass_filter_align = parse_mean_range(data[5])
310 self.average_alignment_score = parse_mean_range(data[6])
311 self.percent_error_rate = parse_mean_range(data[7])
313 def _get_elements(self):
314 lane_result = ElementTree.Element('LaneResultSummary',
316 cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
317 first_cycle = mean_range_element(lane_result,
318 'AverageFirstCycleIntensity',
319 *self.average_first_cycle_intensity)
320 after_20 = mean_range_element(lane_result,
321 'PercentIntensityAfter20Cycles',
322 *self.percent_intensity_after_20_cycles)
323 pass_filter = mean_range_element(lane_result,
324 'PercentPassFilterClusters',
325 *self.percent_pass_filter_clusters)
326 alignment = mean_range_element(lane_result,
327 'AverageAlignmentScore',
328 *self.average_alignment_score)
329 error_rate = mean_range_element(lane_result,
331 *self.percent_error_rate)
333 elements = property(_get_elements)
335 class Summary(object):
337 Extract some useful information from the Summary.htm file
339 def __init__(self, pathname):
340 self.pathname = pathname
341 self.tree = ElementTree.parse(pathname).getroot()
342 self.lane_results = []
344 self._extract_lane_results()
346 def _extract_lane_results(self):
348 extract the Lane Results Summary table
350 if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
351 raise RuntimeError("Summary.htm file format changed")
353 table = self.tree.findall('*//table')[2]
354 rows = table.getchildren()
355 headers = rows[0].getchildren()
356 if flatten(headers[2]) != 'Av 1st Cycle Int ':
357 raise RuntimeError("Summary.htm file format changed")
360 self.lane_results.append(LaneResultSummary(r))
362 def _get_elements(self):
363 summary = ElementTree.Element('Summary')
364 for lane in self.lane_results:
365 summary.append(lane.elements)
367 elements = property(_get_elements)
371 Debugging function, report current object
378 Summarize information from eland files
380 class ElandResult(object):
382 Process an eland result file
384 def __init__(self, gerald, pathname):
386 self.pathname = pathname
387 # extract the sample name
388 path, name = os.path.split(self.pathname)
389 self.sample_name = name.replace("_eland_result.txt","")
391 self._mapped_reads = None
394 def _build_fasta_map(self, genome_dir):
395 # build fasta to fasta file map
396 genome = genome_dir.split(os.path.sep)[-1]
398 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
400 if os.path.islink(vld_file):
402 vld_file = os.path.realpath(vld_file)
403 path, vld_name = os.path.split(vld_file)
404 name, ext = os.path.splitext(vld_name)
406 fasta_map[name] = name
408 fasta_map[name] = os.path.join(genome, name)
409 self._fasta_map = fasta_map
413 Actually read the file and actually count the reads
415 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
416 raise RuntimeError("Eland isn't done, try again later.")
420 genome_dir = self.gerald.lanes[self.sample_name].eland_genome
421 self._build_fasta_map(genome_dir)
422 match_codes = {'NM':0, 'QC':0, 'RM':0,
423 'U0':0, 'U1':0, 'U2':0,
424 'R0':0, 'R1':0, 'R2':0,
426 for line in open(self.pathname):
428 fields = line.split()
430 # match_codes[code] = match_codes.setdefault(code, 0) + 1
431 # the QC/NM etc codes are in the 3rd field and always present
432 match_codes[fields[2]] += 1
433 # ignore lines that don't have a fasta filename
436 fasta = self._fasta_map.get(fields[6], fields[6])
437 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
438 self._match_codes = match_codes
439 self._mapped_reads = mapped_reads
442 def _get_reads(self):
443 if self._reads is None:
446 reads = property(_get_reads)
448 def _get_mapped_reads(self):
449 if self._mapped_reads is None:
451 return self._mapped_reads
452 mapped_reads = property(_get_mapped_reads)
454 def __init__(self, gerald, basedir):
455 # we need information from the gerald config.xml
458 for f in glob(os.path.join(basedir, "*_eland_result.txt")):
459 eland_result = ELAND.ElandResult(gerald, f)
460 self.results[eland_result.sample_name] = eland_result
462 class PipelineRun(object):
464 Capture "interesting" information about a pipeline run
466 def __init__(self, pathname, firecrest, bustard, gerald):
467 self.pathname = pathname
469 self._flowcell_id = None
470 self.firecrest = firecrest
471 self.bustard = bustard
474 def _get_flowcell_id(self):
475 # extract flowcell ID
476 if self._flowcell_id is None:
477 config_dir = os.path.join(self.pathname, 'Config')
478 flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
479 if os.path.exists(flowcell_id_path):
480 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
481 self._flowcell_id = flowcell_id_tree.findtext('Text')
484 "Unable to determine flowcell id as %s was not found" % (
486 self._flowcell_id = "unknown"
487 return self._flowcell_id
488 flowcell_id = property(_get_flowcell_id)
492 make one master xml file from all of our sub-components.
494 root = ElementTree.Element('PipelineRun')
495 flowcell = ElementTree.SubElement(root, 'FlowcellID')
496 flowcell.text = self.flowcell_id
497 root.append(self.firecrest.elements)
498 root.append(self.bustard.elements)
499 root.append(self.gerald.elements)
502 def _get_pretty_xml(self):
504 Generate indented xml file
506 root = self._get_xml()
509 xml = property(_get_pretty_xml)
511 def _get_run_name(self):
513 Given a run tuple, find the latest date and use that as our name
515 if self._name is None:
516 tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
517 timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
518 self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
520 name = property(_get_run_name)
523 logging.info("Saving run report "+ self.name)
524 ElementTree.ElementTree(self.xml).write(self.name)
526 def get_runs(runfolder):
528 Search through a run folder for all the various sub component runs
529 and then return a PipelineRun for each different combination.
531 For example if there are two different GERALD runs, this will
532 generate two different PipelineRun objects, that differ
533 in there gerald component.
535 datadir = os.path.join(runfolder, 'Data')
537 logging.info('Searching for runs in ' + datadir)
539 for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
540 f = Firecrest(firecrest_pathname)
541 bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
542 for bustard_pathname in glob(bustard_glob):
543 b = Bustard(bustard_pathname)
544 gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
545 for gerald_pathname in glob(gerald_glob):
547 g = GERALD(gerald_pathname)
548 runs.append(PipelineRun(runfolder, f, b, g))
550 print "Ignoring", str(e)
554 def extract_run_parameters(runs):
556 Search through runfolder_path for various runs and grab their parameters
561 def summarize_mapped_reads(mapped_reads):
563 Summarize per chromosome reads into a genome count
564 But handle spike-in/contamination symlinks seperately.
566 summarized_reads = {}
569 for k, v in mapped_reads.items():
570 path, k = os.path.split(k)
575 summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
576 summarized_reads[genome] = genome_reads
577 return summarized_reads
579 def summary_report(runs):
581 Summarize cluster numbers and mapped read counts for a runfolder
585 print 'Summary for', run.name
586 for lane in run.gerald.summary.lane_results:
587 print 'lane', lane.lane, 'clusters', lane.cluster[0], '+/-',
588 print lane.cluster[1]
591 sample_keys = run.gerald.eland_results.results.keys()
592 sample_keys.sort(alphanum)
593 for sample in sample_keys:
595 result = run.gerald.eland_results.results[sample]
596 print "Sample name", sample
597 print "Total Reads", result.reads
598 mc = result._match_codes
599 print "No Match", mc['NM']
600 print "QC Failed", mc['QC']
601 print 'Unique (0,1,2 mismatches)', mc['U0'], mc['U1'], mc['U2']
602 print 'Repeat (0,1,2 mismatches)', mc['R0'], mc['R1'], mc['R2']
604 pprint(summarize_mapped_reads(result.mapped_reads))
607 usage = 'usage: %prog [options] runfolder_root_dir'
608 parser = optparse.OptionParser(usage)
609 parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
611 help='turn on verbose mode')
612 parser.add_option('-s', '--summary', dest='summary', action='store_true',
614 help='produce summary report')
615 parser.add_option('-a', '--archive', dest='archive', action='store_true',
617 help='generate run configuration archive')
620 def main(cmdlist=None):
621 parser = make_parser()
622 opt, args = parser.parse_args(cmdlist)
625 parser.error('need path to a runfolder')
627 logging.basicConfig()
629 root_log = logging.getLogger()
630 root_log.setLevel(logging.INFO)
632 for runfolder in args:
633 runs = get_runs(runfolder)
637 extract_run_parameters(runs)
641 if __name__ == "__main__":
642 sys.exit(main(sys.argv[1:]))