e19790ed8e5a54aacc76ad24b71330073912763b
[htsworkflow.git] / scripts / runfolder.py
1 #!/usr/bin/env python
2 """
3 Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
4
5   * Flowcell ID
6   * run dates
7   * start/stop cycle numbers
8   * Firecrest, bustard, gerald version numbers
9   * Eland analysis types, and everything in the eland configuration file.
10   * cluster numbers and other values from the Summary.htm 
11     LaneSpecificParameters table. 
12   * How many reads mapped to a genome from an eland file
13
14 The ELAND "mapped reads" counter will also check for eland squashed file
15 that were symlinked from another directory. This is so I can track how 
16 many reads landed on the genome of interest and on the spike ins. 
17
18 Basically my subdirectories something like:
19
20 genomes/hg18
21 genomes/hg18/chr*.2bpb <- files for hg18 genome
22 genomes/hg18/chr*.vld  
23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
24 genomes/spikein 
25
26 runfolder.py can also spit out a simple summary report (-s option) 
27 that contains the per lane post filter cluster numbers and the mapped 
28 read counts. (The report isn't currently very pretty)
29 """
30 import time
31 import logging
32 import os
33 import re
34 import stat
35 import sys
36 from glob import glob
37 from pprint import pprint
38 import optparse
39
40 try:
41   from xml.etree import ElementTree
42 except ImportError, e:
43   from elementtree import ElementTree
44
45 from gaworkflow.util.alphanum import alphanum
46 EUROPEAN_STRPTIME = "%d-%m-%Y"
47 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
48 VERSION_RE = "([0-9\.]+)"
49 USER_RE = "([a-zA-Z0-9]+)"
50 LANES_PER_FLOWCELL = 8
51
52 runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
53
54 def indent(elem, level=0):
55     """
56     reformat an element tree to be 'pretty' (indented)
57     """
58     i = "\n" + level*"  "
59     if len(elem):
60         if not elem.text or not elem.text.strip():
61             elem.text = i + "  "
62         for child in elem:
63             indent(child, level+1)
64         # we don't want the closing tag indented too far
65         child.tail = i
66         if not elem.tail or not elem.tail.strip():
67             elem.tail = i
68     else:
69         if level and (not elem.tail or not elem.tail.strip()):
70             elem.tail = i
71
72 def flatten(elem, include_tail=0):
73     """
74     Extract the text from an element tree 
75     (AKA extract the text that not part of XML tags)
76     """
77     text = elem.text or ""
78     for e in elem:
79         text += flatten(e, 1)
80     if include_tail and elem.tail: text += elem.tail
81     return text
82
83 class Firecrest(object):
84     def __init__(self, pathname):
85         self.pathname = pathname
86
87         # parse firecrest directory name
88         path, name = os.path.split(pathname)
89         groups = name.split('_')
90         # grab the start/stop cycle information
91         cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
92         self.start = int(cycle.group(1))
93         self.stop = int(cycle.group(2))
94         # firecrest version
95         version = re.search(VERSION_RE, groups[1])
96         self.version = (version.group(1))
97         # datetime
98         self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
99         self.time = time.mktime(self.date)
100         # username
101         self.user = groups[3]
102
103         # should I parse this deeper than just stashing the 
104         # contents of the matrix file?
105         matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
106         self.matrix = open(matrix_pathname, 'r').read()
107         
108     def dump(self):
109         print "Starting cycle:", self.start
110         print "Ending cycle:", self.stop
111         print "Firecrest version:", self.version
112         print "Run date:", self.date
113         print "user:", self.user
114
115     def _get_elements(self):
116         firecrest = ElementTree.Element('Firecrest')
117         version = ElementTree.SubElement(firecrest, 'version')
118         version.text = self.version
119         start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
120         start_cycle.text = str(self.start)
121         stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
122         stop_cycle.text = str(self.stop)
123         run_date = ElementTree.SubElement(firecrest, 'run_time')
124         run_date.text = str(self.time)
125         matrix = ElementTree.SubElement(firecrest, 'matrix')
126         matrix.text = self.matrix
127         return firecrest
128     elements=property(_get_elements)
129
130 class Bustard(object):
131     def __init__(self, pathname):
132         self.pathname = pathname
133
134         path, name = os.path.split(pathname)
135         groups = name.split("_")
136         version = re.search(VERSION_RE, groups[0])
137         self.version = version.group(1)
138         self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
139         self.time = time.mktime(self.date)
140         self.user = groups[2]
141         self.phasing = {}
142         self._load_params()
143
144     def _load_params(self):
145         paramfiles = glob(os.path.join(self.pathname, "params*"))
146         for paramfile in paramfiles:
147             path, name = os.path.split(paramfile)
148             basename, ext = os.path.splitext(name)
149             # the last character of the param filename should be the
150             # lane number
151             lane = int(basename[-1])
152             # we want the whole tree, not just the stuff under
153             # the first tag
154             param_tree = ElementTree.parse(paramfile).getroot()
155             self.phasing[lane] = param_tree
156
157     def dump(self):
158         print "Bustard version:", self.version
159         print "Run date", self.date
160         print "user:", self.user
161         for lane, tree in self.phasing.items():
162             print lane
163             print tree
164
165     def _get_elements(self):
166         bustard = ElementTree.Element('Bustard')
167         version = ElementTree.SubElement(bustard, 'version')
168         version.text = self.version
169         run_date = ElementTree.SubElement(bustard, 'run_time')
170         run_date.text = str(self.time)
171         return bustard
172     elements=property(_get_elements)
173         
174 class GERALD(object):
175     """
176     Capture meaning out of the GERALD directory
177     """
178     class LaneParameters(object):
179         """
180         Make it easy to access elements of LaneSpecificRunParameters from python
181         """
182         def __init__(self, tree, key):
183             self._tree = tree.find('LaneSpecificRunParameters')
184             self._key = key
185         
186         def __get_attribute(self, xml_tag):
187             container = self._tree.find(xml_tag)
188             if container is None or \
189                len(container.getchildren()) != LANES_PER_FLOWCELL:
190                 raise RuntimeError('GERALD config.xml file changed')
191             element = container.find(self._key)
192             return element.text
193         def _get_analysis(self):
194             return self.__get_attribute('ANALYSIS')
195         analysis = property(_get_analysis)
196
197         def _get_eland_genome(self):
198             return self.__get_attribute('ELAND_GENOME')
199         eland_genome = property(_get_eland_genome)
200
201         def _get_read_length(self):
202             return self.__get_attribute('READ_LENGTH')
203         read_length = property(_get_read_length)
204
205         def _get_use_bases(self):
206             return self.__get_attribute('USE_BASES')
207         use_bases = property(_get_use_bases)
208
209     class LaneSpecificRunParameters(object):
210         """
211         Provide access to LaneSpecificRunParameters
212         """
213         def __init__(self, tree):
214             self._tree = tree
215             self._keys = None
216         def __getitem__(self, key):
217             return GERALD.LaneParameters(self._tree, key)
218         def keys(self):
219             if self._keys is None:
220                 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
221                 self._keys = [ x.tag for x in analysis]
222             return self._keys
223         def values(self):
224             return [ self[x] for x in self.keys() ]
225         def items(self):
226             return zip(self.keys(), self.values())
227         def __len__(self):
228             return len(self.keys)
229
230     def __init__(self, pathname):
231         self.pathname = pathname
232         path, name = os.path.split(pathname)
233         config_pathname = os.path.join(self.pathname, 'config.xml')
234         self.tree = ElementTree.parse(config_pathname).getroot()
235         self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
236
237         date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
238         self.date = time.strptime(date)
239         self.time = time.mktime(self.date)
240         
241         # parse Summary.htm file
242         summary_pathname = os.path.join(self.pathname, 'Summary.htm')
243         self.summary = Summary(summary_pathname)
244         self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
245         self.eland_results = ELAND(self, self.pathname)
246
247     def dump(self):
248         """
249         Debugging function, report current object
250         """
251         print 'Gerald version:', self.version
252         print 'Gerald run date:', self.date
253         print 'Gerald config.xml:', self.tree
254         self.summary.dump()
255
256     def _get_elements(self):
257         gerald = ElementTree.Element('Gerald')
258         gerald.append(self.tree)
259         gerald.append(self.summary.elements)
260         return gerald
261     elements = property(_get_elements)
262
263 def tonumber(v):
264     """
265     Convert a value to int if its an int otherwise a float.
266     """
267     try:
268         v = int(v)
269     except ValueError, e:
270         v = float(v)
271     return v
272
273 def parse_mean_range(value):
274     """
275     Parse values like 123 +/- 4.5
276     """
277     if value.strip() == 'unknown':
278         return 0, 0
279
280     average, pm, deviation = value.split()
281     if pm != '+/-':
282         raise RuntimeError("Summary.htm file format changed")
283     return tonumber(average), tonumber(deviation)
284
285 def mean_range_element(parent, name, mean, deviation):
286     """
287     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
288     """
289     element = ElementTree.SubElement(parent, name,
290                                      { 'mean': str(mean),
291                                        'deviation': str(deviation)})
292     return element
293
294 class LaneResultSummary(object):
295     """
296     Parse the LaneResultSummary table out of Summary.htm
297     Mostly for the cluster number
298     """
299     def __init__(self, row_element):
300         data = [ flatten(x) for x in row_element ]
301         if len(data) != 8:
302             raise RuntimeError("Summary.htm file format changed")
303
304         self.lane = data[0]
305         self.cluster = parse_mean_range(data[1])
306         self.average_first_cycle_intensity = parse_mean_range(data[2])
307         self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
308         self.percent_pass_filter_clusters = parse_mean_range(data[4])
309         self.percent_pass_filter_align = parse_mean_range(data[5])
310         self.average_alignment_score = parse_mean_range(data[6])
311         self.percent_error_rate = parse_mean_range(data[7])
312
313     def _get_elements(self):
314         lane_result = ElementTree.Element('LaneResultSummary', 
315                                           {'lane': self.lane})
316         cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
317         first_cycle = mean_range_element(lane_result, 
318                                          'AverageFirstCycleIntensity',
319                                          *self.average_first_cycle_intensity)
320         after_20 = mean_range_element(lane_result,
321                                       'PercentIntensityAfter20Cycles',
322                                       *self.percent_intensity_after_20_cycles)
323         pass_filter = mean_range_element(lane_result,
324                                          'PercentPassFilterClusters',
325                                          *self.percent_pass_filter_clusters)
326         alignment = mean_range_element(lane_result,
327                                        'AverageAlignmentScore',
328                                        *self.average_alignment_score)
329         error_rate = mean_range_element(lane_result,
330                                         'PercentErrorRate',
331                                         *self.percent_error_rate)
332         return lane_result
333     elements = property(_get_elements)
334
335 class Summary(object):
336     """
337     Extract some useful information from the Summary.htm file
338     """
339     def __init__(self, pathname):
340         self.pathname = pathname
341         self.tree = ElementTree.parse(pathname).getroot()
342         self.lane_results = []
343
344         self._extract_lane_results()
345
346     def _extract_lane_results(self):
347         """
348         extract the Lane Results Summary table
349         """
350         if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
351             raise RuntimeError("Summary.htm file format changed")
352
353         table = self.tree.findall('*//table')[2]
354         rows = table.getchildren()
355         headers = rows[0].getchildren()
356         if flatten(headers[2]) != 'Av 1st Cycle Int ':
357             raise RuntimeError("Summary.htm file format changed")
358
359         for r in rows[1:]:
360             self.lane_results.append(LaneResultSummary(r))
361
362     def _get_elements(self):
363         summary = ElementTree.Element('Summary')
364         for lane in self.lane_results:
365             summary.append(lane.elements)
366         return summary
367     elements = property(_get_elements)
368
369     def dump(self):
370         """
371         Debugging function, report current object
372         """
373         pass
374
375     
376 class ELAND(object):
377     """
378     Summarize information from eland files
379     """
380     class ElandResult(object):
381         """
382         Process an eland result file
383         """
384         def __init__(self, gerald, pathname):
385             self.gerald = gerald
386             self.pathname = pathname
387             # extract the sample name
388             path, name = os.path.split(self.pathname)
389             self.sample_name = name.replace("_eland_result.txt","")
390             self._reads = None
391             self._mapped_reads = None
392             self._fasta_map = {}
393
394         def _build_fasta_map(self, genome_dir):
395             # build fasta to fasta file map
396             genome = genome_dir.split(os.path.sep)[-1]
397             fasta_map = {}
398             for vld_file in glob(os.path.join(genome_dir, '*.vld')):
399                 is_link = False
400                 if os.path.islink(vld_file):
401                     is_link = True
402                 vld_file = os.path.realpath(vld_file)
403                 path, vld_name = os.path.split(vld_file)
404                 name, ext = os.path.splitext(vld_name)
405                 if is_link:
406                     fasta_map[name] = name
407                 else:
408                     fasta_map[name] = os.path.join(genome, name)
409             self._fasta_map = fasta_map
410
411         def _update(self):
412             """
413             Actually read the file and actually count the reads
414             """
415             if os.stat(self.pathname)[stat.ST_SIZE] == 0:
416                 raise RuntimeError("Eland isn't done, try again later.")
417             
418             reads = 0
419             mapped_reads = {}
420             genome_dir = self.gerald.lanes[self.sample_name].eland_genome
421             self._build_fasta_map(genome_dir)
422
423             for line in open(self.pathname):
424                 reads += 1
425                 fields = line.split()
426                 # ignore lines that don't have a fasta filename
427                 if len(fields) < 7:
428                     continue
429                 fasta = self._fasta_map.get(fields[6], fields[6])
430                 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
431             self._mapped_reads = mapped_reads
432             self._reads = reads
433         
434         def _get_reads(self):
435             if self._reads is None:
436                 self._update()
437             return self._reads
438         reads = property(_get_reads)
439
440         def _get_mapped_reads(self):
441             if self._mapped_reads is None:
442                 self._update()
443             return self._mapped_reads
444         mapped_reads = property(_get_mapped_reads)
445             
446     def __init__(self, gerald, basedir):
447         # we need information from the gerald config.xml
448         self.gerald = gerald
449         self.results = {}
450         for f in glob(os.path.join(basedir, "*_eland_result.txt")):
451             eland_result = ELAND.ElandResult(gerald, f)
452             self.results[eland_result.sample_name] = eland_result
453
454 class PipelineRun(object):
455     """
456     Capture "interesting" information about a pipeline run
457     """
458     def __init__(self, pathname, firecrest, bustard, gerald):
459         self.pathname = pathname
460         self._name = None
461         self._flowcell_id = None
462         self.firecrest = firecrest
463         self.bustard = bustard
464         self.gerald = gerald
465     
466     def _get_flowcell_id(self):
467         # extract flowcell ID
468         if self._flowcell_id is None:
469           config_dir = os.path.join(self.pathname, 'Config')
470           flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
471           flowcell_id_tree = ElementTree.parse(flowcell_id_path)
472           self._flowcell_id = flowcell_id_tree.findtext('Text')
473         return self._flowcell_id
474     flowcell_id = property(_get_flowcell_id)
475
476     def _get_xml(self):
477         """
478         make one master xml file from all of our sub-components.
479         """
480         root = ElementTree.Element('PipelineRun')
481         flowcell = ElementTree.SubElement(root, 'FlowcellID')
482         flowcell.text = self.flowcell_id
483         root.append(self.firecrest.elements)
484         root.append(self.bustard.elements)
485         root.append(self.gerald.elements)
486         return root
487
488     def _get_pretty_xml(self):
489         """
490         Generate indented xml file
491         """
492         root = self._get_xml()
493         indent(root)
494         return root
495     xml = property(_get_pretty_xml)
496
497     def _get_run_name(self):
498         """
499         Given a run tuple, find the latest date and use that as our name
500         """
501         if self._name is None:
502           tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
503           timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
504           self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
505         return self._name
506     name = property(_get_run_name)
507
508     def save(self):
509         logging.info("Saving run report "+ self.name)
510         ElementTree.ElementTree(self.xml).write(self.name)
511
512 def get_runs(runfolder):
513     """
514     Search through a run folder for all the various sub component runs
515     and then return a PipelineRun for each different combination.
516
517     For example if there are two different GERALD runs, this will
518     generate two different PipelineRun objects, that differ
519     in there gerald component.
520     """
521     datadir = os.path.join(runfolder, 'Data')
522
523     logging.info('Searching for runs in ' + datadir)
524     runs = []
525     for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
526         f = Firecrest(firecrest_pathname)
527         bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
528         for bustard_pathname in glob(bustard_glob):
529             b = Bustard(bustard_pathname)
530             gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
531             for gerald_pathname in glob(gerald_glob):
532                 g = GERALD(gerald_pathname)
533                 runs.append(PipelineRun(runfolder, f, b, g))
534     return runs
535                 
536     
537 def extract_run_parameters(runs):
538     """
539     Search through runfolder_path for various runs and grab their parameters
540     """
541     for run in runs:
542       run.save()
543
544 def summary(runs):
545     def summarize_mapped_reads(mapped_reads):
546         summarized_reads = {}
547         genome_reads = 0
548         genome = 'unknown'
549         for k, v in mapped_reads.items():
550             path, k = os.path.split(k)
551             if len(path) > 0:
552                 genome = path
553                 genome_reads += v
554             else:
555                 summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
556         summarized_reads[genome] = genome_reads
557         return summarized_reads
558
559     for run in runs:
560         # print a run name?
561         logging.info('Summarizing ' + run.name)
562         for lane in run.gerald.summary.lane_results:
563             print 'lane', lane.lane, 'clusters', lane.cluster[0], '+/-',
564             print lane.cluster[1]
565         print ""
566         # sort the report
567         sample_keys = run.gerald.eland_results.results.keys()
568         sample_keys.sort(alphanum)
569         for sample in sample_keys:
570             print '---'
571             result = run.gerald.eland_results.results[sample]
572             print "Sample name", sample
573             print "Total Reads", result.reads
574             print "Mapped Reads"
575             pprint(summarize_mapped_reads(result.mapped_reads))
576
577 def make_parser():
578     usage = 'usage: %prog [options] runfolder_root_dir'
579     parser = optparse.OptionParser(usage)
580     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
581                       default=False,
582                       help='turn on verbose mode')
583     parser.add_option('-s', '--summary', dest='summary', action='store_true',
584                       default=False,
585                       help='produce summary report')
586     parser.add_option('-a', '--archive', dest='archive', action='store_true',
587                       default=False,
588                       help='generate run configuration archive')
589     return parser
590
591 def main(cmdlist=None):
592     parser = make_parser()
593     opt, args = parser.parse_args(cmdlist)
594
595     if len(args) == 0:
596         parser.error('need path to a runfolder')
597     
598     logging.basicConfig()
599     if opt.verbose:
600         root_log = logging.getLogger()
601         root_log.setLevel(logging.INFO)
602
603     for runfolder in args:
604         runs = get_runs(runfolder)
605         if opt.summary:
606             summary(runs)
607         if opt.archive:
608             extract_run_parameters(runs)
609
610     return 0
611
612 if __name__ == "__main__":
613   sys.exit(main(sys.argv[1:]))