ae04db2049d8e80ab85e8c4b1c6adeb0a950c8e9
[htsworkflow.git] / scripts / runfolder.py
1 #!/usr/bin/env python
2 """
3 Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
4
5   * Flowcell ID
6   * run dates
7   * start/stop cycle numbers
8   * Firecrest, bustard, gerald version numbers
9   * Eland analysis types, and everything in the eland configuration file.
10   * cluster numbers and other values from the Summary.htm 
11     LaneSpecificParameters table. 
12   * How many reads mapped to a genome from an eland file
13
14 The ELAND "mapped reads" counter will also check for eland squashed file
15 that were symlinked from another directory. This is so I can track how 
16 many reads landed on the genome of interest and on the spike ins. 
17
18 Basically my subdirectories something like:
19
20 genomes/hg18
21 genomes/hg18/chr*.2bpb <- files for hg18 genome
22 genomes/hg18/chr*.vld  
23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
24 genomes/spikein 
25
26 runfolder.py can also spit out a simple summary report (-s option) 
27 that contains the per lane post filter cluster numbers and the mapped 
28 read counts. (The report isn't currently very pretty)
29 """
30 import time
31 import logging
32 import os
33 import re
34 import stat
35 import sys
36 from glob import glob
37 from pprint import pprint
38 import optparse
39
40 try:
41   from xml.etree import ElementTree
42 except ImportError, e:
43   from elementtree import ElementTree
44
45 from gaworkflow.util.alphanum import alphanum
46 EUROPEAN_STRPTIME = "%d-%m-%Y"
47 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
48 VERSION_RE = "([0-9\.]+)"
49 USER_RE = "([a-zA-Z0-9]+)"
50 LANES_PER_FLOWCELL = 8
51
52 runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
53
54 def indent(elem, level=0):
55     """
56     reformat an element tree to be 'pretty' (indented)
57     """
58     i = "\n" + level*"  "
59     if len(elem):
60         if not elem.text or not elem.text.strip():
61             elem.text = i + "  "
62         for child in elem:
63             indent(child, level+1)
64         # we don't want the closing tag indented too far
65         child.tail = i
66         if not elem.tail or not elem.tail.strip():
67             elem.tail = i
68     else:
69         if level and (not elem.tail or not elem.tail.strip()):
70             elem.tail = i
71
72 def flatten(elem, include_tail=0):
73     """
74     Extract the text from an element tree 
75     (AKA extract the text that not part of XML tags)
76     """
77     text = elem.text or ""
78     for e in elem:
79         text += flatten(e, 1)
80     if include_tail and elem.tail: text += elem.tail
81     return text
82
83 class Firecrest(object):
84     def __init__(self, pathname):
85         self.pathname = pathname
86
87         # parse firecrest directory name
88         path, name = os.path.split(pathname)
89         groups = name.split('_')
90         # grab the start/stop cycle information
91         cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
92         self.start = int(cycle.group(1))
93         self.stop = int(cycle.group(2))
94         # firecrest version
95         version = re.search(VERSION_RE, groups[1])
96         self.version = (version.group(1))
97         # datetime
98         self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
99         self.time = time.mktime(self.date)
100         # username
101         self.user = groups[3]
102
103         # should I parse this deeper than just stashing the 
104         # contents of the matrix file?
105         matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
106         self.matrix = open(matrix_pathname, 'r').read()
107         
108     def dump(self):
109         print "Starting cycle:", self.start
110         print "Ending cycle:", self.stop
111         print "Firecrest version:", self.version
112         print "Run date:", self.date
113         print "user:", self.user
114
115     def _get_elements(self):
116         firecrest = ElementTree.Element('Firecrest')
117         version = ElementTree.SubElement(firecrest, 'version')
118         version.text = self.version
119         start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
120         start_cycle.text = str(self.start)
121         stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
122         stop_cycle.text = str(self.stop)
123         run_date = ElementTree.SubElement(firecrest, 'run_time')
124         run_date.text = str(self.time)
125         matrix = ElementTree.SubElement(firecrest, 'matrix')
126         matrix.text = self.matrix
127         return firecrest
128     elements=property(_get_elements)
129
130 class Bustard(object):
131     def __init__(self, pathname):
132         self.pathname = pathname
133
134         path, name = os.path.split(pathname)
135         groups = name.split("_")
136         version = re.search(VERSION_RE, groups[0])
137         self.version = version.group(1)
138         self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
139         self.time = time.mktime(self.date)
140         self.user = groups[2]
141         self.phasing = {}
142         self._load_params()
143
144     def _load_params(self):
145         paramfiles = glob(os.path.join(self.pathname, "params*"))
146         for paramfile in paramfiles:
147             path, name = os.path.split(paramfile)
148             basename, ext = os.path.splitext(name)
149             # the last character of the param filename should be the
150             # lane number
151             lane = int(basename[-1])
152             # we want the whole tree, not just the stuff under
153             # the first tag
154             param_tree = ElementTree.parse(paramfile).getroot()
155             self.phasing[lane] = param_tree
156
157     def dump(self):
158         print "Bustard version:", self.version
159         print "Run date", self.date
160         print "user:", self.user
161         for lane, tree in self.phasing.items():
162             print lane
163             print tree
164
165     def _get_elements(self):
166         bustard = ElementTree.Element('Bustard')
167         version = ElementTree.SubElement(bustard, 'version')
168         version.text = self.version
169         run_date = ElementTree.SubElement(bustard, 'run_time')
170         run_date.text = str(self.time)
171         return bustard
172     elements=property(_get_elements)
173         
174 class GERALD(object):
175     """
176     Capture meaning out of the GERALD directory
177     """
178     class LaneParameters(object):
179         """
180         Make it easy to access elements of LaneSpecificRunParameters from python
181         """
182         def __init__(self, tree, key):
183             self._tree = tree.find('LaneSpecificRunParameters')
184             self._key = key
185         
186         def __get_attribute(self, xml_tag):
187             container = self._tree.find(xml_tag)
188             if container is None or \
189                len(container.getchildren()) != LANES_PER_FLOWCELL:
190                 raise RuntimeError('GERALD config.xml file changed')
191             element = container.find(self._key)
192             return element.text
193         def _get_analysis(self):
194             return self.__get_attribute('ANALYSIS')
195         analysis = property(_get_analysis)
196
197         def _get_eland_genome(self):
198             return self.__get_attribute('ELAND_GENOME')
199         eland_genome = property(_get_eland_genome)
200
201         def _get_read_length(self):
202             return self.__get_attribute('READ_LENGTH')
203         read_length = property(_get_read_length)
204
205         def _get_use_bases(self):
206             return self.__get_attribute('USE_BASES')
207         use_bases = property(_get_use_bases)
208
209     class LaneSpecificRunParameters(object):
210         """
211         Provide access to LaneSpecificRunParameters
212         """
213         def __init__(self, tree):
214             self._tree = tree
215             self._keys = None
216         def __getitem__(self, key):
217             return GERALD.LaneParameters(self._tree, key)
218         def keys(self):
219             if self._keys is None:
220                 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
221                 self._keys = [ x.tag for x in analysis]
222             return self._keys
223         def values(self):
224             return [ self[x] for x in self.keys() ]
225         def items(self):
226             return zip(self.keys(), self.values())
227         def __len__(self):
228             return len(self.keys)
229
230     def __init__(self, pathname):
231         self.pathname = pathname
232         path, name = os.path.split(pathname)
233         config_pathname = os.path.join(self.pathname, 'config.xml')
234         self.tree = ElementTree.parse(config_pathname).getroot()
235         self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
236
237         date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
238         self.date = time.strptime(date)
239         self.time = time.mktime(self.date)
240         
241         # parse Summary.htm file
242         summary_pathname = os.path.join(self.pathname, 'Summary.htm')
243         self.summary = Summary(summary_pathname)
244         self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
245         self.eland_results = ELAND(self, self.pathname)
246
247     def dump(self):
248         """
249         Debugging function, report current object
250         """
251         print 'Gerald version:', self.version
252         print 'Gerald run date:', self.date
253         print 'Gerald config.xml:', self.tree
254         self.summary.dump()
255
256     def _get_elements(self):
257         gerald = ElementTree.Element('Gerald')
258         gerald.append(self.tree)
259         gerald.append(self.summary.elements)
260         return gerald
261     elements = property(_get_elements)
262
263 def tonumber(v):
264     """
265     Convert a value to int if its an int otherwise a float.
266     """
267     try:
268         v = int(v)
269     except ValueError, e:
270         v = float(v)
271     return v
272
273 def parse_mean_range(value):
274     """
275     Parse values like 123 +/- 4.5
276     """
277     if value.strip() == 'unknown':
278         return 0, 0
279
280     average, pm, deviation = value.split()
281     if pm != '+/-':
282         raise RuntimeError("Summary.htm file format changed")
283     return tonumber(average), tonumber(deviation)
284
285 def mean_range_element(parent, name, mean, deviation):
286     """
287     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
288     """
289     element = ElementTree.SubElement(parent, name,
290                                      { 'mean': str(mean),
291                                        'deviation': str(deviation)})
292     return element
293
294 class LaneResultSummary(object):
295     """
296     Parse the LaneResultSummary table out of Summary.htm
297     Mostly for the cluster number
298     """
299     def __init__(self, row_element):
300         data = [ flatten(x) for x in row_element ]
301         if len(data) != 8:
302             raise RuntimeError("Summary.htm file format changed")
303
304         self.lane = data[0]
305         self.cluster = parse_mean_range(data[1])
306         self.average_first_cycle_intensity = parse_mean_range(data[2])
307         self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
308         self.percent_pass_filter_clusters = parse_mean_range(data[4])
309         self.percent_pass_filter_align = parse_mean_range(data[5])
310         self.average_alignment_score = parse_mean_range(data[6])
311         self.percent_error_rate = parse_mean_range(data[7])
312
313     def _get_elements(self):
314         lane_result = ElementTree.Element('LaneResultSummary', 
315                                           {'lane': self.lane})
316         cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
317         first_cycle = mean_range_element(lane_result, 
318                                          'AverageFirstCycleIntensity',
319                                          *self.average_first_cycle_intensity)
320         after_20 = mean_range_element(lane_result,
321                                       'PercentIntensityAfter20Cycles',
322                                       *self.percent_intensity_after_20_cycles)
323         pass_filter = mean_range_element(lane_result,
324                                          'PercentPassFilterClusters',
325                                          *self.percent_pass_filter_clusters)
326         alignment = mean_range_element(lane_result,
327                                        'AverageAlignmentScore',
328                                        *self.average_alignment_score)
329         error_rate = mean_range_element(lane_result,
330                                         'PercentErrorRate',
331                                         *self.percent_error_rate)
332         return lane_result
333     elements = property(_get_elements)
334
335 class Summary(object):
336     """
337     Extract some useful information from the Summary.htm file
338     """
339     def __init__(self, pathname):
340         self.pathname = pathname
341         self.tree = ElementTree.parse(pathname).getroot()
342         self.lane_results = []
343
344         self._extract_lane_results()
345
346     def _extract_lane_results(self):
347         """
348         extract the Lane Results Summary table
349         """
350         if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
351             raise RuntimeError("Summary.htm file format changed")
352
353         table = self.tree.findall('*//table')[2]
354         rows = table.getchildren()
355         headers = rows[0].getchildren()
356         if flatten(headers[2]) != 'Av 1st Cycle Int ':
357             raise RuntimeError("Summary.htm file format changed")
358
359         for r in rows[1:]:
360             self.lane_results.append(LaneResultSummary(r))
361
362     def _get_elements(self):
363         summary = ElementTree.Element('Summary')
364         for lane in self.lane_results:
365             summary.append(lane.elements)
366         return summary
367     elements = property(_get_elements)
368
369     def dump(self):
370         """
371         Debugging function, report current object
372         """
373         pass
374
375     
376 class ELAND(object):
377     """
378     Summarize information from eland files
379     """
380     class ElandResult(object):
381         """
382         Process an eland result file
383         """
384         def __init__(self, gerald, pathname):
385             self.gerald = gerald
386             self.pathname = pathname
387             # extract the sample name
388             path, name = os.path.split(self.pathname)
389             self.sample_name = name.replace("_eland_result.txt","")
390             self._reads = None
391             self._mapped_reads = None
392             self._fasta_map = {}
393
394         def _build_fasta_map(self, genome_dir):
395             # build fasta to fasta file map
396             genome = genome_dir.split(os.path.sep)[-1]
397             fasta_map = {}
398             for vld_file in glob(os.path.join(genome_dir, '*.vld')):
399                 is_link = False
400                 if os.path.islink(vld_file):
401                     is_link = True
402                 vld_file = os.path.realpath(vld_file)
403                 path, vld_name = os.path.split(vld_file)
404                 name, ext = os.path.splitext(vld_name)
405                 if is_link:
406                     fasta_map[name] = name
407                 else:
408                     fasta_map[name] = os.path.join(genome, name)
409             self._fasta_map = fasta_map
410
411         def _update(self):
412             """
413             Actually read the file and actually count the reads
414             """
415             if os.stat(self.pathname)[stat.ST_SIZE] == 0:
416                 raise RuntimeError("Eland isn't done, try again later.")
417             
418             reads = 0
419             mapped_reads = {}
420             genome_dir = self.gerald.lanes[self.sample_name].eland_genome
421             self._build_fasta_map(genome_dir)
422             match_codes = {'NM':0, 'QC':0, 'RM':0, 
423                            'U0':0, 'U1':0, 'U2':0,
424                            'R0':0, 'R1':0, 'R2':0,
425                           }
426             for line in open(self.pathname):
427                 reads += 1
428                 fields = line.split()
429                 # code = fields[2]
430                 # match_codes[code] = match_codes.setdefault(code, 0) + 1
431                 # the QC/NM etc codes are in the 3rd field and always present
432                 match_codes[fields[2]] += 1
433                 # ignore lines that don't have a fasta filename
434                 if len(fields) < 7:
435                     continue
436                 fasta = self._fasta_map.get(fields[6], fields[6])
437                 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
438             self._match_codes = match_codes
439             self._mapped_reads = mapped_reads
440             self._reads = reads
441         
442         def _get_reads(self):
443             if self._reads is None:
444                 self._update()
445             return self._reads
446         reads = property(_get_reads)
447
448         def _get_mapped_reads(self):
449             if self._mapped_reads is None:
450                 self._update()
451             return self._mapped_reads
452         mapped_reads = property(_get_mapped_reads)
453             
454     def __init__(self, gerald, basedir):
455         # we need information from the gerald config.xml
456         self.gerald = gerald
457         self.results = {}
458         for f in glob(os.path.join(basedir, "*_eland_result.txt")):
459             eland_result = ELAND.ElandResult(gerald, f)
460             self.results[eland_result.sample_name] = eland_result
461
462 class PipelineRun(object):
463     """
464     Capture "interesting" information about a pipeline run
465     """
466     def __init__(self, pathname, firecrest, bustard, gerald):
467         self.pathname = pathname
468         self._name = None
469         self._flowcell_id = None
470         self.firecrest = firecrest
471         self.bustard = bustard
472         self.gerald = gerald
473     
474     def _get_flowcell_id(self):
475         # extract flowcell ID
476         if self._flowcell_id is None:
477           config_dir = os.path.join(self.pathname, 'Config')
478           flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
479           if os.path.exists(flowcell_id_path):
480             flowcell_id_tree = ElementTree.parse(flowcell_id_path)
481             self._flowcell_id = flowcell_id_tree.findtext('Text')
482           else:
483             logging.warning(
484               "Unable to determine flowcell id as %s was not found" % (
485                  flowcell_id_path))
486             self._flowcell_id = "unknown"
487         return self._flowcell_id
488     flowcell_id = property(_get_flowcell_id)
489
490     def _get_xml(self):
491         """
492         make one master xml file from all of our sub-components.
493         """
494         root = ElementTree.Element('PipelineRun')
495         flowcell = ElementTree.SubElement(root, 'FlowcellID')
496         flowcell.text = self.flowcell_id
497         root.append(self.firecrest.elements)
498         root.append(self.bustard.elements)
499         root.append(self.gerald.elements)
500         return root
501
502     def _get_pretty_xml(self):
503         """
504         Generate indented xml file
505         """
506         root = self._get_xml()
507         indent(root)
508         return root
509     xml = property(_get_pretty_xml)
510
511     def _get_run_name(self):
512         """
513         Given a run tuple, find the latest date and use that as our name
514         """
515         if self._name is None:
516           tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
517           timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
518           self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
519         return self._name
520     name = property(_get_run_name)
521
522     def save(self):
523         logging.info("Saving run report "+ self.name)
524         ElementTree.ElementTree(self.xml).write(self.name)
525
526 def get_runs(runfolder):
527     """
528     Search through a run folder for all the various sub component runs
529     and then return a PipelineRun for each different combination.
530
531     For example if there are two different GERALD runs, this will
532     generate two different PipelineRun objects, that differ
533     in there gerald component.
534     """
535     datadir = os.path.join(runfolder, 'Data')
536
537     logging.info('Searching for runs in ' + datadir)
538     runs = []
539     for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
540         f = Firecrest(firecrest_pathname)
541         bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
542         for bustard_pathname in glob(bustard_glob):
543             b = Bustard(bustard_pathname)
544             gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
545             for gerald_pathname in glob(gerald_glob):
546                 try:
547                     g = GERALD(gerald_pathname)
548                     runs.append(PipelineRun(runfolder, f, b, g))
549                 except IOError, e:
550                     print "Ignoring", str(e)
551     return runs
552                 
553     
554 def extract_run_parameters(runs):
555     """
556     Search through runfolder_path for various runs and grab their parameters
557     """
558     for run in runs:
559       run.save()
560
561 def summarize_mapped_reads(mapped_reads):
562     """
563     Summarize per chromosome reads into a genome count
564     But handle spike-in/contamination symlinks seperately.
565     """
566     summarized_reads = {}
567     genome_reads = 0
568     genome = 'unknown'
569     for k, v in mapped_reads.items():
570         path, k = os.path.split(k)
571         if len(path) > 0:
572             genome = path
573             genome_reads += v
574         else:
575             summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
576     summarized_reads[genome] = genome_reads
577     return summarized_reads
578
579 def summary_report(runs):
580     """
581     Summarize cluster numbers and mapped read counts for a runfolder
582     """
583     for run in runs:
584         # print a run name?
585         print 'Summary for', run.name
586         for lane in run.gerald.summary.lane_results:
587             print 'lane', lane.lane, 'clusters', lane.cluster[0], '+/-',
588             print lane.cluster[1]
589         print ""
590         # sort the report
591         sample_keys = run.gerald.eland_results.results.keys()
592         sample_keys.sort(alphanum)
593         for sample in sample_keys:
594             print '---'
595             result = run.gerald.eland_results.results[sample]
596             print "Sample name", sample
597             print "Total Reads", result.reads
598             mc = result._match_codes
599             print "No Match", mc['NM']
600             print "QC Failed", mc['QC']
601             print 'Unique (0,1,2 mismatches)', mc['U0'], mc['U1'], mc['U2']
602             print 'Repeat (0,1,2 mismatches)', mc['R0'], mc['R1'], mc['R2']
603             print "Mapped Reads"
604             pprint(summarize_mapped_reads(result.mapped_reads))
605
606 def make_parser():
607     usage = 'usage: %prog [options] runfolder_root_dir'
608     parser = optparse.OptionParser(usage)
609     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
610                       default=False,
611                       help='turn on verbose mode')
612     parser.add_option('-s', '--summary', dest='summary', action='store_true',
613                       default=False,
614                       help='produce summary report')
615     parser.add_option('-a', '--archive', dest='archive', action='store_true',
616                       default=False,
617                       help='generate run configuration archive')
618     return parser
619
620 def main(cmdlist=None):
621     parser = make_parser()
622     opt, args = parser.parse_args(cmdlist)
623
624     if len(args) == 0:
625         parser.error('need path to a runfolder')
626     
627     logging.basicConfig()
628     if opt.verbose:
629         root_log = logging.getLogger()
630         root_log.setLevel(logging.INFO)
631
632     for runfolder in args:
633         runs = get_runs(runfolder)
634         if opt.summary:
635             summary_report(runs)
636         if opt.archive:
637             extract_run_parameters(runs)
638
639     return 0
640
641 if __name__ == "__main__":
642   sys.exit(main(sys.argv[1:]))