Report cluster results with the rest of the lane summary information.
[htsworkflow.git] / scripts / runfolder.py
1 #!/usr/bin/env python
2 """
3 Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
4
5   * Flowcell ID
6   * run dates
7   * start/stop cycle numbers
8   * Firecrest, bustard, gerald version numbers
9   * Eland analysis types, and everything in the eland configuration file.
10   * cluster numbers and other values from the Summary.htm 
11     LaneSpecificParameters table. 
12   * How many reads mapped to a genome from an eland file
13
14 The ELAND "mapped reads" counter will also check for eland squashed file
15 that were symlinked from another directory. This is so I can track how 
16 many reads landed on the genome of interest and on the spike ins. 
17
18 Basically my subdirectories something like:
19
20 genomes/hg18
21 genomes/hg18/chr*.2bpb <- files for hg18 genome
22 genomes/hg18/chr*.vld  
23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
24 genomes/spikein 
25
26 runfolder.py can also spit out a simple summary report (-s option) 
27 that contains the per lane post filter cluster numbers and the mapped 
28 read counts. (The report isn't currently very pretty)
29 """
30 import time
31 import logging
32 import os
33 import re
34 import stat
35 import sys
36 from glob import glob
37 from pprint import pprint
38 import optparse
39
40 try:
41   from xml.etree import ElementTree
42 except ImportError, e:
43   from elementtree import ElementTree
44
45 from gaworkflow.util.alphanum import alphanum
46 EUROPEAN_STRPTIME = "%d-%m-%Y"
47 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
48 VERSION_RE = "([0-9\.]+)"
49 USER_RE = "([a-zA-Z0-9]+)"
50 LANES_PER_FLOWCELL = 8
51
52 runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
53
54 def indent(elem, level=0):
55     """
56     reformat an element tree to be 'pretty' (indented)
57     """
58     i = "\n" + level*"  "
59     if len(elem):
60         if not elem.text or not elem.text.strip():
61             elem.text = i + "  "
62         for child in elem:
63             indent(child, level+1)
64         # we don't want the closing tag indented too far
65         child.tail = i
66         if not elem.tail or not elem.tail.strip():
67             elem.tail = i
68     else:
69         if level and (not elem.tail or not elem.tail.strip()):
70             elem.tail = i
71
72 def flatten(elem, include_tail=0):
73     """
74     Extract the text from an element tree 
75     (AKA extract the text that not part of XML tags)
76     """
77     text = elem.text or ""
78     for e in elem:
79         text += flatten(e, 1)
80     if include_tail and elem.tail: text += elem.tail
81     return text
82
83 class Firecrest(object):
84     def __init__(self, pathname):
85         self.pathname = pathname
86
87         # parse firecrest directory name
88         path, name = os.path.split(pathname)
89         groups = name.split('_')
90         # grab the start/stop cycle information
91         cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
92         self.start = int(cycle.group(1))
93         self.stop = int(cycle.group(2))
94         # firecrest version
95         version = re.search(VERSION_RE, groups[1])
96         self.version = (version.group(1))
97         # datetime
98         self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
99         self.time = time.mktime(self.date)
100         # username
101         self.user = groups[3]
102
103         # should I parse this deeper than just stashing the 
104         # contents of the matrix file?
105         matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
106         self.matrix = open(matrix_pathname, 'r').read()
107         
108     def dump(self):
109         print "Starting cycle:", self.start
110         print "Ending cycle:", self.stop
111         print "Firecrest version:", self.version
112         print "Run date:", self.date
113         print "user:", self.user
114
115     def _get_elements(self):
116         firecrest = ElementTree.Element('Firecrest')
117         version = ElementTree.SubElement(firecrest, 'version')
118         version.text = self.version
119         start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
120         start_cycle.text = str(self.start)
121         stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
122         stop_cycle.text = str(self.stop)
123         run_date = ElementTree.SubElement(firecrest, 'run_time')
124         run_date.text = str(self.time)
125         matrix = ElementTree.SubElement(firecrest, 'matrix')
126         matrix.text = self.matrix
127         return firecrest
128     elements=property(_get_elements)
129
130 class Bustard(object):
131     def __init__(self, pathname):
132         self.pathname = pathname
133
134         path, name = os.path.split(pathname)
135         groups = name.split("_")
136         version = re.search(VERSION_RE, groups[0])
137         self.version = version.group(1)
138         self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
139         self.time = time.mktime(self.date)
140         self.user = groups[2]
141         self.phasing = {}
142         self._load_params()
143
144     def _load_params(self):
145         paramfiles = glob(os.path.join(self.pathname, "params*"))
146         for paramfile in paramfiles:
147             path, name = os.path.split(paramfile)
148             basename, ext = os.path.splitext(name)
149             # the last character of the param filename should be the
150             # lane number
151             lane = int(basename[-1])
152             # we want the whole tree, not just the stuff under
153             # the first tag
154             param_tree = ElementTree.parse(paramfile).getroot()
155             self.phasing[lane] = param_tree
156
157     def dump(self):
158         print "Bustard version:", self.version
159         print "Run date", self.date
160         print "user:", self.user
161         for lane, tree in self.phasing.items():
162             print lane
163             print tree
164
165     def _get_elements(self):
166         bustard = ElementTree.Element('Bustard')
167         version = ElementTree.SubElement(bustard, 'version')
168         version.text = self.version
169         run_date = ElementTree.SubElement(bustard, 'run_time')
170         run_date.text = str(self.time)
171         return bustard
172     elements=property(_get_elements)
173         
174 class GERALD(object):
175     """
176     Capture meaning out of the GERALD directory
177     """
178     class LaneParameters(object):
179         """
180         Make it easy to access elements of LaneSpecificRunParameters from python
181         """
182         def __init__(self, tree, key):
183             self._tree = tree.find('LaneSpecificRunParameters')
184             self._key = key
185         
186         def __get_attribute(self, xml_tag):
187             container = self._tree.find(xml_tag)
188             if container is None or \
189                len(container.getchildren()) != LANES_PER_FLOWCELL:
190                 raise RuntimeError('GERALD config.xml file changed')
191             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
192             index = lanes.index(self._key)
193             #element = container.find(self._key)
194             element = container[index]
195             return element.text
196         def _get_analysis(self):
197             return self.__get_attribute('ANALYSIS')
198         analysis = property(_get_analysis)
199
200         def _get_eland_genome(self):
201             return self.__get_attribute('ELAND_GENOME')
202         eland_genome = property(_get_eland_genome)
203
204         def _get_read_length(self):
205             return self.__get_attribute('READ_LENGTH')
206         read_length = property(_get_read_length)
207
208         def _get_use_bases(self):
209             return self.__get_attribute('USE_BASES')
210         use_bases = property(_get_use_bases)
211
212     class LaneSpecificRunParameters(object):
213         """
214         Provide access to LaneSpecificRunParameters
215         """
216         def __init__(self, tree):
217             self._tree = tree
218             self._keys = None
219         def __getitem__(self, key):
220             return GERALD.LaneParameters(self._tree, key)
221         def keys(self):
222             if self._keys is None:
223                 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
224                 # according to the pipeline specs I think their fields 
225                 # are sampleName_laneID, with sampleName defaulting to s
226                 # since laneIDs are constant lets just try using 
227                 # those consistently.
228                 self._keys = [ x.tag.split('_')[1] for x in analysis]
229             return self._keys
230         def values(self):
231             return [ self[x] for x in self.keys() ]
232         def items(self):
233             return zip(self.keys(), self.values())
234         def __len__(self):
235             return len(self.keys)
236
237     def __init__(self, pathname):
238         self.pathname = pathname
239         path, name = os.path.split(pathname)
240         config_pathname = os.path.join(self.pathname, 'config.xml')
241         self.tree = ElementTree.parse(config_pathname).getroot()
242         self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
243
244         date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
245         self.date = time.strptime(date)
246         self.time = time.mktime(self.date)
247         
248         # parse Summary.htm file
249         summary_pathname = os.path.join(self.pathname, 'Summary.htm')
250         self.summary = Summary(summary_pathname)
251         self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
252         self.eland_results = ELAND(self, self.pathname)
253
254     def dump(self):
255         """
256         Debugging function, report current object
257         """
258         print 'Gerald version:', self.version
259         print 'Gerald run date:', self.date
260         print 'Gerald config.xml:', self.tree
261         self.summary.dump()
262
263     def _get_elements(self):
264         gerald = ElementTree.Element('Gerald')
265         gerald.append(self.tree)
266         gerald.append(self.summary.elements)
267         return gerald
268     elements = property(_get_elements)
269
270 def tonumber(v):
271     """
272     Convert a value to int if its an int otherwise a float.
273     """
274     try:
275         v = int(v)
276     except ValueError, e:
277         v = float(v)
278     return v
279
280 def parse_mean_range(value):
281     """
282     Parse values like 123 +/- 4.5
283     """
284     if value.strip() == 'unknown':
285         return 0, 0
286
287     average, pm, deviation = value.split()
288     if pm != '+/-':
289         raise RuntimeError("Summary.htm file format changed")
290     return tonumber(average), tonumber(deviation)
291
292 def mean_range_element(parent, name, mean, deviation):
293     """
294     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
295     """
296     element = ElementTree.SubElement(parent, name,
297                                      { 'mean': str(mean),
298                                        'deviation': str(deviation)})
299     return element
300
301 class LaneResultSummary(object):
302     """
303     Parse the LaneResultSummary table out of Summary.htm
304     Mostly for the cluster number
305     """
306     def __init__(self, row_element):
307         data = [ flatten(x) for x in row_element ]
308         if len(data) != 8:
309             raise RuntimeError("Summary.htm file format changed")
310
311         self.lane = data[0]
312         self.cluster = parse_mean_range(data[1])
313         self.average_first_cycle_intensity = parse_mean_range(data[2])
314         self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
315         self.percent_pass_filter_clusters = parse_mean_range(data[4])
316         self.percent_pass_filter_align = parse_mean_range(data[5])
317         self.average_alignment_score = parse_mean_range(data[6])
318         self.percent_error_rate = parse_mean_range(data[7])
319
320     def _get_elements(self):
321         lane_result = ElementTree.Element('LaneResultSummary', 
322                                           {'lane': self.lane})
323         cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
324         first_cycle = mean_range_element(lane_result, 
325                                          'AverageFirstCycleIntensity',
326                                          *self.average_first_cycle_intensity)
327         after_20 = mean_range_element(lane_result,
328                                       'PercentIntensityAfter20Cycles',
329                                       *self.percent_intensity_after_20_cycles)
330         pass_filter = mean_range_element(lane_result,
331                                          'PercentPassFilterClusters',
332                                          *self.percent_pass_filter_clusters)
333         alignment = mean_range_element(lane_result,
334                                        'AverageAlignmentScore',
335                                        *self.average_alignment_score)
336         error_rate = mean_range_element(lane_result,
337                                         'PercentErrorRate',
338                                         *self.percent_error_rate)
339         return lane_result
340     elements = property(_get_elements)
341
342 class Summary(object):
343     """
344     Extract some useful information from the Summary.htm file
345     """
346     def __init__(self, pathname):
347         self.pathname = pathname
348         self.tree = ElementTree.parse(pathname).getroot()
349         self.lane_results = {}
350
351         self._extract_lane_results()
352
353     def _extract_lane_results(self):
354         """
355         extract the Lane Results Summary table
356         """
357         if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
358             raise RuntimeError("Summary.htm file format changed")
359
360         tables = self.tree.findall('*//table')
361
362         # parse lane result summary
363         lane_summary = tables[2]
364         rows = lane_summary.getchildren()
365         headers = rows[0].getchildren()
366         if flatten(headers[2]) != 'Av 1st Cycle Int ':
367             raise RuntimeError("Summary.htm file format changed")
368
369         for r in rows[1:]:
370             lrs = LaneResultSummary(r)
371             self.lane_results[lrs.lane] = lrs
372
373     def _get_elements(self):
374         summary = ElementTree.Element('Summary')
375         for lane in self.lane_results.values():
376             summary.append(lane.elements)
377         return summary
378     elements = property(_get_elements)
379
380     def dump(self):
381         """
382         Debugging function, report current object
383         """
384         pass
385
386     
387 class ELAND(object):
388     """
389     Summarize information from eland files
390     """
391     class ElandResult(object):
392         """
393         Process an eland result file
394         """
395         def __init__(self, gerald, pathname):
396             self.gerald = gerald
397             self.pathname = pathname
398             # extract the sample name
399             path, name = os.path.split(self.pathname)
400             split_name = name.split('_')
401             self.sample_name = split_name[0]
402             self.lane_id = split_name[1]
403             self._reads = None
404             self._mapped_reads = None
405             self._fasta_map = {}
406
407         def _build_fasta_map(self, genome_dir):
408             # build fasta to fasta file map
409             genome = genome_dir.split(os.path.sep)[-1]
410             fasta_map = {}
411             for vld_file in glob(os.path.join(genome_dir, '*.vld')):
412                 is_link = False
413                 if os.path.islink(vld_file):
414                     is_link = True
415                 vld_file = os.path.realpath(vld_file)
416                 path, vld_name = os.path.split(vld_file)
417                 name, ext = os.path.splitext(vld_name)
418                 if is_link:
419                     fasta_map[name] = name
420                 else:
421                     fasta_map[name] = os.path.join(genome, name)
422             self._fasta_map = fasta_map
423
424         def _update(self):
425             """
426             Actually read the file and actually count the reads
427             """
428             if os.stat(self.pathname)[stat.ST_SIZE] == 0:
429                 raise RuntimeError("Eland isn't done, try again later.")
430             
431             reads = 0
432             mapped_reads = {}
433             genome_dir = self.gerald.lanes[self.lane_id].eland_genome
434             self._build_fasta_map(genome_dir)
435             match_codes = {'NM':0, 'QC':0, 'RM':0, 
436                            'U0':0, 'U1':0, 'U2':0,
437                            'R0':0, 'R1':0, 'R2':0,
438                           }
439             for line in open(self.pathname):
440                 reads += 1
441                 fields = line.split()
442                 # code = fields[2]
443                 # match_codes[code] = match_codes.setdefault(code, 0) + 1
444                 # the QC/NM etc codes are in the 3rd field and always present
445                 match_codes[fields[2]] += 1
446                 # ignore lines that don't have a fasta filename
447                 if len(fields) < 7:
448                     continue
449                 fasta = self._fasta_map.get(fields[6], fields[6])
450                 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
451             self._match_codes = match_codes
452             self._mapped_reads = mapped_reads
453             self._reads = reads
454         
455         def _get_reads(self):
456             if self._reads is None:
457                 self._update()
458             return self._reads
459         reads = property(_get_reads)
460
461         def _get_mapped_reads(self):
462             if self._mapped_reads is None:
463                 self._update()
464             return self._mapped_reads
465         mapped_reads = property(_get_mapped_reads)
466             
467     def __init__(self, gerald, basedir):
468         # we need information from the gerald config.xml
469         self.gerald = gerald
470         self.results = {}
471         for f in glob(os.path.join(basedir, "*_eland_result.txt")):
472             eland_result = ELAND.ElandResult(gerald, f)
473             self.results[eland_result.lane_id] = eland_result
474
475 class PipelineRun(object):
476     """
477     Capture "interesting" information about a pipeline run
478     """
479     def __init__(self, pathname, firecrest, bustard, gerald):
480         self.pathname = pathname
481         self._name = None
482         self._flowcell_id = None
483         self.firecrest = firecrest
484         self.bustard = bustard
485         self.gerald = gerald
486     
487     def _get_flowcell_id(self):
488         # extract flowcell ID
489         if self._flowcell_id is None:
490           config_dir = os.path.join(self.pathname, 'Config')
491           flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
492           if os.path.exists(flowcell_id_path):
493             flowcell_id_tree = ElementTree.parse(flowcell_id_path)
494             self._flowcell_id = flowcell_id_tree.findtext('Text')
495           else:
496             logging.warning(
497               "Unable to determine flowcell id as %s was not found" % (
498                  flowcell_id_path))
499             self._flowcell_id = "unknown"
500         return self._flowcell_id
501     flowcell_id = property(_get_flowcell_id)
502
503     def _get_xml(self):
504         """
505         make one master xml file from all of our sub-components.
506         """
507         root = ElementTree.Element('PipelineRun')
508         flowcell = ElementTree.SubElement(root, 'FlowcellID')
509         flowcell.text = self.flowcell_id
510         root.append(self.firecrest.elements)
511         root.append(self.bustard.elements)
512         root.append(self.gerald.elements)
513         return root
514
515     def _get_pretty_xml(self):
516         """
517         Generate indented xml file
518         """
519         root = self._get_xml()
520         indent(root)
521         return root
522     xml = property(_get_pretty_xml)
523
524     def _get_run_name(self):
525         """
526         Given a run tuple, find the latest date and use that as our name
527         """
528         if self._name is None:
529           tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
530           timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
531           self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
532         return self._name
533     name = property(_get_run_name)
534
535     def save(self):
536         logging.info("Saving run report "+ self.name)
537         ElementTree.ElementTree(self.xml).write(self.name)
538
539 def get_runs(runfolder):
540     """
541     Search through a run folder for all the various sub component runs
542     and then return a PipelineRun for each different combination.
543
544     For example if there are two different GERALD runs, this will
545     generate two different PipelineRun objects, that differ
546     in there gerald component.
547     """
548     datadir = os.path.join(runfolder, 'Data')
549
550     logging.info('Searching for runs in ' + datadir)
551     runs = []
552     for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
553         f = Firecrest(firecrest_pathname)
554         bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
555         for bustard_pathname in glob(bustard_glob):
556             b = Bustard(bustard_pathname)
557             gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
558             for gerald_pathname in glob(gerald_glob):
559                 try:
560                     g = GERALD(gerald_pathname)
561                     runs.append(PipelineRun(runfolder, f, b, g))
562                 except IOError, e:
563                     print "Ignoring", str(e)
564     return runs
565                 
566     
567 def extract_run_parameters(runs):
568     """
569     Search through runfolder_path for various runs and grab their parameters
570     """
571     for run in runs:
572       run.save()
573
574 def summarize_mapped_reads(mapped_reads):
575     """
576     Summarize per chromosome reads into a genome count
577     But handle spike-in/contamination symlinks seperately.
578     """
579     summarized_reads = {}
580     genome_reads = 0
581     genome = 'unknown'
582     for k, v in mapped_reads.items():
583         path, k = os.path.split(k)
584         if len(path) > 0:
585             genome = path
586             genome_reads += v
587         else:
588             summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
589     summarized_reads[genome] = genome_reads
590     return summarized_reads
591
592 def summary_report(runs):
593     """
594     Summarize cluster numbers and mapped read counts for a runfolder
595     """
596     report = []
597     for run in runs:
598         # print a run name?
599         report.append('Summary for %s' % (run.name,))
600         # sort the report
601         eland_keys = run.gerald.eland_results.results.keys()
602         eland_keys.sort(alphanum)
603
604         lane_results = run.gerald.summary.lane_results
605         for lane_id in eland_keys:
606             result = run.gerald.eland_results.results[lane_id]
607             report.append("Sample name %s" % (result.sample_name))
608             report.append("Lane id %s" % (result.lane_id,))
609             cluster = lane_results[result.lane_id].cluster
610             report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
611             report.append("Total Reads: %d" % (result.reads))
612             mc = result._match_codes
613             report.append("No Match: %d" % (mc['NM']))
614             report.append("QC Failed: %d" % (mc['QC']))
615             report.append('Unique (0,1,2 mismatches) %d %d %d' % \
616                           (mc['U0'], mc['U1'], mc['U2']))
617             report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
618                           (mc['R0'], mc['R1'], mc['R2']))
619             report.append("Mapped Reads")
620             mapped_reads = summarize_mapped_reads(result.mapped_reads)
621             for name, counts in mapped_reads.items():
622               report.append("  %s: %d" % (name, counts))
623             report.append('---')
624             report.append('')
625         return os.linesep.join(report)
626
627 def make_parser():
628     usage = 'usage: %prog [options] runfolder_root_dir'
629     parser = optparse.OptionParser(usage)
630     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
631                       default=False,
632                       help='turn on verbose mode')
633     parser.add_option('-s', '--summary', dest='summary', action='store_true',
634                       default=False,
635                       help='produce summary report')
636     parser.add_option('-a', '--archive', dest='archive', action='store_true',
637                       default=False,
638                       help='generate run configuration archive')
639     return parser
640
641 def main(cmdlist=None):
642     parser = make_parser()
643     opt, args = parser.parse_args(cmdlist)
644
645     if len(args) == 0:
646         parser.error('need path to a runfolder')
647     
648     logging.basicConfig()
649     if opt.verbose:
650         root_log = logging.getLogger()
651         root_log.setLevel(logging.INFO)
652
653     for runfolder in args:
654         runs = get_runs(runfolder)
655         if opt.summary:
656             print summary_report(runs)
657         if opt.archive:
658             extract_run_parameters(runs)
659
660     return 0
661
662 if __name__ == "__main__":
663   sys.exit(main(sys.argv[1:]))