Partially handle the changed Summary.htm file from the 0.3 version of the
[htsworkflow.git] / gaworkflow / pipeline / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 from glob import glob
6 import logging
7 import os
8 import stat
9 import time
10
11 from gaworkflow.pipeline.runfolder import \
12    ElementTree, \
13    EUROPEAN_STRPTIME, \
14    LANES_PER_FLOWCELL, \
15    VERSION_RE
16 from gaworkflow.util.ethelp import indent, flatten
17 from gaworkflow.util.opener import autoopen
18
19 class Gerald(object):
20     """
21     Capture meaning out of the GERALD directory
22     """
23     XML_VERSION = 1
24     GERALD='Gerald'
25     RUN_PARAMETERS='RunParameters'
26     SUMMARY='Summary'
27
28     class LaneParameters(object):
29         """
30         Make it easy to access elements of LaneSpecificRunParameters from python
31         """
32         def __init__(self, gerald, key):
33             self._gerald = gerald
34             self._key = key
35         
36         def __get_attribute(self, xml_tag):
37             subtree = self._gerald.tree.find('LaneSpecificRunParameters')
38             container = subtree.find(xml_tag)
39             if container is None:
40                 return None
41             if len(container.getchildren()) != LANES_PER_FLOWCELL:
42                 raise RuntimeError('GERALD config.xml file changed')
43             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
44             index = lanes.index(self._key)
45             element = container[index]
46             return element.text
47         def _get_analysis(self):
48             return self.__get_attribute('ANALYSIS')
49         analysis = property(_get_analysis)
50
51         def _get_eland_genome(self):
52             genome = self.__get_attribute('ELAND_GENOME')
53             # default to the chipwide parameters if there isn't an
54             # entry in the lane specific paramaters
55             if genome is None:
56                 subtree = self._gerald.tree.find('ChipWideRunParameters')
57                 container = subtree.find('ELAND_GENOME')
58                 genome = container.text
59             return genome
60         eland_genome = property(_get_eland_genome)
61
62         def _get_read_length(self):
63             return self.__get_attribute('READ_LENGTH')
64         read_length = property(_get_read_length)
65
66         def _get_use_bases(self):
67             return self.__get_attribute('USE_BASES')
68         use_bases = property(_get_use_bases)
69
70     class LaneSpecificRunParameters(object):
71         """
72         Provide access to LaneSpecificRunParameters
73         """
74         def __init__(self, gerald):
75             self._gerald = gerald
76             self._keys = None
77         def __getitem__(self, key):
78             return Gerald.LaneParameters(self._gerald, key)
79         def keys(self):
80             if self._keys is None:
81                 tree = self._gerald.tree
82                 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
83                 # according to the pipeline specs I think their fields 
84                 # are sampleName_laneID, with sampleName defaulting to s
85                 # since laneIDs are constant lets just try using 
86                 # those consistently.
87                 self._keys = [ x.tag.split('_')[1] for x in analysis]
88             return self._keys
89         def values(self):
90             return [ self[x] for x in self.keys() ]
91         def items(self):
92             return zip(self.keys(), self.values())
93         def __len__(self):
94             return len(self.keys())
95
96     def __init__(self, xml=None):
97         self.pathname = None
98         self.tree = None
99
100         # parse lane parameters out of the config.xml file
101         self.lanes = Gerald.LaneSpecificRunParameters(self)
102
103         self.summary = None
104         self.eland_results = None
105
106         if xml is not None:
107             self.set_elements(xml)
108
109     def _get_date(self):
110         if self.tree is None:
111             return datetime.today()
112         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
113         epochstamp = time.mktime(time.strptime(timestamp, '%c'))
114         return datetime.fromtimestamp(epochstamp)
115     date = property(_get_date)
116
117     def _get_time(self):
118         return time.mktime(self.date.timetuple())
119     time = property(_get_time, doc='return run time as seconds since epoch')
120
121     def _get_version(self):
122         if self.tree is None:
123             return None
124         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
125     version = property(_get_version)
126
127     def dump(self):
128         """
129         Debugging function, report current object
130         """
131         print 'Gerald version:', self.version
132         print 'Gerald run date:', self.date
133         print 'Gerald config.xml:', self.tree
134         self.summary.dump()
135
136     def get_elements(self):
137         if self.tree is None or self.summary is None:
138             return None
139
140         gerald = ElementTree.Element(Gerald.GERALD, 
141                                      {'version': unicode(Gerald.XML_VERSION)})
142         gerald.append(self.tree)
143         gerald.append(self.summary.get_elements())
144         if self.eland_results:
145             gerald.append(self.eland_results.get_elements())
146         return gerald
147
148     def set_elements(self, tree):
149         if tree.tag !=  Gerald.GERALD:
150             raise ValueError('exptected GERALD')
151         xml_version = int(tree.attrib.get('version', 0))
152         if xml_version > Gerald.XML_VERSION:
153             logging.warn('XML tree is a higher version than this class')
154         for element in list(tree):
155             tag = element.tag.lower()
156             if tag == Gerald.RUN_PARAMETERS.lower():
157                 self.tree = element
158             elif tag == Gerald.SUMMARY.lower():
159                 self.summary = Summary(xml=element)
160             elif tag == ELAND.ELAND.lower():
161                 self.eland_results = ELAND(xml=element)
162             else:
163                 logging.warn("Unrecognized tag %s" % (element.tag,))
164         
165
166 def gerald(pathname):
167     g = Gerald()
168     g.pathname = pathname
169     path, name = os.path.split(pathname)
170     config_pathname = os.path.join(pathname, 'config.xml')
171     g.tree = ElementTree.parse(config_pathname).getroot()
172
173     # parse Summary.htm file
174     summary_pathname = os.path.join(pathname, 'Summary.htm')
175     g.summary = Summary(summary_pathname)
176     # parse eland files
177     g.eland_results = eland(g.pathname, g)
178     return g
179
180 def tonumber(v):
181     """
182     Convert a value to int if its an int otherwise a float.
183     """
184     try:
185         v = int(v)
186     except ValueError, e:
187         v = float(v)
188     return v
189
190 def parse_mean_range(value):
191     """
192     Parse values like 123 +/- 4.5
193     """
194     if value.strip() == 'unknown':
195         return 0, 0
196
197     average, pm, deviation = value.split()
198     if pm != '+/-':
199         raise RuntimeError("Summary.htm file format changed")
200     return tonumber(average), tonumber(deviation)
201
202 def make_mean_range_element(parent, name, mean, deviation):
203     """
204     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
205     """
206     element = ElementTree.SubElement(parent, name,
207                                      { 'mean': unicode(mean),
208                                        'deviation': unicode(deviation)})
209     return element
210
211 def parse_mean_range_element(element):
212     """
213     Grab mean/deviation out of element
214     """
215     return (tonumber(element.attrib['mean']), 
216             tonumber(element.attrib['deviation']))
217
218 class Summary(object):
219     """
220     Extract some useful information from the Summary.htm file
221     """
222     XML_VERSION = 1
223     SUMMARY = 'Summary'
224
225     class LaneResultSummary(object):
226         """
227         Parse the LaneResultSummary table out of Summary.htm
228         Mostly for the cluster number
229         """
230         LANE_RESULT_SUMMARY = 'LaneResultSummary'
231         TAGS = { 
232           'Cluster': 'cluster',
233           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
234           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
235           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
236           'PercentPassFilterAlign': 'percent_pass_filter_align',
237           'AverageAlignmentScore': 'average_alignment_score',
238           'PercentErrorRate': 'percent_error_rate'
239         }
240                  
241         def __init__(self, html=None, xml=None):
242             self.lane = None
243             self.cluster = None
244             self.average_first_cycle_intensity = None
245             self.percent_intensity_after_20_cycles = None
246             self.percent_pass_filter_clusters = None
247             self.percent_pass_filter_align = None
248             self.average_alignment_score = None
249             self.percent_error_rate = None
250
251             if html is not None:
252                 self.set_elements_from_html(html)
253             if xml is not None:
254                 self.set_elements(xml)
255
256         def set_elements_from_html(self, data):
257             if not len(data) in (8,10):
258                 raise RuntimeError("Summary.htm file format changed")
259
260             # same in pre-0.3.0 Summary file and 0.3 summary file
261             self.lane = data[0]
262
263             if len(data) == 8:
264                 # this is the < 0.3 Pipeline version
265                 self.cluster = parse_mean_range(data[1])
266                 self.average_first_cycle_intensity = parse_mean_range(data[2])
267                 self.percent_intensity_after_20_cycles = \
268                     parse_mean_range(data[3])
269                 self.percent_pass_filter_clusters = parse_mean_range(data[4])
270                 self.percent_pass_filter_align = parse_mean_range(data[5])
271                 self.average_alignment_score = parse_mean_range(data[6])
272                 self.percent_error_rate = parse_mean_range(data[7])
273             elif len(data) == 10:
274                 # this is the >= 0.3 summary file
275                 self.cluster_raw = data[1]
276                 self.cluster = parse_mean_range(data[2])
277                 # FIXME: think of generic way to capture the variable data
278                 
279
280         def get_elements(self):
281             lane_result = ElementTree.Element(
282                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
283                             {'lane': self.lane})
284             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
285                 element = make_mean_range_element(
286                     lane_result,
287                     tag,
288                     *getattr(self, variable_name)
289                 )
290             return lane_result
291
292         def set_elements(self, tree):
293             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
294                 raise ValueError('Expected %s' % (
295                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
296             self.lane = tree.attrib['lane']
297             tags = Summary.LaneResultSummary.TAGS
298             for element in list(tree):
299                 try:
300                     variable_name = tags[element.tag]
301                     setattr(self, variable_name, 
302                             parse_mean_range_element(element))
303                 except KeyError, e:
304                     logging.warn('Unrecognized tag %s' % (element.tag,))
305
306     def __init__(self, filename=None, xml=None):
307         self.lane_results = {}
308
309         if filename is not None:
310             self._extract_lane_results(filename)
311         if xml is not None:
312             self.set_elements(xml)
313
314     def __getitem__(self, key):
315         return self.lane_results[key]
316
317     def __len__(self):
318         return len(self.lane_results)
319
320     def keys(self):
321         return self.lane_results.keys()
322
323     def values(self):
324         return self.lane_results.values()
325
326     def items(self):
327         return self.lane_results.items()
328
329     def _flattened_row(self, row):
330         """
331         flatten the children of a <tr>...</tr>
332         """
333         return [flatten(x) for x in row.getchildren() ]
334     
335     def _parse_table(self, table):
336         """
337         assumes the first line is the header of a table, 
338         and that the remaining rows are data
339         """
340         rows = table.getchildren()
341         data = []
342         for r in rows:
343             data.append(self._flattened_row(r))
344         return data
345     
346     def _extract_named_tables(self, pathname):
347         """
348         extract all the 'named' tables from a Summary.htm file
349         and return as a dictionary
350         
351         Named tables are <h2>...</h2><table>...</table> pairs
352         The contents of the h2 tag is considered to the name
353         of the table.
354         """
355         tree = ElementTree.parse(pathname).getroot()
356         body = tree.find('body')
357         tables = {}
358         for i in range(len(body)):
359             if body[i].tag == 'h2' and body[i+1].tag == 'table':
360                 # we have an interesting table
361                 name = flatten(body[i])
362                 table = body[i+1]
363                 data = self._parse_table(table)
364                 tables[name] = data
365         return tables
366
367     def _extract_lane_results(self, pathname):
368         """
369         extract the Lane Results Summary table
370         """
371
372         tables = self._extract_named_tables(pathname)
373
374         # parse lane result summary
375         lane_summary = tables['Lane Results Summary']
376         # this is version 1 of the summary file
377         if len(lane_summary[-1]) == 8:
378             # strip header
379             headers = lane_summary[0]
380             # grab the lane by lane data
381             lane_summary = lane_summary[1:]
382
383         # this is version 2 of the summary file
384         if len(lane_summary[-1]) == 10:
385             # lane_summary[0] is a different less specific header row
386             headers = lane_summary[1]
387             lane_summary = lane_summary[2:10]
388             # after the last lane, there's a set of chip wide averages
389
390         for r in lane_summary:
391             lrs = Summary.LaneResultSummary(html=r)
392             self.lane_results[lrs.lane] = lrs
393
394     def get_elements(self):
395         summary = ElementTree.Element(Summary.SUMMARY, 
396                                       {'version': unicode(Summary.XML_VERSION)})
397         for lane in self.lane_results.values():
398             summary.append(lane.get_elements())
399         return summary
400
401     def set_elements(self, tree):
402         if tree.tag != Summary.SUMMARY:
403             return ValueError("Expected %s" % (Summary.SUMMARY,))
404         xml_version = int(tree.attrib.get('version', 0))
405         if xml_version > Summary.XML_VERSION:
406             logging.warn('Summary XML tree is a higher version than this class')
407         for element in list(tree):
408             lrs = Summary.LaneResultSummary()
409             lrs.set_elements(element)
410             self.lane_results[lrs.lane] = lrs
411
412     def dump(self):
413         """
414         Debugging function, report current object
415         """
416         pass
417
418
419 def build_genome_fasta_map(genome_dir):
420     # build fasta to fasta file map
421     genome = genome_dir.split(os.path.sep)[-1]
422     fasta_map = {}
423     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
424         is_link = False
425         if os.path.islink(vld_file):
426             is_link = True
427         vld_file = os.path.realpath(vld_file)
428         path, vld_name = os.path.split(vld_file)
429         name, ext = os.path.splitext(vld_name)
430         if is_link:
431             fasta_map[name] = name
432         else:
433             fasta_map[name] = os.path.join(genome, name)
434     return fasta_map
435     
436 class ElandLane(object):
437     """
438     Process an eland result file
439     """
440     XML_VERSION = 1
441     LANE = 'ElandLane'
442     SAMPLE_NAME = 'SampleName'
443     LANE_ID = 'LaneID'
444     GENOME_MAP = 'GenomeMap'
445     GENOME_ITEM = 'GenomeItem'
446     MAPPED_READS = 'MappedReads'
447     MAPPED_ITEM = 'MappedItem'
448     MATCH_CODES = 'MatchCodes'
449     MATCH_ITEM = 'Code'
450     READS = 'Reads'
451
452     def __init__(self, pathname=None, genome_map=None, xml=None):
453         self.pathname = pathname
454         self._sample_name = None
455         self._lane_id = None
456         self._reads = None
457         self._mapped_reads = None
458         self._match_codes = None
459         if genome_map is None:
460             genome_map = {}
461         self.genome_map = genome_map
462         
463         if xml is not None:
464             self.set_elements(xml)
465
466     def _update(self):
467         """
468         Actually read the file and actually count the reads
469         """
470         # can't do anything if we don't have a file to process
471         if self.pathname is None:
472             return
473
474         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
475             raise RuntimeError("Eland isn't done, try again later.")
476
477         reads = 0
478         mapped_reads = {}
479
480         match_codes = {'NM':0, 'QC':0, 'RM':0, 
481                        'U0':0, 'U1':0, 'U2':0,
482                        'R0':0, 'R1':0, 'R2':0,
483                       }
484         for line in autoopen(self.pathname,'r'):
485             reads += 1
486             fields = line.split()
487             # code = fields[2]
488             # match_codes[code] = match_codes.setdefault(code, 0) + 1
489             # the QC/NM etc codes are in the 3rd field and always present
490             match_codes[fields[2]] += 1
491             # ignore lines that don't have a fasta filename
492             if len(fields) < 7:
493                 continue
494             fasta = self.genome_map.get(fields[6], fields[6])
495             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
496         self._match_codes = match_codes
497         self._mapped_reads = mapped_reads
498         self._reads = reads
499
500     def _update_name(self):
501         # extract the sample name
502         if self.pathname is None:
503             return
504
505         path, name = os.path.split(self.pathname)
506         split_name = name.split('_')
507         self._sample_name = split_name[0]
508         self._lane_id = split_name[1]
509
510     def _get_sample_name(self):
511         if self._sample_name is None:
512             self._update_name()
513         return self._sample_name
514     sample_name = property(_get_sample_name)
515
516     def _get_lane_id(self):
517         if self._lane_id is None:
518             self._update_name()
519         return self._lane_id
520     lane_id = property(_get_lane_id)
521
522     def _get_reads(self):
523         if self._reads is None:
524             self._update()
525         return self._reads
526     reads = property(_get_reads)
527
528     def _get_mapped_reads(self):
529         if self._mapped_reads is None:
530             self._update()
531         return self._mapped_reads
532     mapped_reads = property(_get_mapped_reads)
533
534     def _get_match_codes(self):
535         if self._match_codes is None:
536             self._update()
537         return self._match_codes
538     match_codes = property(_get_match_codes)
539
540     def get_elements(self):
541         lane = ElementTree.Element(ElandLane.LANE, 
542                                    {'version': 
543                                     unicode(ElandLane.XML_VERSION)})
544         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
545         sample_tag.text = self.sample_name
546         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
547         lane_tag.text = self.lane_id
548         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
549         for k, v in self.genome_map.items():
550             item = ElementTree.SubElement(
551                 genome_map, ElandLane.GENOME_ITEM, 
552                 {'name':k, 'value':unicode(v)})
553         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
554         for k, v in self.mapped_reads.items():
555             item = ElementTree.SubElement(
556                 mapped_reads, ElandLane.MAPPED_ITEM, 
557                 {'name':k, 'value':unicode(v)})
558         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
559         for k, v in self.match_codes.items():
560             item = ElementTree.SubElement(
561                 match_codes, ElandLane.MATCH_ITEM, 
562                 {'name':k, 'value':unicode(v)})
563         reads = ElementTree.SubElement(lane, ElandLane.READS)
564         reads.text = unicode(self.reads)
565
566         return lane
567
568     def set_elements(self, tree):
569         if tree.tag != ElandLane.LANE:
570             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
571
572         # reset dictionaries
573         self._mapped_reads = {}
574         self._match_codes = {}
575         
576         for element in tree:
577             tag = element.tag.lower()
578             if tag == ElandLane.SAMPLE_NAME.lower():
579                 self._sample_name = element.text
580             elif tag == ElandLane.LANE_ID.lower():
581                 self._lane_id = element.text
582             elif tag == ElandLane.GENOME_MAP.lower():
583                 for child in element:
584                     name = child.attrib['name']
585                     value = child.attrib['value']
586                     self.genome_map[name] = value
587             elif tag == ElandLane.MAPPED_READS.lower():
588                 for child in element:
589                     name = child.attrib['name']
590                     value = child.attrib['value']
591                     self._mapped_reads[name] = int(value)
592             elif tag == ElandLane.MATCH_CODES.lower():
593                 for child in element:
594                     name = child.attrib['name']
595                     value = int(child.attrib['value'])
596                     self._match_codes[name] = value
597             elif tag == ElandLane.READS.lower():
598                 self._reads = int(element.text)
599             else:
600                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
601
602 def extract_eland_sequence(instream, outstream, start, end):
603     """
604     Extract a chunk of sequence out of an eland file
605     """
606     for line in instream:
607         record = line.split()
608         if len(record) > 1:
609             result = [record[0], record[1][start:end]]
610         else:
611             result = [record[0][start:end]]
612         outstream.write("\t".join(result))
613         outstream.write(os.linesep)
614
615 class ELAND(object):
616     """
617     Summarize information from eland files
618     """
619     XML_VERSION = 1
620
621     ELAND = 'ElandCollection'
622     LANE = 'Lane'
623     LANE_ID = 'id'
624
625     def __init__(self, xml=None):
626         # we need information from the gerald config.xml
627         self.results = {}
628         
629         if xml is not None:
630             self.set_elements(xml)
631
632     def __len__(self):
633         return len(self.results)
634
635     def keys(self):
636         return self.results.keys()
637     
638     def values(self):
639         return self.results.values()
640
641     def items(self):
642         return self.results.items()
643
644     def __getitem__(self, key):
645         return self.results[key]
646
647     def get_elements(self):
648         root = ElementTree.Element(ELAND.ELAND, 
649                                    {'version': unicode(ELAND.XML_VERSION)})
650         for lane_id, lane in self.results.items():
651             eland_lane = lane.get_elements()
652             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
653             root.append(eland_lane)
654         return root
655
656     def set_elements(self, tree):
657         if tree.tag.lower() != ELAND.ELAND.lower():
658             raise ValueError('Expecting %s', ELAND.ELAND)
659         for element in list(tree):
660             lane_id = element.attrib[ELAND.LANE_ID]
661             lane = ElandLane(xml=element)
662             self.results[lane_id] = lane
663
664 def eland(basedir, gerald=None, genome_maps=None):
665     e = ELAND()
666
667     file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
668     if len(file_list) == 0:
669         # lets handle compressed eland files too
670         file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
671
672     for pathname in file_list:
673         # yes the lane_id is also being computed in ElandLane._update
674         # I didn't want to clutter up my constructor
675         # but I needed to persist the sample_name/lane_id for
676         # runfolder summary_report
677         path, name = os.path.split(pathname)
678         split_name = name.split('_')
679         lane_id = split_name[1]
680
681         if genome_maps is not None:
682             genome_map = genome_maps[lane_id]
683         elif gerald is not None:
684             genome_dir = gerald.lanes[lane_id].eland_genome
685             genome_map = build_genome_fasta_map(genome_dir)
686         else:
687             genome_map = {}
688
689         eland_result = ElandLane(pathname, genome_map)
690         e.results[lane_id] = eland_result
691     return e