59608da12ed402e51e7b1caa8d7443e62c73e880
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 from glob import glob
6 import logging
7 import os
8 import stat
9 import time
10 import types
11
12 from htsworkflow.pipelines.runfolder import \
13    ElementTree, \
14    EUROPEAN_STRPTIME, \
15    LANES_PER_FLOWCELL, \
16    VERSION_RE
17 from htsworkflow.util.ethelp import indent, flatten
18 from htsworkflow.util.opener import autoopen
19
20 class Gerald(object):
21     """
22     Capture meaning out of the GERALD directory
23     """
24     XML_VERSION = 1
25     GERALD='Gerald'
26     RUN_PARAMETERS='RunParameters'
27     SUMMARY='Summary'
28
29     class LaneParameters(object):
30         """
31         Make it easy to access elements of LaneSpecificRunParameters from python
32         """
33         def __init__(self, gerald, key):
34             self._gerald = gerald
35             self._key = key
36
37         def __get_attribute(self, xml_tag):
38             subtree = self._gerald.tree.find('LaneSpecificRunParameters')
39             container = subtree.find(xml_tag)
40             if container is None:
41                 return None
42             if len(container.getchildren()) > LANES_PER_FLOWCELL:
43                 raise RuntimeError('GERALD config.xml file changed')
44             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
45             index = lanes.index(self._key)
46             element = container[index]
47             return element.text
48         def _get_analysis(self):
49             return self.__get_attribute('ANALYSIS')
50         analysis = property(_get_analysis)
51
52         def _get_eland_genome(self):
53             genome = self.__get_attribute('ELAND_GENOME')
54             # default to the chipwide parameters if there isn't an
55             # entry in the lane specific paramaters
56             if genome is None:
57                 subtree = self._gerald.tree.find('ChipWideRunParameters')
58                 container = subtree.find('ELAND_GENOME')
59                 genome = container.text
60             return genome
61         eland_genome = property(_get_eland_genome)
62
63         def _get_read_length(self):
64             return self.__get_attribute('READ_LENGTH')
65         read_length = property(_get_read_length)
66
67         def _get_use_bases(self):
68             return self.__get_attribute('USE_BASES')
69         use_bases = property(_get_use_bases)
70
71     class LaneSpecificRunParameters(object):
72         """
73         Provide access to LaneSpecificRunParameters
74         """
75         def __init__(self, gerald):
76             self._gerald = gerald
77             self._keys = None
78         def __getitem__(self, key):
79             return Gerald.LaneParameters(self._gerald, key)
80         def keys(self):
81             if self._keys is None:
82                 tree = self._gerald.tree
83                 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
84                 # according to the pipeline specs I think their fields
85                 # are sampleName_laneID, with sampleName defaulting to s
86                 # since laneIDs are constant lets just try using
87                 # those consistently.
88                 self._keys = [ x.tag.split('_')[1] for x in analysis]
89             return self._keys
90         def values(self):
91             return [ self[x] for x in self.keys() ]
92         def items(self):
93             return zip(self.keys(), self.values())
94         def __len__(self):
95             return len(self.keys())
96
97     def __init__(self, xml=None):
98         self.pathname = None
99         self.tree = None
100
101         # parse lane parameters out of the config.xml file
102         self.lanes = Gerald.LaneSpecificRunParameters(self)
103
104         self.summary = None
105         self.eland_results = None
106
107         if xml is not None:
108             self.set_elements(xml)
109
110     def _get_date(self):
111         if self.tree is None:
112             return datetime.today()
113         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
114         epochstamp = time.mktime(time.strptime(timestamp, '%c'))
115         return datetime.fromtimestamp(epochstamp)
116     date = property(_get_date)
117
118     def _get_time(self):
119         return time.mktime(self.date.timetuple())
120     time = property(_get_time, doc='return run time as seconds since epoch')
121
122     def _get_version(self):
123         if self.tree is None:
124             return None
125         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
126     version = property(_get_version)
127
128     def dump(self):
129         """
130         Debugging function, report current object
131         """
132         print 'Gerald version:', self.version
133         print 'Gerald run date:', self.date
134         print 'Gerald config.xml:', self.tree
135         self.summary.dump()
136
137     def get_elements(self):
138         if self.tree is None or self.summary is None:
139             return None
140
141         gerald = ElementTree.Element(Gerald.GERALD,
142                                      {'version': unicode(Gerald.XML_VERSION)})
143         gerald.append(self.tree)
144         gerald.append(self.summary.get_elements())
145         if self.eland_results:
146             gerald.append(self.eland_results.get_elements())
147         return gerald
148
149     def set_elements(self, tree):
150         if tree.tag !=  Gerald.GERALD:
151             raise ValueError('exptected GERALD')
152         xml_version = int(tree.attrib.get('version', 0))
153         if xml_version > Gerald.XML_VERSION:
154             logging.warn('XML tree is a higher version than this class')
155         for element in list(tree):
156             tag = element.tag.lower()
157             if tag == Gerald.RUN_PARAMETERS.lower():
158                 self.tree = element
159             elif tag == Gerald.SUMMARY.lower():
160                 self.summary = Summary(xml=element)
161             elif tag == ELAND.ELAND.lower():
162                 self.eland_results = ELAND(xml=element)
163             else:
164                 logging.warn("Unrecognized tag %s" % (element.tag,))
165
166
167 def gerald(pathname):
168     g = Gerald()
169     g.pathname = pathname
170     path, name = os.path.split(pathname)
171     logging.info("Parsing gerald config.xml")
172     config_pathname = os.path.join(pathname, 'config.xml')
173     g.tree = ElementTree.parse(config_pathname).getroot()
174
175     # parse Summary.htm file
176     logging.info("Parsing Summary.htm")
177     summary_pathname = os.path.join(pathname, 'Summary.htm')
178     g.summary = Summary(summary_pathname)
179     # parse eland files
180     g.eland_results = eland(g.pathname, g)
181     return g
182
183 def tonumber(v):
184     """
185     Convert a value to int if its an int otherwise a float.
186     """
187     try:
188         v = int(v)
189     except ValueError, e:
190         v = float(v)
191     return v
192
193 def parse_mean_range(value):
194     """
195     Parse values like 123 +/- 4.5
196     """
197     if value.strip() == 'unknown':
198         return 0, 0
199
200     average, pm, deviation = value.split()
201     if pm != '+/-':
202         raise RuntimeError("Summary.htm file format changed")
203     return tonumber(average), tonumber(deviation)
204
205 def make_mean_range_element(parent, name, mean, deviation):
206     """
207     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
208     """
209     element = ElementTree.SubElement(parent, name,
210                                      { 'mean': unicode(mean),
211                                        'deviation': unicode(deviation)})
212     return element
213
214 def parse_mean_range_element(element):
215     """
216     Grab mean/deviation out of element
217     """
218     return (tonumber(element.attrib['mean']),
219             tonumber(element.attrib['deviation']))
220
221 def parse_summary_element(element):
222     """
223     Determine if we have a simple element or a mean/deviation element
224     """
225     if len(element.attrib) > 0:
226         return parse_mean_range_element(element)
227     else:
228         return element.text
229
230 class Summary(object):
231     """
232     Extract some useful information from the Summary.htm file
233     """
234     XML_VERSION = 2
235     SUMMARY = 'Summary'
236
237     class LaneResultSummary(object):
238         """
239         Parse the LaneResultSummary table out of Summary.htm
240         Mostly for the cluster number
241         """
242         LANE_RESULT_SUMMARY = 'LaneResultSummary'
243         TAGS = {
244           'LaneYield': 'lane_yield',
245           'Cluster': 'cluster', # Raw
246           'ClusterPF': 'cluster_pass_filter',
247           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
248           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
249           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
250           'PercentPassFilterAlign': 'percent_pass_filter_align',
251           'AverageAlignmentScore': 'average_alignment_score',
252           'PercentErrorRate': 'percent_error_rate'
253         }
254
255         def __init__(self, html=None, xml=None):
256             self.lane = None
257             self.lane_yield = None
258             self.cluster = None
259             self.cluster_pass_filter = None
260             self.average_first_cycle_intensity = None
261             self.percent_intensity_after_20_cycles = None
262             self.percent_pass_filter_clusters = None
263             self.percent_pass_filter_align = None
264             self.average_alignment_score = None
265             self.percent_error_rate = None
266
267             if html is not None:
268                 self.set_elements_from_html(html)
269             if xml is not None:
270                 self.set_elements(xml)
271
272         def set_elements_from_html(self, data):
273             if not len(data) in (8,10):
274                 raise RuntimeError("Summary.htm file format changed")
275
276             # same in pre-0.3.0 Summary file and 0.3 summary file
277             self.lane = data[0]
278
279             if len(data) == 8:
280                 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
281                 # this is the < 0.3 Pipeline version
282                 self.cluster = parsed_data[0]
283                 self.average_first_cycle_intensity = parsed_data[1]
284                 self.percent_intensity_after_20_cycles = parsed_data[2]
285                 self.percent_pass_filter_clusters = parsed_data[3]
286                 self.percent_pass_filter_align = parsed_data[4]
287                 self.average_alignment_score = parsed_data[5]
288                 self.percent_error_rate = parsed_data[6]
289             elif len(data) == 10:
290                 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
291                 # this is the >= 0.3 summary file
292                 self.lane_yield = data[1]
293                 self.cluster = parsed_data[0]
294                 self.cluster_pass_filter = parsed_data[1]
295                 self.average_first_cycle_intensity = parsed_data[2]
296                 self.percent_intensity_after_20_cycles = parsed_data[3]
297                 self.percent_pass_filter_clusters = parsed_data[4]
298                 self.percent_pass_filter_align = parsed_data[5]
299                 self.average_alignment_score = parsed_data[6]
300                 self.percent_error_rate = parsed_data[7]
301
302         def get_elements(self):
303             lane_result = ElementTree.Element(
304                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
305                             {'lane': self.lane})
306             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
307                 value = getattr(self, variable_name)
308                 if value is None:
309                     continue
310                 # it looks like a sequence
311                 elif type(value) in (types.TupleType, types.ListType):
312                     element = make_mean_range_element(
313                       lane_result,
314                       tag,
315                       *value
316                     )
317                 else:
318                     element = ElementTree.SubElement(lane_result, tag)
319                     element.text = value
320             return lane_result
321
322         def set_elements(self, tree):
323             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
324                 raise ValueError('Expected %s' % (
325                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
326             self.lane = tree.attrib['lane']
327             tags = Summary.LaneResultSummary.TAGS
328             for element in list(tree):
329                 try:
330                     variable_name = tags[element.tag]
331                     setattr(self, variable_name,
332                             parse_summary_element(element))
333                 except KeyError, e:
334                     logging.warn('Unrecognized tag %s' % (element.tag,))
335
336     def __init__(self, filename=None, xml=None):
337         self.lane_results = {}
338
339         if filename is not None:
340             self._extract_lane_results(filename)
341         if xml is not None:
342             self.set_elements(xml)
343
344     def __getitem__(self, key):
345         return self.lane_results[key]
346
347     def __len__(self):
348         return len(self.lane_results)
349
350     def keys(self):
351         return self.lane_results.keys()
352
353     def values(self):
354         return self.lane_results.values()
355
356     def items(self):
357         return self.lane_results.items()
358
359     def _flattened_row(self, row):
360         """
361         flatten the children of a <tr>...</tr>
362         """
363         return [flatten(x) for x in row.getchildren() ]
364
365     def _parse_table(self, table):
366         """
367         assumes the first line is the header of a table,
368         and that the remaining rows are data
369         """
370         rows = table.getchildren()
371         data = []
372         for r in rows:
373             data.append(self._flattened_row(r))
374         return data
375
376     def _extract_named_tables(self, pathname):
377         """
378         extract all the 'named' tables from a Summary.htm file
379         and return as a dictionary
380
381         Named tables are <h2>...</h2><table>...</table> pairs
382         The contents of the h2 tag is considered to the name
383         of the table.
384         """
385         tree = ElementTree.parse(pathname).getroot()
386         body = tree.find('body')
387         tables = {}
388         for i in range(len(body)):
389             if body[i].tag == 'h2' and body[i+1].tag == 'table':
390                 # we have an interesting table
391                 name = flatten(body[i])
392                 table = body[i+1]
393                 data = self._parse_table(table)
394                 tables[name] = data
395         return tables
396
397     def _extract_lane_results(self, pathname):
398         """
399         extract the Lane Results Summary table
400         """
401
402         tables = self._extract_named_tables(pathname)
403
404         # parse lane result summary
405         lane_summary = tables['Lane Results Summary']
406         # this is version 1 of the summary file
407         if len(lane_summary[-1]) == 8:
408             # strip header
409             headers = lane_summary[0]
410             # grab the lane by lane data
411             lane_summary = lane_summary[1:]
412
413         # this is version 2 of the summary file
414         if len(lane_summary[-1]) == 10:
415             # lane_summary[0] is a different less specific header row
416             headers = lane_summary[1]
417             lane_summary = lane_summary[2:10]
418             # after the last lane, there's a set of chip wide averages
419
420         for r in lane_summary:
421             lrs = Summary.LaneResultSummary(html=r)
422             self.lane_results[lrs.lane] = lrs
423
424     def get_elements(self):
425         summary = ElementTree.Element(Summary.SUMMARY,
426                                       {'version': unicode(Summary.XML_VERSION)})
427         for lane in self.lane_results.values():
428             summary.append(lane.get_elements())
429         return summary
430
431     def set_elements(self, tree):
432         if tree.tag != Summary.SUMMARY:
433             return ValueError("Expected %s" % (Summary.SUMMARY,))
434         xml_version = int(tree.attrib.get('version', 0))
435         if xml_version > Summary.XML_VERSION:
436             logging.warn('Summary XML tree is a higher version than this class')
437         for element in list(tree):
438             lrs = Summary.LaneResultSummary()
439             lrs.set_elements(element)
440             self.lane_results[lrs.lane] = lrs
441
442     def dump(self):
443         """
444         Debugging function, report current object
445         """
446         pass
447
448
449 def build_genome_fasta_map(genome_dir):
450     # build fasta to fasta file map
451     logging.info("Building genome map")
452     genome = genome_dir.split(os.path.sep)[-1]
453     fasta_map = {}
454     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
455         is_link = False
456         if os.path.islink(vld_file):
457             is_link = True
458         vld_file = os.path.realpath(vld_file)
459         path, vld_name = os.path.split(vld_file)
460         name, ext = os.path.splitext(vld_name)
461         if is_link:
462             fasta_map[name] = name
463         else:
464             fasta_map[name] = os.path.join(genome, name)
465     return fasta_map
466
467 class ElandLane(object):
468     """
469     Process an eland result file
470     """
471     XML_VERSION = 1
472     LANE = 'ElandLane'
473     SAMPLE_NAME = 'SampleName'
474     LANE_ID = 'LaneID'
475     GENOME_MAP = 'GenomeMap'
476     GENOME_ITEM = 'GenomeItem'
477     MAPPED_READS = 'MappedReads'
478     MAPPED_ITEM = 'MappedItem'
479     MATCH_CODES = 'MatchCodes'
480     MATCH_ITEM = 'Code'
481     READS = 'Reads'
482
483     def __init__(self, pathname=None, genome_map=None, xml=None):
484         self.pathname = pathname
485         self._sample_name = None
486         self._lane_id = None
487         self._reads = None
488         self._mapped_reads = None
489         self._match_codes = None
490         if genome_map is None:
491             genome_map = {}
492         self.genome_map = genome_map
493
494         if xml is not None:
495             self.set_elements(xml)
496
497     def _update(self):
498         """
499         Actually read the file and actually count the reads
500         """
501         # can't do anything if we don't have a file to process
502         if self.pathname is None:
503             return
504
505         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
506             raise RuntimeError("Eland isn't done, try again later.")
507
508         logging.info("summarizing results for %s" % (self.pathname))
509         reads = 0
510         mapped_reads = {}
511
512         match_codes = {'NM':0, 'QC':0, 'RM':0,
513                        'U0':0, 'U1':0, 'U2':0,
514                        'R0':0, 'R1':0, 'R2':0,
515                       }
516         for line in autoopen(self.pathname,'r'):
517             reads += 1
518             fields = line.split()
519             # code = fields[2]
520             # match_codes[code] = match_codes.setdefault(code, 0) + 1
521             # the QC/NM etc codes are in the 3rd field and always present
522             match_codes[fields[2]] += 1
523             # ignore lines that don't have a fasta filename
524             if len(fields) < 7:
525                 continue
526             fasta = self.genome_map.get(fields[6], fields[6])
527             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
528         self._match_codes = match_codes
529         self._mapped_reads = mapped_reads
530         self._reads = reads
531
532     def _update_name(self):
533         # extract the sample name
534         if self.pathname is None:
535             return
536
537         path, name = os.path.split(self.pathname)
538         split_name = name.split('_')
539         self._sample_name = split_name[0]
540         self._lane_id = split_name[1]
541
542     def _get_sample_name(self):
543         if self._sample_name is None:
544             self._update_name()
545         return self._sample_name
546     sample_name = property(_get_sample_name)
547
548     def _get_lane_id(self):
549         if self._lane_id is None:
550             self._update_name()
551         return self._lane_id
552     lane_id = property(_get_lane_id)
553
554     def _get_reads(self):
555         if self._reads is None:
556             self._update()
557         return self._reads
558     reads = property(_get_reads)
559
560     def _get_mapped_reads(self):
561         if self._mapped_reads is None:
562             self._update()
563         return self._mapped_reads
564     mapped_reads = property(_get_mapped_reads)
565
566     def _get_match_codes(self):
567         if self._match_codes is None:
568             self._update()
569         return self._match_codes
570     match_codes = property(_get_match_codes)
571
572     def get_elements(self):
573         lane = ElementTree.Element(ElandLane.LANE,
574                                    {'version':
575                                     unicode(ElandLane.XML_VERSION)})
576         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
577         sample_tag.text = self.sample_name
578         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
579         lane_tag.text = self.lane_id
580         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
581         for k, v in self.genome_map.items():
582             item = ElementTree.SubElement(
583                 genome_map, ElandLane.GENOME_ITEM,
584                 {'name':k, 'value':unicode(v)})
585         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
586         for k, v in self.mapped_reads.items():
587             item = ElementTree.SubElement(
588                 mapped_reads, ElandLane.MAPPED_ITEM,
589                 {'name':k, 'value':unicode(v)})
590         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
591         for k, v in self.match_codes.items():
592             item = ElementTree.SubElement(
593                 match_codes, ElandLane.MATCH_ITEM,
594                 {'name':k, 'value':unicode(v)})
595         reads = ElementTree.SubElement(lane, ElandLane.READS)
596         reads.text = unicode(self.reads)
597
598         return lane
599
600     def set_elements(self, tree):
601         if tree.tag != ElandLane.LANE:
602             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
603
604         # reset dictionaries
605         self._mapped_reads = {}
606         self._match_codes = {}
607
608         for element in tree:
609             tag = element.tag.lower()
610             if tag == ElandLane.SAMPLE_NAME.lower():
611                 self._sample_name = element.text
612             elif tag == ElandLane.LANE_ID.lower():
613                 self._lane_id = element.text
614             elif tag == ElandLane.GENOME_MAP.lower():
615                 for child in element:
616                     name = child.attrib['name']
617                     value = child.attrib['value']
618                     self.genome_map[name] = value
619             elif tag == ElandLane.MAPPED_READS.lower():
620                 for child in element:
621                     name = child.attrib['name']
622                     value = child.attrib['value']
623                     self._mapped_reads[name] = int(value)
624             elif tag == ElandLane.MATCH_CODES.lower():
625                 for child in element:
626                     name = child.attrib['name']
627                     value = int(child.attrib['value'])
628                     self._match_codes[name] = value
629             elif tag == ElandLane.READS.lower():
630                 self._reads = int(element.text)
631             else:
632                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
633
634 def extract_eland_sequence(instream, outstream, start, end):
635     """
636     Extract a chunk of sequence out of an eland file
637     """
638     for line in instream:
639         record = line.split()
640         if len(record) > 1:
641             result = [record[0], record[1][start:end]]
642         else:
643             result = [record[0][start:end]]
644         outstream.write("\t".join(result))
645         outstream.write(os.linesep)
646
647 class ELAND(object):
648     """
649     Summarize information from eland files
650     """
651     XML_VERSION = 1
652
653     ELAND = 'ElandCollection'
654     LANE = 'Lane'
655     LANE_ID = 'id'
656
657     def __init__(self, xml=None):
658         # we need information from the gerald config.xml
659         self.results = {}
660
661         if xml is not None:
662             self.set_elements(xml)
663
664     def __len__(self):
665         return len(self.results)
666
667     def keys(self):
668         return self.results.keys()
669
670     def values(self):
671         return self.results.values()
672
673     def items(self):
674         return self.results.items()
675
676     def __getitem__(self, key):
677         return self.results[key]
678
679     def get_elements(self):
680         root = ElementTree.Element(ELAND.ELAND,
681                                    {'version': unicode(ELAND.XML_VERSION)})
682         for lane_id, lane in self.results.items():
683             eland_lane = lane.get_elements()
684             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
685             root.append(eland_lane)
686         return root
687
688     def set_elements(self, tree):
689         if tree.tag.lower() != ELAND.ELAND.lower():
690             raise ValueError('Expecting %s', ELAND.ELAND)
691         for element in list(tree):
692             lane_id = element.attrib[ELAND.LANE_ID]
693             lane = ElandLane(xml=element)
694             self.results[lane_id] = lane
695
696 def eland(basedir, gerald=None, genome_maps=None):
697     e = ELAND()
698
699     file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
700     if len(file_list) == 0:
701         # lets handle compressed eland files too
702         file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
703
704     for pathname in file_list:
705         # yes the lane_id is also being computed in ElandLane._update
706         # I didn't want to clutter up my constructor
707         # but I needed to persist the sample_name/lane_id for
708         # runfolder summary_report
709         path, name = os.path.split(pathname)
710         logging.info("Adding eland file %s" %(name,))
711         split_name = name.split('_')
712         lane_id = split_name[1]
713
714         if genome_maps is not None:
715             genome_map = genome_maps[lane_id]
716         elif gerald is not None:
717             genome_dir = gerald.lanes[lane_id].eland_genome
718             genome_map = build_genome_fasta_map(genome_dir)
719         else:
720             genome_map = {}
721
722         eland_result = ElandLane(pathname, genome_map)
723         e.results[lane_id] = eland_result
724     return e