1a6bce18cff7eb45f217bbb8133faaa39a8250d3
[htsworkflow.git] / gaworkflow / pipeline / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 from glob import glob
6 import logging
7 import os
8 import stat
9 import time
10
11 from gaworkflow.pipeline.runfolder import \
12    ElementTree, \
13    EUROPEAN_STRPTIME, \
14    LANES_PER_FLOWCELL, \
15    VERSION_RE
16 from gaworkflow.util.ethelp import indent, flatten
17 from gaworkflow.util.opener import autoopen
18
19 class Gerald(object):
20     """
21     Capture meaning out of the GERALD directory
22     """
23     XML_VERSION = 1
24     GERALD='Gerald'
25     RUN_PARAMETERS='RunParameters'
26     SUMMARY='Summary'
27
28     class LaneParameters(object):
29         """
30         Make it easy to access elements of LaneSpecificRunParameters from python
31         """
32         def __init__(self, gerald, key):
33             self._gerald = gerald
34             self._key = key
35         
36         def __get_attribute(self, xml_tag):
37             subtree = self._gerald.tree.find('LaneSpecificRunParameters')
38             container = subtree.find(xml_tag)
39             if container is None or \
40                len(container.getchildren()) != LANES_PER_FLOWCELL:
41                 raise RuntimeError('GERALD config.xml file changed')
42             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
43             index = lanes.index(self._key)
44             #element = container.find(self._key)
45             element = container[index]
46             return element.text
47         def _get_analysis(self):
48             return self.__get_attribute('ANALYSIS')
49         analysis = property(_get_analysis)
50
51         def _get_eland_genome(self):
52             return self.__get_attribute('ELAND_GENOME')
53         eland_genome = property(_get_eland_genome)
54
55         def _get_read_length(self):
56             return self.__get_attribute('READ_LENGTH')
57         read_length = property(_get_read_length)
58
59         def _get_use_bases(self):
60             return self.__get_attribute('USE_BASES')
61         use_bases = property(_get_use_bases)
62
63     class LaneSpecificRunParameters(object):
64         """
65         Provide access to LaneSpecificRunParameters
66         """
67         def __init__(self, gerald):
68             self._gerald = gerald
69             self._keys = None
70         def __getitem__(self, key):
71             return Gerald.LaneParameters(self._gerald, key)
72         def keys(self):
73             if self._keys is None:
74                 tree = self._gerald.tree
75                 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
76                 # according to the pipeline specs I think their fields 
77                 # are sampleName_laneID, with sampleName defaulting to s
78                 # since laneIDs are constant lets just try using 
79                 # those consistently.
80                 self._keys = [ x.tag.split('_')[1] for x in analysis]
81             return self._keys
82         def values(self):
83             return [ self[x] for x in self.keys() ]
84         def items(self):
85             return zip(self.keys(), self.values())
86         def __len__(self):
87             return len(self.keys())
88
89     def __init__(self, xml=None):
90         self.pathname = None
91         self.tree = None
92
93         # parse lane parameters out of the config.xml file
94         self.lanes = Gerald.LaneSpecificRunParameters(self)
95
96         self.summary = None
97         self.eland_results = None
98
99         if xml is not None:
100             self.set_elements(xml)
101
102     def _get_date(self):
103         if self.tree is None:
104             return datetime.today()
105         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
106         epochstamp = time.mktime(time.strptime(timestamp, '%c'))
107         return datetime.fromtimestamp(epochstamp)
108     date = property(_get_date)
109
110     def _get_time(self):
111         return time.mktime(self.date.timetuple())
112     time = property(_get_time, doc='return run time as seconds since epoch')
113
114     def _get_version(self):
115         if self.tree is None:
116             return None
117         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
118     version = property(_get_version)
119
120     def dump(self):
121         """
122         Debugging function, report current object
123         """
124         print 'Gerald version:', self.version
125         print 'Gerald run date:', self.date
126         print 'Gerald config.xml:', self.tree
127         self.summary.dump()
128
129     def get_elements(self):
130         if self.tree is None or self.summary is None:
131             return None
132
133         gerald = ElementTree.Element(Gerald.GERALD, 
134                                      {'version': unicode(Gerald.XML_VERSION)})
135         gerald.append(self.tree)
136         gerald.append(self.summary.get_elements())
137         if self.eland_results:
138             gerald.append(self.eland_results.get_elements())
139         return gerald
140
141     def set_elements(self, tree):
142         if tree.tag !=  Gerald.GERALD:
143             raise ValueError('exptected GERALD')
144         xml_version = int(tree.attrib.get('version', 0))
145         if xml_version > Gerald.XML_VERSION:
146             logging.warn('XML tree is a higher version than this class')
147         for element in list(tree):
148             tag = element.tag.lower()
149             if tag == Gerald.RUN_PARAMETERS.lower():
150                 self.tree = element
151             elif tag == Gerald.SUMMARY.lower():
152                 self.summary = Summary(xml=element)
153             elif tag == ELAND.ELAND.lower():
154                 self.eland_results = ELAND(xml=element)
155             else:
156                 logging.warn("Unrecognized tag %s" % (element.tag,))
157         
158
159 def gerald(pathname):
160     g = Gerald()
161     g.pathname = pathname
162     path, name = os.path.split(pathname)
163     config_pathname = os.path.join(pathname, 'config.xml')
164     g.tree = ElementTree.parse(config_pathname).getroot()
165
166     # parse Summary.htm file
167     summary_pathname = os.path.join(pathname, 'Summary.htm')
168     g.summary = Summary(summary_pathname)
169     # parse eland files
170     g.eland_results = eland(g.pathname, g)
171     return g
172
173 def tonumber(v):
174     """
175     Convert a value to int if its an int otherwise a float.
176     """
177     try:
178         v = int(v)
179     except ValueError, e:
180         v = float(v)
181     return v
182
183 def parse_mean_range(value):
184     """
185     Parse values like 123 +/- 4.5
186     """
187     if value.strip() == 'unknown':
188         return 0, 0
189
190     average, pm, deviation = value.split()
191     if pm != '+/-':
192         raise RuntimeError("Summary.htm file format changed")
193     return tonumber(average), tonumber(deviation)
194
195 def make_mean_range_element(parent, name, mean, deviation):
196     """
197     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
198     """
199     element = ElementTree.SubElement(parent, name,
200                                      { 'mean': unicode(mean),
201                                        'deviation': unicode(deviation)})
202     return element
203
204 def parse_mean_range_element(element):
205     """
206     Grab mean/deviation out of element
207     """
208     return (tonumber(element.attrib['mean']), 
209             tonumber(element.attrib['deviation']))
210
211 class Summary(object):
212     """
213     Extract some useful information from the Summary.htm file
214     """
215     XML_VERSION = 1
216     SUMMARY = 'Summary'
217
218     class LaneResultSummary(object):
219         """
220         Parse the LaneResultSummary table out of Summary.htm
221         Mostly for the cluster number
222         """
223         LANE_RESULT_SUMMARY = 'LaneResultSummary'
224         TAGS = { 
225           'Cluster': 'cluster',
226           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
227           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
228           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
229           'PercentPassFilterAlign': 'percent_pass_filter_align',
230           'AverageAlignmentScore': 'average_alignment_score',
231           'PercentErrorRate': 'percent_error_rate'
232         }
233                  
234         def __init__(self, html=None, xml=None):
235             self.lane = None
236             self.cluster = None
237             self.average_first_cycle_intensity = None
238             self.percent_intensity_after_20_cycles = None
239             self.percent_pass_filter_clusters = None
240             self.percent_pass_filter_align = None
241             self.average_alignment_score = None
242             self.percent_error_rate = None
243
244             if html is not None:
245                 self.set_elements_from_html(html)
246             if xml is not None:
247                 self.set_elements(xml)
248
249         def set_elements_from_html(self, row_element):
250             data = [ flatten(x) for x in row_element ]
251             if len(data) != 8:
252                 raise RuntimeError("Summary.htm file format changed")
253
254             self.lane = data[0]
255             self.cluster = parse_mean_range(data[1])
256             self.average_first_cycle_intensity = parse_mean_range(data[2])
257             self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
258             self.percent_pass_filter_clusters = parse_mean_range(data[4])
259             self.percent_pass_filter_align = parse_mean_range(data[5])
260             self.average_alignment_score = parse_mean_range(data[6])
261             self.percent_error_rate = parse_mean_range(data[7])
262
263         def get_elements(self):
264             lane_result = ElementTree.Element(
265                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
266                             {'lane': self.lane})
267             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
268                 element = make_mean_range_element(
269                     lane_result,
270                     tag,
271                     *getattr(self, variable_name)
272                 )
273             return lane_result
274
275         def set_elements(self, tree):
276             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
277                 raise ValueError('Expected %s' % (
278                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
279             self.lane = tree.attrib['lane']
280             tags = Summary.LaneResultSummary.TAGS
281             for element in list(tree):
282                 try:
283                     variable_name = tags[element.tag]
284                     setattr(self, variable_name, 
285                             parse_mean_range_element(element))
286                 except KeyError, e:
287                     logging.warn('Unrecognized tag %s' % (element.tag,))
288
289     def __init__(self, filename=None, xml=None):
290         self.lane_results = {}
291
292         if filename is not None:
293             self._extract_lane_results(filename)
294         if xml is not None:
295             self.set_elements(xml)
296
297     def __getitem__(self, key):
298         return self.lane_results[key]
299
300     def __len__(self):
301         return len(self.lane_results)
302
303     def keys(self):
304         return self.lane_results.keys()
305
306     def values(self):
307         return self.lane_results.values()
308
309     def items(self):
310         return self.lane_results.items()
311
312     def _extract_lane_results(self, pathname):
313         """
314         extract the Lane Results Summary table
315         """
316         tree = ElementTree.parse(pathname).getroot()
317         if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
318             raise RuntimeError("Summary.htm file format changed")
319
320         tables = tree.findall('*//table')
321
322         # parse lane result summary
323         lane_summary = tables[2]
324         rows = lane_summary.getchildren()
325         headers = rows[0].getchildren()
326         if flatten(headers[2]) != 'Av 1st Cycle Int ':
327             raise RuntimeError("Summary.htm file format changed")
328
329         for r in rows[1:]:
330             lrs = Summary.LaneResultSummary(html=r)
331             self.lane_results[lrs.lane] = lrs
332
333     def get_elements(self):
334         summary = ElementTree.Element(Summary.SUMMARY, 
335                                       {'version': unicode(Summary.XML_VERSION)})
336         for lane in self.lane_results.values():
337             summary.append(lane.get_elements())
338         return summary
339
340     def set_elements(self, tree):
341         if tree.tag != Summary.SUMMARY:
342             return ValueError("Expected %s" % (Summary.SUMMARY,))
343         xml_version = int(tree.attrib.get('version', 0))
344         if xml_version > Summary.XML_VERSION:
345             logging.warn('Summary XML tree is a higher version than this class')
346         for element in list(tree):
347             lrs = Summary.LaneResultSummary()
348             lrs.set_elements(element)
349             self.lane_results[lrs.lane] = lrs
350
351     def dump(self):
352         """
353         Debugging function, report current object
354         """
355         pass
356
357
358 def build_genome_fasta_map(genome_dir):
359     # build fasta to fasta file map
360     genome = genome_dir.split(os.path.sep)[-1]
361     fasta_map = {}
362     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
363         is_link = False
364         if os.path.islink(vld_file):
365             is_link = True
366         vld_file = os.path.realpath(vld_file)
367         path, vld_name = os.path.split(vld_file)
368         name, ext = os.path.splitext(vld_name)
369         if is_link:
370             fasta_map[name] = name
371         else:
372             fasta_map[name] = os.path.join(genome, name)
373     return fasta_map
374     
375 class ElandLane(object):
376     """
377     Process an eland result file
378     """
379     XML_VERSION = 1
380     LANE = 'ElandLane'
381     SAMPLE_NAME = 'SampleName'
382     LANE_ID = 'LaneID'
383     GENOME_MAP = 'GenomeMap'
384     GENOME_ITEM = 'GenomeItem'
385     MAPPED_READS = 'MappedReads'
386     MAPPED_ITEM = 'MappedItem'
387     MATCH_CODES = 'MatchCodes'
388     MATCH_ITEM = 'Code'
389     READS = 'Reads'
390
391     def __init__(self, pathname=None, genome_map=None, xml=None):
392         self.pathname = pathname
393         self._sample_name = None
394         self._lane_id = None
395         self._reads = None
396         self._mapped_reads = None
397         self._match_codes = None
398         if genome_map is None:
399             genome_map = {}
400         self.genome_map = genome_map
401         
402         if xml is not None:
403             self.set_elements(xml)
404
405     def _update(self):
406         """
407         Actually read the file and actually count the reads
408         """
409         # can't do anything if we don't have a file to process
410         if self.pathname is None:
411             return
412
413         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
414             raise RuntimeError("Eland isn't done, try again later.")
415
416         reads = 0
417         mapped_reads = {}
418
419         match_codes = {'NM':0, 'QC':0, 'RM':0, 
420                        'U0':0, 'U1':0, 'U2':0,
421                        'R0':0, 'R1':0, 'R2':0,
422                       }
423         for line in autoopen(self.pathname,'r'):
424             reads += 1
425             fields = line.split()
426             # code = fields[2]
427             # match_codes[code] = match_codes.setdefault(code, 0) + 1
428             # the QC/NM etc codes are in the 3rd field and always present
429             match_codes[fields[2]] += 1
430             # ignore lines that don't have a fasta filename
431             if len(fields) < 7:
432                 continue
433             fasta = self.genome_map.get(fields[6], fields[6])
434             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
435         self._match_codes = match_codes
436         self._mapped_reads = mapped_reads
437         self._reads = reads
438
439     def _update_name(self):
440         # extract the sample name
441         if self.pathname is None:
442             return
443
444         path, name = os.path.split(self.pathname)
445         split_name = name.split('_')
446         self._sample_name = split_name[0]
447         self._lane_id = split_name[1]
448
449     def _get_sample_name(self):
450         if self._sample_name is None:
451             self._update_name()
452         return self._sample_name
453     sample_name = property(_get_sample_name)
454
455     def _get_lane_id(self):
456         if self._lane_id is None:
457             self._update_name()
458         return self._lane_id
459     lane_id = property(_get_lane_id)
460
461     def _get_reads(self):
462         if self._reads is None:
463             self._update()
464         return self._reads
465     reads = property(_get_reads)
466
467     def _get_mapped_reads(self):
468         if self._mapped_reads is None:
469             self._update()
470         return self._mapped_reads
471     mapped_reads = property(_get_mapped_reads)
472
473     def _get_match_codes(self):
474         if self._match_codes is None:
475             self._update()
476         return self._match_codes
477     match_codes = property(_get_match_codes)
478
479     def get_elements(self):
480         lane = ElementTree.Element(ElandLane.LANE, 
481                                    {'version': 
482                                     unicode(ElandLane.XML_VERSION)})
483         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
484         sample_tag.text = self.sample_name
485         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
486         lane_tag.text = self.lane_id
487         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
488         for k, v in self.genome_map.items():
489             item = ElementTree.SubElement(
490                 genome_map, ElandLane.GENOME_ITEM, 
491                 {'name':k, 'value':unicode(v)})
492         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
493         for k, v in self.mapped_reads.items():
494             item = ElementTree.SubElement(
495                 mapped_reads, ElandLane.MAPPED_ITEM, 
496                 {'name':k, 'value':unicode(v)})
497         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
498         for k, v in self.match_codes.items():
499             item = ElementTree.SubElement(
500                 match_codes, ElandLane.MATCH_ITEM, 
501                 {'name':k, 'value':unicode(v)})
502         reads = ElementTree.SubElement(lane, ElandLane.READS)
503         reads.text = unicode(self.reads)
504
505         return lane
506
507     def set_elements(self, tree):
508         if tree.tag != ElandLane.LANE:
509             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
510
511         # reset dictionaries
512         self._mapped_reads = {}
513         self._match_codes = {}
514         
515         for element in tree:
516             tag = element.tag.lower()
517             if tag == ElandLane.SAMPLE_NAME.lower():
518                 self._sample_name = element.text
519             elif tag == ElandLane.LANE_ID.lower():
520                 self._lane_id = element.text
521             elif tag == ElandLane.GENOME_MAP.lower():
522                 for child in element:
523                     name = child.attrib['name']
524                     value = child.attrib['value']
525                     self.genome_map[name] = value
526             elif tag == ElandLane.MAPPED_READS.lower():
527                 for child in element:
528                     name = child.attrib['name']
529                     value = child.attrib['value']
530                     self._mapped_reads[name] = int(value)
531             elif tag == ElandLane.MATCH_CODES.lower():
532                 for child in element:
533                     name = child.attrib['name']
534                     value = int(child.attrib['value'])
535                     self._match_codes[name] = value
536             elif tag == ElandLane.READS.lower():
537                 self._reads = int(element.text)
538             else:
539                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
540
541 def extract_eland_sequence(instream, outstream, start, end):
542     """
543     Extract a chunk of sequence out of an eland file
544     """
545     for line in instream:
546         record = line.split()
547         if len(record) > 1:
548             result = [record[0], record[1][start:end]]
549         else:
550             result = [record[0][start:end]]
551         outstream.write("\t".join(result))
552         outstream.write(os.linesep)
553
554 class ELAND(object):
555     """
556     Summarize information from eland files
557     """
558     XML_VERSION = 1
559
560     ELAND = 'ElandCollection'
561     LANE = 'Lane'
562     LANE_ID = 'id'
563
564     def __init__(self, xml=None):
565         # we need information from the gerald config.xml
566         self.results = {}
567         
568         if xml is not None:
569             self.set_elements(xml)
570
571     def __len__(self):
572         return len(self.results)
573
574     def keys(self):
575         return self.results.keys()
576     
577     def values(self):
578         return self.results.values()
579
580     def items(self):
581         return self.results.items()
582
583     def __getitem__(self, key):
584         return self.results[key]
585
586     def get_elements(self):
587         root = ElementTree.Element(ELAND.ELAND, 
588                                    {'version': unicode(ELAND.XML_VERSION)})
589         for lane_id, lane in self.results.items():
590             eland_lane = lane.get_elements()
591             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
592             root.append(eland_lane)
593         return root
594
595     def set_elements(self, tree):
596         if tree.tag.lower() != ELAND.ELAND.lower():
597             raise ValueError('Expecting %s', ELAND.ELAND)
598         for element in list(tree):
599             lane_id = element.attrib[ELAND.LANE_ID]
600             lane = ElandLane(xml=element)
601             self.results[lane_id] = lane
602
603 def eland(basedir, gerald=None, genome_maps=None):
604     e = ELAND()
605
606     file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
607     if len(file_list) == 0:
608         # lets handle compressed eland files too
609         file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
610
611     for pathname in file_list:
612         # yes the lane_id is also being computed in ElandLane._update
613         # I didn't want to clutter up my constructor
614         # but I needed to persist the sample_name/lane_id for
615         # runfolder summary_report
616         path, name = os.path.split(pathname)
617         split_name = name.split('_')
618         lane_id = split_name[1]
619
620         if genome_maps is not None:
621             genome_map = genome_maps[lane_id]
622         elif gerald is not None:
623             genome_dir = gerald.lanes[lane_id].eland_genome
624             genome_map = build_genome_fasta_map(genome_dir)
625         else:
626             genome_map = {}
627
628         eland_result = ElandLane(pathname, genome_map)
629         e.results[lane_id] = eland_result
630     return e