83646075355b1d9e565c514ecb4acceb18aae5e6
[htsworkflow.git] / gaworkflow / pipeline / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 from glob import glob
6 import logging
7 import os
8 import stat
9 import time
10
11 from gaworkflow.pipeline.runfolder import \
12    ElementTree, \
13    EUROPEAN_STRPTIME, \
14    LANES_PER_FLOWCELL, \
15    VERSION_RE
16 from gaworkflow.util.ethelp import indent, flatten
17
18 class Gerald(object):
19     """
20     Capture meaning out of the GERALD directory
21     """
22     XML_VERSION = 1
23     GERALD='Gerald'
24     RUN_PARAMETERS='RunParameters'
25     SUMMARY='Summary'
26
27     class LaneParameters(object):
28         """
29         Make it easy to access elements of LaneSpecificRunParameters from python
30         """
31         def __init__(self, gerald, key):
32             self._gerald = gerald
33             self._key = key
34         
35         def __get_attribute(self, xml_tag):
36             subtree = self._gerald.tree.find('LaneSpecificRunParameters')
37             container = subtree.find(xml_tag)
38             if container is None or \
39                len(container.getchildren()) != LANES_PER_FLOWCELL:
40                 raise RuntimeError('GERALD config.xml file changed')
41             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
42             index = lanes.index(self._key)
43             #element = container.find(self._key)
44             element = container[index]
45             return element.text
46         def _get_analysis(self):
47             return self.__get_attribute('ANALYSIS')
48         analysis = property(_get_analysis)
49
50         def _get_eland_genome(self):
51             return self.__get_attribute('ELAND_GENOME')
52         eland_genome = property(_get_eland_genome)
53
54         def _get_read_length(self):
55             return self.__get_attribute('READ_LENGTH')
56         read_length = property(_get_read_length)
57
58         def _get_use_bases(self):
59             return self.__get_attribute('USE_BASES')
60         use_bases = property(_get_use_bases)
61
62     class LaneSpecificRunParameters(object):
63         """
64         Provide access to LaneSpecificRunParameters
65         """
66         def __init__(self, gerald):
67             self._gerald = gerald
68             self._keys = None
69         def __getitem__(self, key):
70             return Gerald.LaneParameters(self._gerald, key)
71         def keys(self):
72             if self._keys is None:
73                 tree = self._gerald.tree
74                 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
75                 # according to the pipeline specs I think their fields 
76                 # are sampleName_laneID, with sampleName defaulting to s
77                 # since laneIDs are constant lets just try using 
78                 # those consistently.
79                 self._keys = [ x.tag.split('_')[1] for x in analysis]
80             return self._keys
81         def values(self):
82             return [ self[x] for x in self.keys() ]
83         def items(self):
84             return zip(self.keys(), self.values())
85         def __len__(self):
86             return len(self.keys())
87
88     def __init__(self, xml=None):
89         self.pathname = None
90         self.tree = None
91
92         # parse lane parameters out of the config.xml file
93         self.lanes = Gerald.LaneSpecificRunParameters(self)
94
95         self.summary = None
96         self.eland_results = None
97
98         if xml is not None:
99             self.set_elements(xml)
100
101     def _get_date(self):
102         if self.tree is None:
103             return datetime.today()
104         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
105         epochstamp = time.mktime(time.strptime(timestamp, '%c'))
106         return datetime.fromtimestamp(epochstamp)
107     date = property(_get_date)
108
109     def _get_time(self):
110         return time.mktime(self.date.timetuple())
111     time = property(_get_time, doc='return run time as seconds since epoch')
112
113     def _get_version(self):
114         if self.tree is None:
115             return None
116         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
117     version = property(_get_version)
118
119     def dump(self):
120         """
121         Debugging function, report current object
122         """
123         print 'Gerald version:', self.version
124         print 'Gerald run date:', self.date
125         print 'Gerald config.xml:', self.tree
126         self.summary.dump()
127
128     def get_elements(self):
129         if self.tree is None or self.summary is None:
130             return None
131
132         gerald = ElementTree.Element(Gerald.GERALD, 
133                                      {'version': unicode(Gerald.XML_VERSION)})
134         gerald.append(self.tree)
135         gerald.append(self.summary.get_elements())
136         if self.eland_results:
137             gerald.append(self.eland_results.get_elements())
138         return gerald
139
140     def set_elements(self, tree):
141         if tree.tag !=  Gerald.GERALD:
142             raise ValueError('exptected GERALD')
143         xml_version = int(tree.attrib.get('version', 0))
144         if xml_version > Gerald.XML_VERSION:
145             logging.warn('XML tree is a higher version than this class')
146         for element in list(tree):
147             tag = element.tag.lower()
148             if tag == Gerald.RUN_PARAMETERS.lower():
149                 self.tree = element
150             elif tag == Gerald.SUMMARY.lower():
151                 self.summary = Summary(xml=element)
152             elif tag == ELAND.ELAND.lower():
153                 self.eland_results = ELAND(xml=element)
154             else:
155                 logging.warn("Unrecognized tag %s" % (element.tag,))
156         
157
158 def gerald(pathname):
159     g = Gerald()
160     g.pathname = pathname
161     path, name = os.path.split(pathname)
162     config_pathname = os.path.join(pathname, 'config.xml')
163     g.tree = ElementTree.parse(config_pathname).getroot()
164
165     # parse Summary.htm file
166     summary_pathname = os.path.join(pathname, 'Summary.htm')
167     g.summary = Summary(summary_pathname)
168     # parse eland files
169     g.eland_results = eland(g.pathname, g)
170     return g
171
172 def tonumber(v):
173     """
174     Convert a value to int if its an int otherwise a float.
175     """
176     try:
177         v = int(v)
178     except ValueError, e:
179         v = float(v)
180     return v
181
182 def parse_mean_range(value):
183     """
184     Parse values like 123 +/- 4.5
185     """
186     if value.strip() == 'unknown':
187         return 0, 0
188
189     average, pm, deviation = value.split()
190     if pm != '+/-':
191         raise RuntimeError("Summary.htm file format changed")
192     return tonumber(average), tonumber(deviation)
193
194 def make_mean_range_element(parent, name, mean, deviation):
195     """
196     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
197     """
198     element = ElementTree.SubElement(parent, name,
199                                      { 'mean': unicode(mean),
200                                        'deviation': unicode(deviation)})
201     return element
202
203 def parse_mean_range_element(element):
204     """
205     Grab mean/deviation out of element
206     """
207     return (tonumber(element.attrib['mean']), 
208             tonumber(element.attrib['deviation']))
209
210 class Summary(object):
211     """
212     Extract some useful information from the Summary.htm file
213     """
214     XML_VERSION = 1
215     SUMMARY = 'Summary'
216
217     class LaneResultSummary(object):
218         """
219         Parse the LaneResultSummary table out of Summary.htm
220         Mostly for the cluster number
221         """
222         LANE_RESULT_SUMMARY = 'LaneResultSummary'
223         TAGS = { 
224           'Cluster': 'cluster',
225           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
226           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
227           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
228           'PercentPassFilterAlign': 'percent_pass_filter_align',
229           'AverageAlignmentScore': 'average_alignment_score',
230           'PercentErrorRate': 'percent_error_rate'
231         }
232                  
233         def __init__(self, html=None, xml=None):
234             self.lane = None
235             self.cluster = None
236             self.average_first_cycle_intensity = None
237             self.percent_intensity_after_20_cycles = None
238             self.percent_pass_filter_clusters = None
239             self.percent_pass_filter_align = None
240             self.average_alignment_score = None
241             self.percent_error_rate = None
242
243             if html is not None:
244                 self.set_elements_from_html(html)
245             if xml is not None:
246                 self.set_elements(xml)
247
248         def set_elements_from_html(self, row_element):
249             data = [ flatten(x) for x in row_element ]
250             if len(data) != 8:
251                 raise RuntimeError("Summary.htm file format changed")
252
253             self.lane = data[0]
254             self.cluster = parse_mean_range(data[1])
255             self.average_first_cycle_intensity = parse_mean_range(data[2])
256             self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
257             self.percent_pass_filter_clusters = parse_mean_range(data[4])
258             self.percent_pass_filter_align = parse_mean_range(data[5])
259             self.average_alignment_score = parse_mean_range(data[6])
260             self.percent_error_rate = parse_mean_range(data[7])
261
262         def get_elements(self):
263             lane_result = ElementTree.Element(
264                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
265                             {'lane': self.lane})
266             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
267                 element = make_mean_range_element(
268                     lane_result,
269                     tag,
270                     *getattr(self, variable_name)
271                 )
272             return lane_result
273
274         def set_elements(self, tree):
275             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
276                 raise ValueError('Expected %s' % (
277                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
278             self.lane = tree.attrib['lane']
279             tags = Summary.LaneResultSummary.TAGS
280             for element in list(tree):
281                 try:
282                     variable_name = tags[element.tag]
283                     setattr(self, variable_name, 
284                             parse_mean_range_element(element))
285                 except KeyError, e:
286                     logging.warn('Unrecognized tag %s' % (element.tag,))
287
288     def __init__(self, filename=None, xml=None):
289         self.lane_results = {}
290
291         if filename is not None:
292             self._extract_lane_results(filename)
293         if xml is not None:
294             self.set_elements(xml)
295
296     def __getitem__(self, key):
297         return self.lane_results[key]
298
299     def __len__(self):
300         return len(self.lane_results)
301
302     def keys(self):
303         return self.lane_results.keys()
304
305     def values(self):
306         return self.lane_results.values()
307
308     def items(self):
309         return self.lane_results.items()
310
311     def _extract_lane_results(self, pathname):
312         """
313         extract the Lane Results Summary table
314         """
315         tree = ElementTree.parse(pathname).getroot()
316         if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
317             raise RuntimeError("Summary.htm file format changed")
318
319         tables = tree.findall('*//table')
320
321         # parse lane result summary
322         lane_summary = tables[2]
323         rows = lane_summary.getchildren()
324         headers = rows[0].getchildren()
325         if flatten(headers[2]) != 'Av 1st Cycle Int ':
326             raise RuntimeError("Summary.htm file format changed")
327
328         for r in rows[1:]:
329             lrs = Summary.LaneResultSummary(html=r)
330             self.lane_results[lrs.lane] = lrs
331
332     def get_elements(self):
333         summary = ElementTree.Element(Summary.SUMMARY, 
334                                       {'version': unicode(Summary.XML_VERSION)})
335         for lane in self.lane_results.values():
336             summary.append(lane.get_elements())
337         return summary
338
339     def set_elements(self, tree):
340         if tree.tag != Summary.SUMMARY:
341             return ValueError("Expected %s" % (Summary.SUMMARY,))
342         xml_version = int(tree.attrib.get('version', 0))
343         if xml_version > Summary.XML_VERSION:
344             logging.warn('Summary XML tree is a higher version than this class')
345         for element in list(tree):
346             lrs = Summary.LaneResultSummary()
347             lrs.set_elements(element)
348             self.lane_results[lrs.lane] = lrs
349
350     def dump(self):
351         """
352         Debugging function, report current object
353         """
354         pass
355
356
357 def build_genome_fasta_map(genome_dir):
358     # build fasta to fasta file map
359     genome = genome_dir.split(os.path.sep)[-1]
360     fasta_map = {}
361     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
362         is_link = False
363         if os.path.islink(vld_file):
364             is_link = True
365         vld_file = os.path.realpath(vld_file)
366         path, vld_name = os.path.split(vld_file)
367         name, ext = os.path.splitext(vld_name)
368         if is_link:
369             fasta_map[name] = name
370         else:
371             fasta_map[name] = os.path.join(genome, name)
372     return fasta_map
373     
374 class ElandLane(object):
375     """
376     Process an eland result file
377     """
378     XML_VERSION = 1
379     LANE = 'ElandLane'
380     SAMPLE_NAME = 'SampleName'
381     LANE_ID = 'LaneID'
382     GENOME_MAP = 'GenomeMap'
383     GENOME_ITEM = 'GenomeItem'
384     MAPPED_READS = 'MappedReads'
385     MAPPED_ITEM = 'MappedItem'
386     MATCH_CODES = 'MatchCodes'
387     MATCH_ITEM = 'Code'
388     READS = 'Reads'
389
390     def __init__(self, pathname=None, genome_map=None, xml=None):
391         self.pathname = pathname
392         self._sample_name = None
393         self._lane_id = None
394         self._reads = None
395         self._mapped_reads = None
396         self._match_codes = None
397         if genome_map is None:
398             genome_map = {}
399         self.genome_map = genome_map
400         
401         if xml is not None:
402             self.set_elements(xml)
403
404     def _update(self):
405         """
406         Actually read the file and actually count the reads
407         """
408         # can't do anything if we don't have a file to process
409         if self.pathname is None:
410             return
411
412         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
413             raise RuntimeError("Eland isn't done, try again later.")
414
415         reads = 0
416         mapped_reads = {}
417
418         match_codes = {'NM':0, 'QC':0, 'RM':0, 
419                        'U0':0, 'U1':0, 'U2':0,
420                        'R0':0, 'R1':0, 'R2':0,
421                       }
422         for line in open(self.pathname):
423             reads += 1
424             fields = line.split()
425             # code = fields[2]
426             # match_codes[code] = match_codes.setdefault(code, 0) + 1
427             # the QC/NM etc codes are in the 3rd field and always present
428             match_codes[fields[2]] += 1
429             # ignore lines that don't have a fasta filename
430             if len(fields) < 7:
431                 continue
432             fasta = self.genome_map.get(fields[6], fields[6])
433             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
434         self._match_codes = match_codes
435         self._mapped_reads = mapped_reads
436         self._reads = reads
437
438     def _update_name(self):
439         # extract the sample name
440         if self.pathname is None:
441             return
442
443         path, name = os.path.split(self.pathname)
444         split_name = name.split('_')
445         self._sample_name = split_name[0]
446         self._lane_id = split_name[1]
447
448     def _get_sample_name(self):
449         if self._sample_name is None:
450             self._update_name()
451         return self._sample_name
452     sample_name = property(_get_sample_name)
453
454     def _get_lane_id(self):
455         if self._lane_id is None:
456             self._update_name()
457         return self._lane_id
458     lane_id = property(_get_lane_id)
459
460     def _get_reads(self):
461         if self._reads is None:
462             self._update()
463         return self._reads
464     reads = property(_get_reads)
465
466     def _get_mapped_reads(self):
467         if self._mapped_reads is None:
468             self._update()
469         return self._mapped_reads
470     mapped_reads = property(_get_mapped_reads)
471
472     def _get_match_codes(self):
473         if self._match_codes is None:
474             self._update()
475         return self._match_codes
476     match_codes = property(_get_match_codes)
477
478     def get_elements(self):
479         lane = ElementTree.Element(ElandLane.LANE, 
480                                    {'version': 
481                                     unicode(ElandLane.XML_VERSION)})
482         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
483         sample_tag.text = self.sample_name
484         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
485         lane_tag.text = self.lane_id
486         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
487         for k, v in self.genome_map.items():
488             item = ElementTree.SubElement(
489                 genome_map, ElandLane.GENOME_ITEM, 
490                 {'name':k, 'value':unicode(v)})
491         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
492         for k, v in self.mapped_reads.items():
493             item = ElementTree.SubElement(
494                 mapped_reads, ElandLane.MAPPED_ITEM, 
495                 {'name':k, 'value':unicode(v)})
496         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
497         for k, v in self.match_codes.items():
498             item = ElementTree.SubElement(
499                 match_codes, ElandLane.MATCH_ITEM, 
500                 {'name':k, 'value':unicode(v)})
501         reads = ElementTree.SubElement(lane, ElandLane.READS)
502         reads.text = unicode(self.reads)
503
504         return lane
505
506     def set_elements(self, tree):
507         if tree.tag != ElandLane.LANE:
508             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
509
510         # reset dictionaries
511         self._mapped_reads = {}
512         self._match_codes = {}
513         
514         for element in tree:
515             tag = element.tag.lower()
516             if tag == ElandLane.SAMPLE_NAME.lower():
517                 self._sample_name = element.text
518             elif tag == ElandLane.LANE_ID.lower():
519                 self._lane_id = element.text
520             elif tag == ElandLane.GENOME_MAP.lower():
521                 for child in element:
522                     name = child.attrib['name']
523                     value = child.attrib['value']
524                     self.genome_map[name] = value
525             elif tag == ElandLane.MAPPED_READS.lower():
526                 for child in element:
527                     name = child.attrib['name']
528                     value = child.attrib['value']
529                     self._mapped_reads[name] = int(value)
530             elif tag == ElandLane.MATCH_CODES.lower():
531                 for child in element:
532                     name = child.attrib['name']
533                     value = int(child.attrib['value'])
534                     self._match_codes[name] = value
535             elif tag == ElandLane.READS.lower():
536                 self._reads = int(element.text)
537             else:
538                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
539
540 def extract_eland_sequence(instream, outstream, start, end):
541     """
542     Extract a chunk of sequence out of an eland file
543     """
544     for line in instream:
545         record = line.split()
546         if len(record) > 1:
547             result = [record[0], record[1][start:end]]
548         else:
549             result = [record[0][start:end]]
550         outstream.write("\t".join(result))
551         outstream.write(os.linesep)
552
553 class ELAND(object):
554     """
555     Summarize information from eland files
556     """
557     XML_VERSION = 1
558
559     ELAND = 'ElandCollection'
560     LANE = 'Lane'
561     LANE_ID = 'id'
562
563     def __init__(self, xml=None):
564         # we need information from the gerald config.xml
565         self.results = {}
566         
567         if xml is not None:
568             self.set_elements(xml)
569
570     def __len__(self):
571         return len(self.results)
572
573     def keys(self):
574         return self.results.keys()
575     
576     def values(self):
577         return self.results.values()
578
579     def items(self):
580         return self.results.items()
581
582     def __getitem__(self, key):
583         return self.results[key]
584
585     def get_elements(self):
586         root = ElementTree.Element(ELAND.ELAND, 
587                                    {'version': unicode(ELAND.XML_VERSION)})
588         for lane_id, lane in self.results.items():
589             eland_lane = lane.get_elements()
590             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
591             root.append(eland_lane)
592         return root
593
594     def set_elements(self, tree):
595         if tree.tag.lower() != ELAND.ELAND.lower():
596             raise ValueError('Expecting %s', ELAND.ELAND)
597         for element in list(tree):
598             lane_id = element.attrib[ELAND.LANE_ID]
599             lane = ElandLane(xml=element)
600             self.results[lane_id] = lane
601
602 def eland(basedir, gerald=None, genome_maps=None):
603     e = ELAND()
604     for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
605         # yes the lane_id is also being computed in ElandLane._update
606         # I didn't want to clutter up my constructor
607         # but I needed to persist the sample_name/lane_id for
608         # runfolder summary_report
609         path, name = os.path.split(pathname)
610         split_name = name.split('_')
611         lane_id = split_name[1]
612
613         if genome_maps is not None:
614             genome_map = genome_maps[lane_id]
615         elif gerald is not None:
616             genome_dir = gerald.lanes[lane_id].eland_genome
617             genome_map = build_genome_fasta_map(genome_dir)
618         else:
619             genome_map = {}
620
621         eland_result = ElandLane(pathname, genome_map)
622         e.results[lane_id] = eland_result
623     return e