1f15eea08a85620ec0e9578c12762ceec987bc80
[htsworkflow.git] / gaworkflow / pipeline / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 from glob import glob
6 import logging
7 import os
8 import stat
9 import time
10
11 from gaworkflow.pipeline.runfolder import \
12    ElementTree, \
13    EUROPEAN_STRPTIME, \
14    LANES_PER_FLOWCELL, \
15    VERSION_RE
16 from gaworkflow.util.ethelp import indent, flatten
17
18 class Gerald(object):
19     """
20     Capture meaning out of the GERALD directory
21     """
22     XML_VERSION = 1
23     GERALD='Gerald'
24     RUN_PARAMETERS='RunParameters'
25     SUMMARY='Summary'
26
27     class LaneParameters(object):
28         """
29         Make it easy to access elements of LaneSpecificRunParameters from python
30         """
31         def __init__(self, gerald, key):
32             self._gerald = gerald
33             self._key = key
34         
35         def __get_attribute(self, xml_tag):
36             subtree = self._gerald.tree.find('LaneSpecificRunParameters')
37             container = subtree.find(xml_tag)
38             if container is None or \
39                len(container.getchildren()) != LANES_PER_FLOWCELL:
40                 raise RuntimeError('GERALD config.xml file changed')
41             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
42             index = lanes.index(self._key)
43             #element = container.find(self._key)
44             element = container[index]
45             return element.text
46         def _get_analysis(self):
47             return self.__get_attribute('ANALYSIS')
48         analysis = property(_get_analysis)
49
50         def _get_eland_genome(self):
51             return self.__get_attribute('ELAND_GENOME')
52         eland_genome = property(_get_eland_genome)
53
54         def _get_read_length(self):
55             return self.__get_attribute('READ_LENGTH')
56         read_length = property(_get_read_length)
57
58         def _get_use_bases(self):
59             return self.__get_attribute('USE_BASES')
60         use_bases = property(_get_use_bases)
61
62     class LaneSpecificRunParameters(object):
63         """
64         Provide access to LaneSpecificRunParameters
65         """
66         def __init__(self, gerald):
67             self._gerald = gerald
68             self._keys = None
69         def __getitem__(self, key):
70             return Gerald.LaneParameters(self._gerald, key)
71         def keys(self):
72             if self._keys is None:
73                 tree = self._gerald.tree
74                 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
75                 # according to the pipeline specs I think their fields 
76                 # are sampleName_laneID, with sampleName defaulting to s
77                 # since laneIDs are constant lets just try using 
78                 # those consistently.
79                 self._keys = [ x.tag.split('_')[1] for x in analysis]
80             return self._keys
81         def values(self):
82             return [ self[x] for x in self.keys() ]
83         def items(self):
84             return zip(self.keys(), self.values())
85         def __len__(self):
86             return len(self.keys())
87
88     def __init__(self, xml=None):
89         self.pathname = None
90         self.tree = None
91
92         # parse lane parameters out of the config.xml file
93         self.lanes = Gerald.LaneSpecificRunParameters(self)
94
95         self.summary = None
96         self.eland_results = None
97
98         if xml is not None:
99             self.set_elements(xml)
100
101     def _get_date(self):
102         if self.tree is None:
103             return datetime.today()
104         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
105         return datetime.strptime(timestamp, '%c')
106     date = property(_get_date)
107
108     def _get_time(self):
109         return time.mktime(self.date.timetuple())
110     time = property(_get_time, doc='return run time as seconds since epoch')
111
112     def _get_version(self):
113         if self.tree is None:
114             return None
115         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
116     version = property(_get_version)
117
118     def dump(self):
119         """
120         Debugging function, report current object
121         """
122         print 'Gerald version:', self.version
123         print 'Gerald run date:', self.date
124         print 'Gerald config.xml:', self.tree
125         self.summary.dump()
126
127     def get_elements(self):
128         if self.tree is None or self.summary is None:
129             return None
130
131         gerald = ElementTree.Element(Gerald.GERALD, 
132                                      {'version': unicode(Gerald.XML_VERSION)})
133         gerald.append(self.tree)
134         gerald.append(self.summary.get_elements())
135         if self.eland_results:
136             gerald.append(self.eland_results.get_elements())
137         return gerald
138
139     def set_elements(self, tree):
140         if tree.tag !=  Gerald.GERALD:
141             raise ValueError('exptected GERALD')
142         xml_version = int(tree.attrib.get('version', 0))
143         if xml_version > Gerald.XML_VERSION:
144             logging.warn('XML tree is a higher version than this class')
145         for element in list(tree):
146             tag = element.tag.lower()
147             if tag == Gerald.RUN_PARAMETERS.lower():
148                 self.tree = element
149             elif tag == Gerald.SUMMARY.lower():
150                 self.summary = Summary(xml=element)
151             elif tag == ELAND.ELAND.lower():
152                 self.eland_results = ELAND(xml=element)
153             else:
154                 logging.warn("Unrecognized tag %s" % (element.tag,))
155         
156
157 def gerald(pathname):
158     g = Gerald()
159     g.pathname = pathname
160     path, name = os.path.split(pathname)
161     config_pathname = os.path.join(pathname, 'config.xml')
162     g.tree = ElementTree.parse(config_pathname).getroot()
163
164     # parse Summary.htm file
165     summary_pathname = os.path.join(pathname, 'Summary.htm')
166     g.summary = Summary(summary_pathname)
167     # parse eland files
168     g.eland_results = eland(g.pathname, g)
169     return g
170
171 def tonumber(v):
172     """
173     Convert a value to int if its an int otherwise a float.
174     """
175     try:
176         v = int(v)
177     except ValueError, e:
178         v = float(v)
179     return v
180
181 def parse_mean_range(value):
182     """
183     Parse values like 123 +/- 4.5
184     """
185     if value.strip() == 'unknown':
186         return 0, 0
187
188     average, pm, deviation = value.split()
189     if pm != '+/-':
190         raise RuntimeError("Summary.htm file format changed")
191     return tonumber(average), tonumber(deviation)
192
193 def make_mean_range_element(parent, name, mean, deviation):
194     """
195     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
196     """
197     element = ElementTree.SubElement(parent, name,
198                                      { 'mean': unicode(mean),
199                                        'deviation': unicode(deviation)})
200     return element
201
202 def parse_mean_range_element(element):
203     """
204     Grab mean/deviation out of element
205     """
206     return (tonumber(element.attrib['mean']), 
207             tonumber(element.attrib['deviation']))
208
209 class Summary(object):
210     """
211     Extract some useful information from the Summary.htm file
212     """
213     XML_VERSION = 1
214     SUMMARY = 'Summary'
215
216     class LaneResultSummary(object):
217         """
218         Parse the LaneResultSummary table out of Summary.htm
219         Mostly for the cluster number
220         """
221         LANE_RESULT_SUMMARY = 'LaneResultSummary'
222         TAGS = { 
223           'Cluster': 'cluster',
224           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
225           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
226           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
227           'PercentPassFilterAlign': 'percent_pass_filter_align',
228           'AverageAlignmentScore': 'average_alignment_score',
229           'PercentErrorRate': 'percent_error_rate'
230         }
231                  
232         def __init__(self, html=None, xml=None):
233             self.lane = None
234             self.cluster = None
235             self.average_first_cycle_intensity = None
236             self.percent_intensity_after_20_cycles = None
237             self.percent_pass_filter_clusters = None
238             self.percent_pass_filter_align = None
239             self.average_alignment_score = None
240             self.percent_error_rate = None
241
242             if html is not None:
243                 self.set_elements_from_html(html)
244             if xml is not None:
245                 self.set_elements(xml)
246
247         def set_elements_from_html(self, row_element):
248             data = [ flatten(x) for x in row_element ]
249             if len(data) != 8:
250                 raise RuntimeError("Summary.htm file format changed")
251
252             self.lane = data[0]
253             self.cluster = parse_mean_range(data[1])
254             self.average_first_cycle_intensity = parse_mean_range(data[2])
255             self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
256             self.percent_pass_filter_clusters = parse_mean_range(data[4])
257             self.percent_pass_filter_align = parse_mean_range(data[5])
258             self.average_alignment_score = parse_mean_range(data[6])
259             self.percent_error_rate = parse_mean_range(data[7])
260
261         def get_elements(self):
262             lane_result = ElementTree.Element(
263                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
264                             {'lane': self.lane})
265             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
266                 element = make_mean_range_element(
267                     lane_result,
268                     tag,
269                     *getattr(self, variable_name)
270                 )
271             return lane_result
272
273         def set_elements(self, tree):
274             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
275                 raise ValueError('Expected %s' % (
276                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
277             self.lane = tree.attrib['lane']
278             tags = Summary.LaneResultSummary.TAGS
279             for element in list(tree):
280                 try:
281                     variable_name = tags[element.tag]
282                     setattr(self, variable_name, 
283                             parse_mean_range_element(element))
284                 except KeyError, e:
285                     logging.warn('Unrecognized tag %s' % (element.tag,))
286
287     def __init__(self, filename=None, xml=None):
288         self.lane_results = {}
289
290         if filename is not None:
291             self._extract_lane_results(filename)
292         if xml is not None:
293             self.set_elements(xml)
294
295     def __getitem__(self, key):
296         return self.lane_results[key]
297
298     def __len__(self):
299         return len(self.lane_results)
300
301     def keys(self):
302         return self.lane_results.keys()
303
304     def values(self):
305         return self.lane_results.values()
306
307     def items(self):
308         return self.lane_results.items()
309
310     def _extract_lane_results(self, pathname):
311         """
312         extract the Lane Results Summary table
313         """
314         tree = ElementTree.parse(pathname).getroot()
315         if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
316             raise RuntimeError("Summary.htm file format changed")
317
318         tables = tree.findall('*//table')
319
320         # parse lane result summary
321         lane_summary = tables[2]
322         rows = lane_summary.getchildren()
323         headers = rows[0].getchildren()
324         if flatten(headers[2]) != 'Av 1st Cycle Int ':
325             raise RuntimeError("Summary.htm file format changed")
326
327         for r in rows[1:]:
328             lrs = Summary.LaneResultSummary(html=r)
329             self.lane_results[lrs.lane] = lrs
330
331     def get_elements(self):
332         summary = ElementTree.Element(Summary.SUMMARY, 
333                                       {'version': unicode(Summary.XML_VERSION)})
334         for lane in self.lane_results.values():
335             summary.append(lane.get_elements())
336         return summary
337
338     def set_elements(self, tree):
339         if tree.tag != Summary.SUMMARY:
340             return ValueError("Expected %s" % (Summary.SUMMARY,))
341         xml_version = int(tree.attrib.get('version', 0))
342         if xml_version > Summary.XML_VERSION:
343             logging.warn('Summary XML tree is a higher version than this class')
344         for element in list(tree):
345             lrs = Summary.LaneResultSummary()
346             lrs.set_elements(element)
347             self.lane_results[lrs.lane] = lrs
348
349     def dump(self):
350         """
351         Debugging function, report current object
352         """
353         pass
354
355
356 def build_genome_fasta_map(genome_dir):
357     # build fasta to fasta file map
358     genome = genome_dir.split(os.path.sep)[-1]
359     fasta_map = {}
360     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
361         is_link = False
362         if os.path.islink(vld_file):
363             is_link = True
364         vld_file = os.path.realpath(vld_file)
365         path, vld_name = os.path.split(vld_file)
366         name, ext = os.path.splitext(vld_name)
367         if is_link:
368             fasta_map[name] = name
369         else:
370             fasta_map[name] = os.path.join(genome, name)
371     return fasta_map
372     
373 class ElandLane(object):
374     """
375     Process an eland result file
376     """
377     XML_VERSION = 1
378     LANE = 'ElandLane'
379     GENOME_MAP = 'GenomeMap'
380     GENOME_ITEM = 'GenomeItem'
381     MAPPED_READS = 'MappedReads'
382     MAPPED_ITEM = 'MappedItem'
383     MATCH_CODES = 'MatchCodes'
384     MATCH_ITEM = 'Code'
385     READS = 'Reads'
386
387     def __init__(self, pathname=None, genome_map=None, xml=None):
388         self.pathname = pathname
389         self._reads = None
390         self._mapped_reads = {}
391         self._match_codes = {}
392         if genome_map is None:
393             genome_map = {}
394         self.genome_map = genome_map
395         
396         if pathname is not None:
397             self._update()
398         if xml is not None:
399             self.set_elements(xml)
400
401     def _update(self):
402         """
403         Actually read the file and actually count the reads
404         """
405         # can't do anything if we don't have a file to process
406         if self.pathname is None:
407             return
408
409         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
410             raise RuntimeError("Eland isn't done, try again later.")
411
412         reads = 0
413         mapped_reads = {}
414
415         match_codes = {'NM':0, 'QC':0, 'RM':0, 
416                        'U0':0, 'U1':0, 'U2':0,
417                        'R0':0, 'R1':0, 'R2':0,
418                       }
419         for line in open(self.pathname):
420             reads += 1
421             fields = line.split()
422             # code = fields[2]
423             # match_codes[code] = match_codes.setdefault(code, 0) + 1
424             # the QC/NM etc codes are in the 3rd field and always present
425             match_codes[fields[2]] += 1
426             # ignore lines that don't have a fasta filename
427             if len(fields) < 7:
428                 continue
429             fasta = self.genome_map.get(fields[6], fields[6])
430             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
431         self._match_codes = match_codes
432         self._mapped_reads = mapped_reads
433         self._reads = reads
434
435     def _get_reads(self):
436         if self._reads is None:
437             self._update()
438         return self._reads
439     reads = property(_get_reads)
440
441     def _get_mapped_reads(self):
442         if self._mapped_reads is None:
443             self._update()
444         return self._mapped_reads
445     mapped_reads = property(_get_mapped_reads)
446
447     def _get_match_codes(self):
448         if self._match_codes is None:
449             self._update()
450         return self._match_codes
451     match_codes = property(_get_match_codes)
452
453     def get_elements(self):
454         lane = ElementTree.Element(ElandLane.LANE, 
455                                    {'version': 
456                                     unicode(ElandLane.XML_VERSION)})
457         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
458         for k, v in self.genome_map.items():
459             item = ElementTree.SubElement(
460                 genome_map, ElandLane.GENOME_ITEM, 
461                 {'name':k, 'value':unicode(v)})
462         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
463         for k, v in self.mapped_reads.items():
464             item = ElementTree.SubElement(
465                 mapped_reads, ElandLane.MAPPED_ITEM, 
466                 {'name':k, 'value':unicode(v)})
467         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
468         for k, v in self.match_codes.items():
469             item = ElementTree.SubElement(
470                 match_codes, ElandLane.MATCH_ITEM, 
471                 {'name':k, 'value':unicode(v)})
472         reads = ElementTree.SubElement(lane, ElandLane.READS)
473         reads.text = unicode(self.reads)
474
475         return lane
476
477     def set_elements(self, tree):
478         if tree.tag != ElandLane.LANE:
479             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
480         for element in tree:
481             tag = element.tag.lower()
482             if tag == ElandLane.GENOME_MAP.lower():
483                 for child in element:
484                     name = child.attrib['name']
485                     value = child.attrib['value']
486                     self.genome_map[name] = value
487             elif tag == ElandLane.MAPPED_READS.lower():
488                 for child in element:
489                     name = child.attrib['name']
490                     value = child.attrib['value']
491                     self._mapped_reads[name] = int(value)
492             elif tag == ElandLane.MATCH_CODES.lower():
493                 for child in element:
494                     name = child.attrib['name']
495                     value = int(child.attrib['value'])
496                     self._match_codes[name] = value
497             elif tag == ElandLane.READS.lower():
498                 self._reads = int(element.text)
499             else:
500                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
501
502
503 class ELAND(object):
504     """
505     Summarize information from eland files
506     """
507     XML_VERSION = 1
508
509     ELAND = 'ElandCollection'
510     LANE = 'Lane'
511     LANE_ID = 'id'
512
513     def __init__(self, xml=None):
514         # we need information from the gerald config.xml
515         self.results = {}
516         
517         if xml is not None:
518             self.set_elements(xml)
519
520     def __len__(self):
521         return len(self.results)
522
523     def keys(self):
524         return self.results.keys()
525     
526     def values(self):
527         return self.results.values()
528
529     def items(self):
530         return self.results.items()
531
532     def __getitem__(self, key):
533         return self.results[key]
534
535     def get_elements(self):
536         root = ElementTree.Element(ELAND.ELAND, 
537                                    {'version': unicode(ELAND.XML_VERSION)})
538         for lane_id, lane in self.results.items():
539             eland_lane = lane.get_elements()
540             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
541             root.append(eland_lane)
542         return root
543
544     def set_elements(self, tree):
545         if tree.tag.lower() != ELAND.ELAND.lower():
546             raise ValueError('Expecting %s', ELAND.ELAND)
547         for element in list(tree):
548             lane_id = element.attrib[ELAND.LANE_ID]
549             lane = ElandLane(xml=element)
550             self.results[lane_id] = lane
551
552 def eland(basedir, gerald=None, genome_maps=None):
553     e = ELAND()
554     for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
555         # extract the sample name
556         path, name = os.path.split(pathname)
557         split_name = name.split('_')
558         sample_name = split_name[0]
559         lane_id = split_name[1]
560
561         if genome_maps is not None:
562             genome_map = genome_maps[lane_id]
563         elif gerald is not None:
564             genome_dir = gerald.lanes[lane_id].eland_genome
565             genome_map = build_genome_fasta_map(genome_dir)
566         else:
567             genome_map = {}
568
569         eland_result = ElandLane(pathname, genome_map)
570         e.results[lane_id] = eland_result
571     return e