The htsworkflow.pipelines.gerald module was getting too large
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
from datetime import datetime, date
import logging
import os
import time

from htsworkflow.pipelines.summary import Summary
from htsworkflow.pipelines.eland import eland, ELAND

from htsworkflow.pipelines.runfolder import \
   ElementTree, \
   EUROPEAN_STRPTIME, \
   LANES_PER_FLOWCELL, \
   VERSION_RE
from htsworkflow.util.ethelp import indent, flatten
18
class Gerald(object):
    """
    Capture meaning out of the GERALD directory.

    An instance holds the parsed config.xml element (``tree``), a summary
    object (``summary``) and an optional ELAND results object
    (``eland_results``).  ``get_elements`` / ``set_elements`` convert the
    whole thing to and from a single XML element tagged ``Gerald``.
    """
    XML_VERSION = 1
    GERALD = 'Gerald'
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    class LaneParameters(object):
        """
        Make it easy to access elements of LaneSpecificRunParameters from python
        """
        def __init__(self, gerald, key):
            # gerald: owning Gerald instance; its tree is read lazily on
            # every attribute access, so it may be assigned after creation.
            # key: lane id as it appears after the '_' in the element tag.
            self._gerald = gerald
            self._key = key

        def __get_attribute(self, xml_tag):
            """
            Return the text of this lane's entry under
            LaneSpecificRunParameters/<xml_tag>.

            Returns None when no tree is loaded, or when either the
            LaneSpecificRunParameters section or the requested tag is
            missing.  Raises RuntimeError if the tag holds more than
            LANES_PER_FLOWCELL entries, and ValueError if this lane's key
            is not among them.
            """
            tree = self._gerald.tree
            if tree is None:
                return None
            subtree = tree.find('LaneSpecificRunParameters')
            if subtree is None:
                return None
            container = subtree.find(xml_tag)
            if container is None:
                return None
            # list(element) replaces Element.getchildren(), which newer
            # ElementTree releases no longer provide.
            children = list(container)
            if len(children) > LANES_PER_FLOWCELL:
                raise RuntimeError('GERALD config.xml file changed')
            lanes = [x.tag.split('_')[1] for x in children]
            index = lanes.index(self._key)
            return children[index].text

        def _get_analysis(self):
            return self.__get_attribute('ANALYSIS')
        analysis = property(_get_analysis)

        def _get_eland_genome(self):
            genome = self.__get_attribute('ELAND_GENOME')
            # default to the chipwide parameters if there isn't an
            # entry in the lane specific parameters
            tree = self._gerald.tree
            if genome is None and tree is not None:
                container = tree.find('ChipWideRunParameters/ELAND_GENOME')
                if container is not None:
                    genome = container.text
            return genome
        eland_genome = property(_get_eland_genome)

        def _get_read_length(self):
            return self.__get_attribute('READ_LENGTH')
        read_length = property(_get_read_length)

        def _get_use_bases(self):
            return self.__get_attribute('USE_BASES')
        use_bases = property(_get_use_bases)

    class LaneSpecificRunParameters(object):
        """
        Provide access to LaneSpecificRunParameters

        Behaves like a read-only mapping from lane id to LaneParameters.
        """
        def __init__(self, gerald):
            self._gerald = gerald
            # cache of lane ids, filled in by the first successful keys()
            self._keys = None

        def __getitem__(self, key):
            return Gerald.LaneParameters(self._gerald, key)

        def keys(self):
            """
            Return the list of lane ids found under
            LaneSpecificRunParameters/ANALYSIS.

            Returns an empty list (without caching it) while no tree is
            loaded or the ANALYSIS section is absent, so a later call can
            still pick up a tree assigned afterwards.
            """
            if self._keys is None:
                tree = self._gerald.tree
                if tree is None:
                    return []
                analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
                if analysis is None:
                    return []
                # according to the pipeline specs I think their fields
                # are sampleName_laneID, with sampleName defaulting to s
                # since laneIDs are constant lets just try using
                # those consistently.
                self._keys = [x.tag.split('_')[1] for x in analysis]
            return self._keys

        def values(self):
            return [self[x] for x in self.keys()]

        def items(self):
            # list() keeps the result a real list regardless of whether
            # zip() is lazy in the running interpreter
            return list(zip(self.keys(), self.values()))

        def __len__(self):
            return len(self.keys())

    def __init__(self, xml=None):
        """
        Create an empty Gerald object, or restore one from ``xml`` (an
        element previously produced by get_elements).
        """
        self.pathname = None
        self.tree = None

        # parse lane parameters out of the config.xml file
        self.lanes = Gerald.LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            self.set_elements(xml)

    def _get_date(self):
        # With no config.xml loaded, fall back to the current time.
        if self.tree is None:
            return datetime.today()
        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        # NOTE: '%c' is the locale's date/time representation, so parsing
        # depends on the active locale matching the one that wrote the file.
        epochstamp = time.mktime(time.strptime(timestamp, '%c'))
        return datetime.fromtimestamp(epochstamp)
    date = property(_get_date)

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_version(self):
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
    version = property(_get_version)

    def dump(self):
        """
        Debugging function, report current object
        """
        # single pre-formatted argument: prints identically whether print
        # is a statement or a function
        print('Gerald version: %s' % (self.version,))
        print('Gerald run date: %s' % (self.date,))
        print('Gerald config.xml: %s' % (self.tree,))
        if self.summary is not None:
            self.summary.dump()

    def get_elements(self):
        """
        Return a ``Gerald`` element wrapping the config tree, the summary
        and (when present) the ELAND results; None if the config tree or
        the summary has not been loaded.
        """
        if self.tree is None or self.summary is None:
            return None

        gerald = ElementTree.Element(Gerald.GERALD,
                                     {'version': str(Gerald.XML_VERSION)})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree):
        """
        Restore state from a ``Gerald`` element.

        Raises ValueError if ``tree`` is not tagged ``Gerald``.  Child tags
        are matched case-insensitively; unknown children are logged and
        skipped.
        """
        if tree.tag != Gerald.GERALD:
            raise ValueError('expected GERALD')
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Gerald.XML_VERSION:
            logging.warning('XML tree is a higher version than this class')
        for element in list(tree):
            tag = element.tag.lower()
            if tag == Gerald.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Gerald.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                logging.warning("Unrecognized tag %s", element.tag)
164
165
def gerald(pathname):
    """
    Build a Gerald object from the GERALD directory at ``pathname``.

    Reads ``config.xml`` and ``Summary.htm`` from that directory, then
    gathers the ELAND results through ``eland()``.  Errors raised while
    opening or parsing those files are not caught here.
    """
    g = Gerald()
    g.pathname = pathname

    # parse config.xml file
    logging.info("Parsing gerald config.xml")
    config_pathname = os.path.join(pathname, 'config.xml')
    g.tree = ElementTree.parse(config_pathname).getroot()

    # parse Summary.htm file
    logging.info("Parsing Summary.htm")
    summary_pathname = os.path.join(pathname, 'Summary.htm')
    g.summary = Summary(summary_pathname)

    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
181