Add support for first-gen HiSeq flowcells (e.g. ABXX)
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 import logging
6 import os
7 import time
8
9 from htsworkflow.pipelines.summary import Summary
10 from htsworkflow.pipelines.eland import eland, ELAND
11
12 from htsworkflow.pipelines.runfolder import \
13    ElementTree, \
14    EUROPEAN_STRPTIME, \
15    LANES_PER_FLOWCELL, \
16    VERSION_RE
17 from htsworkflow.util.ethelp import indent, flatten
18
class Gerald(object):
    """
    Capture meaning out of the GERALD directory

    Holds the parsed run parameters tree (``self.tree``), a ``Summary``
    and the ``ELAND`` results, and can convert that information to and
    from an ElementTree element (get_elements / set_elements).
    """
    XML_VERSION = 1
    GERALD='Gerald'
    RUN_PARAMETERS='RunParameters'
    SUMMARY='Summary'

    class LaneParameters(object):
        """
        Make it easy to access elements of LaneSpecificRunParameters from python

        Every property returns the text stored for this lane, or None
        when no value can be found.
        """
        def __init__(self, gerald, lane_id):
            # lane_id is compared against the suffix of tags such as
            # 's_1', so it needs to be the string form of the lane number.
            self._gerald = gerald
            self._lane_id = lane_id

        def __get_attribute(self, xml_tag):
            """
            Return the text stored under LaneSpecificRunParameters/xml_tag
            for this lane.

            Returns None if the tree, the parameter section, the tag or
            the lane entry is missing.  Raises RuntimeError if the tag
            holds more entries than LANES_PER_FLOWCELL.
            """
            tree = self._gerald.tree
            if tree is None:
                return None
            subtree = tree.find('LaneSpecificRunParameters')
            if subtree is None:
                return None
            container = subtree.find(xml_tag)
            if container is None:
                return None
            entries = list(container)
            if len(entries) > LANES_PER_FLOWCELL:
                raise RuntimeError('GERALD config.xml file changed')
            for element in entries:
                # tags look like sampleName_laneID; split from the right
                # so a sample name containing '_' cannot hide the lane id
                if element.tag.rsplit('_', 1)[-1] == self._lane_id:
                    return element.text
            return None

        def _get_analysis(self):
            return self.__get_attribute('ANALYSIS')
        analysis = property(_get_analysis)

        def _get_eland_genome(self):
            genome = self.__get_attribute('ELAND_GENOME')
            # default to the chipwide parameters if there isn't an
            # entry in the lane specific parameters
            if genome is None:
                genome = self._gerald._get_chip_attribute('ELAND_GENOME')
            # ignore flag value
            if genome == 'Need_to_specify_ELAND_genome_directory':
                genome = None
            return genome
        eland_genome = property(_get_eland_genome)

        def _get_read_length(self):
            read_length = self.__get_attribute('READ_LENGTH')
            # same fallback as eland_genome: use the chip wide value
            if read_length is None:
                read_length = self._gerald._get_chip_attribute('READ_LENGTH')
            return read_length
        read_length = property(_get_read_length)

        def _get_use_bases(self):
            return self.__get_attribute('USE_BASES')
        use_bases = property(_get_use_bases)

    class LaneSpecificRunParameters(object):
        """
        Provide access to LaneSpecificRunParameters

        Behaves like a read-only dictionary mapping integer lane ids to
        LaneParameters objects.  The mapping is built lazily on first
        access and rebuilt only if the owning Gerald gets a new tree.
        """
        def __init__(self, gerald):
            self._gerald = gerald
            # cached {lane number: LaneParameters}; None until first use
            self._lanes = None
            # the tree the cache was built from, to notice replacement
            self._lanes_tree = None

        def _initalize_lanes(self):
            """
            build dictionary of LaneParameters

            Leaves the dictionary empty when there is no tree yet or the
            tree has no LaneSpecificRunParameters/ANALYSIS element.
            """
            self._lanes = {}
            tree = self._gerald.tree
            self._lanes_tree = tree
            if tree is None:
                return
            analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
            if analysis is None:
                return
            # according to the pipeline specs I think their fields
            # are sampleName_laneID, with sampleName defaulting to s
            # since laneIDs are constant lets just try using
            # those consistently.
            for element in analysis:
                # split from the right: sampleName may itself contain '_'
                lane_id = element.tag.rsplit('_', 1)[-1]
                self._lanes[int(lane_id)] = Gerald.LaneParameters(
                                              self._gerald, lane_id)

        def _get_lanes(self):
            """
            Return the lane dictionary, (re)building it when needed.
            """
            if self._lanes is None or \
               self._lanes_tree is not self._gerald.tree:
                self._initalize_lanes()
            return self._lanes

        def __getitem__(self, key):
            return self._get_lanes()[key]
        def keys(self):
            return self._get_lanes().keys()
        def values(self):
            return self._get_lanes().values()
        def items(self):
            return self._get_lanes().items()
        def __len__(self):
            return len(self._get_lanes())

    def __init__(self, xml=None):
        """
        Create an empty Gerald, optionally loading state from xml.

        xml, if given, must be an element as accepted by set_elements.
        """
        self.pathname = None
        self.tree = None

        # parse lane parameters out of the config.xml file
        self.lanes = Gerald.LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            self.set_elements(xml)

    def _get_date(self):
        # without a tree there is no timestamp; report the current time
        if self.tree is None:
            return datetime.today()
        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        # TIME_STAMP is parsed with the locale's '%c' representation
        epochstamp = time.mktime(time.strptime(timestamp, '%c'))
        return datetime.fromtimestamp(epochstamp)
    date = property(_get_date)

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_experiment_root(self):
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the first path component of EXPT_DIR below EXPT_DIR_ROOT,
        or None if it cannot be determined.
        """
        if self.tree is None:
            return None

        experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        if experiment_dir is None:
            return None

        root = self._get_experiment_root()
        if root:
            # make sure the root ends with a separator, then strip it
            # only from the front of the path; a plain replace() would
            # also remove matching text from the middle of the path
            root = os.path.join(root, '')
            if experiment_dir.startswith(root):
                experiment_dir = experiment_dir[len(root):]
        if len(experiment_dir) == 0:
            return None

        dirnames = experiment_dir.split(os.path.sep)
        return dirnames[0]
    runfolder_name = property(_get_runfolder_name)

    def _get_version(self):
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
    version = property(_get_version)

    def _get_chip_attribute(self, value):
        """
        Return the text of ChipWideRunParameters/<value>, or None.
        """
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # single-argument print() behaves the same on python 2 and 3
        print('Gerald version: %s' % (self.version,))
        print('Gerald run date: %s' % (self.date,))
        print('Gerald config.xml: %s' % (self.tree,))
        if self.summary is not None:
            self.summary.dump()

    def get_elements(self):
        """
        Return an element holding the run parameters, the summary and,
        if present, the eland results; None if tree or summary is unset.
        """
        if self.tree is None or self.summary is None:
            return None

        gerald = ElementTree.Element(Gerald.GERALD,
                                     {'version': u'%d' % (Gerald.XML_VERSION,)})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree):
        """
        Load state from an element as produced by get_elements.

        Raises ValueError if the root tag is not Gerald.GERALD.  Child
        tags are matched case-insensitively; unknown ones are logged.
        """
        if tree.tag != Gerald.GERALD:
            raise ValueError('expected %s tag, got %s' % (Gerald.GERALD,
                                                          tree.tag))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Gerald.XML_VERSION:
            logging.warning('XML tree is a higher version than this class')
        # start from empty results in case the tree has no eland section
        self.eland_results = ELAND()
        for element in tree:
            tag = element.tag.lower()
            if tag == Gerald.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Gerald.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                logging.warning("Unrecognized tag %s", element.tag)
220
def gerald(pathname):
    """
    Build a Gerald object from a GERALD directory on disk.

    Parses config.xml from the directory, loads the summary from
    Summary.xml if that file exists (Summary.htm otherwise), and passes
    the directory to eland() to obtain the eland results.

    pathname is the GERALD directory; a leading ~ is expanded.
    Returns the populated Gerald instance.
    """
    g = Gerald()
    g.pathname = os.path.expanduser(pathname)
    logging.info("Parsing gerald config.xml")
    config_pathname = os.path.join(g.pathname, 'config.xml')
    g.tree = ElementTree.parse(config_pathname).getroot()

    # parse the summary: prefer Summary.xml, fall back to Summary.htm
    summary_pathname = os.path.join(g.pathname, 'Summary.xml')
    if os.path.exists(summary_pathname):
        logging.info("Parsing Summary.xml")
    else:
        summary_pathname = os.path.join(g.pathname, 'Summary.htm')
        logging.info("Parsing Summary.htm")
    g.summary = Summary(summary_pathname)
    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
240
if __name__ == "__main__":
    # Manual smoke test: parse the GERALD directory given as the first
    # command line argument.
    import sys
    g = gerald(sys.argv[1])
    # ElementTree.dump(g.get_elements())