Return a gerald version number as a number and not a cvs string.
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 import logging
6 import os
7 import re
8 import stat
9 import time
10
11 from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
12 from htsworkflow.pipelines.eland import eland, ELAND
13
14 from htsworkflow.pipelines.runfolder import \
15    ElementTree, \
16    EUROPEAN_STRPTIME, \
17    LANES_PER_FLOWCELL, \
18    VERSION_RE
19 from htsworkflow.util.ethelp import indent, flatten
20
21 LOGGER = logging.getLogger(__name__)
22
class Alignment(object):
    """
    Capture meaning out of the GERALD directory

    Base class holding the parsed config.xml tree, the run summary and the
    ELAND results of one alignment directory.  It reads ``self.date``,
    ``self.version`` and ``self.__class__.GERALD`` but does not define
    them; subclasses are expected to supply those.
    """
    XML_VERSION = 1
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    def __init__(self, xml=None, pathname=None, tree=None):
        """
        :param xml: optional element to restore the object from; it is
                    handed to set_elements().
        :param pathname: stored unchanged as ``self.pathname``.
        :param tree: stored unchanged as ``self.tree``; the other methods
                     treat it as the parsed config.xml element.
        """
        self.pathname = pathname
        self.tree = tree

        # parse lane parameters out of the config.xml file
        self.lanes = LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            self.set_elements(xml)

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_chip_attribute(self, value):
        """Return the text of ChipWideRunParameters/<value>, or None."""
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # Each line is formatted as a single string so the output is the
        # same whether print is a statement or a function.
        print('Software: %s' % (self.__class__.__name__,))
        print('Alignment version: %s' % (self.version,))
        print('Run date: %s' % (self.date,))
        print('config.xml: %s' % (self.tree,))
        self.summary.dump()

    def get_elements(self):
        """
        Serialize this object into an element tree.

        Returns None when either the config tree or the summary is missing.
        """
        if self.tree is None or self.summary is None:
            return None

        # NOTE(review): the root tag is always Gerald.GERALD, while
        # set_elements() compares against self.__class__.GERALD; a subclass
        # with a different GERALD value would not accept its own output.
        # Left unchanged because other readers may depend on the tag.
        gerald = ElementTree.Element(Gerald.GERALD,
                                     {'version': unicode(Gerald.XML_VERSION)})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree):
        """
        Restore this object from an element produced by get_elements().

        Raises ValueError when the root tag is not this class's GERALD tag.
        """
        if tree.tag != self.__class__.GERALD:
            raise ValueError('expected GERALD')
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Gerald.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')
        # start from an empty result set in case the tree has no ELAND node
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == Gerald.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Gerald.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s" % (element.tag,))
90
class Gerald(Alignment):
    """
    Alignment whose run information lives under ChipWideRunParameters
    in config.xml.
    """
    GERALD = 'Gerald'

    def _get_date(self):
        """
        Return the run date as a datetime.

        Order of preference: the TIME_STAMP recorded in config.xml, the
        modification time of ``self.pathname``, and finally the current time.
        """
        if self.tree is None:
            return datetime.today()
        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
            return datetime.fromtimestamp(epochstamp)
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()
    date = property(_get_date)

    def _get_experiment_root(self):
        """Return the EXPT_DIR_ROOT text, or None when unavailable."""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the first path component of EXPT_DIR below EXPT_DIR_ROOT.

        Returns None when there is no tree, when either value is missing
        from config.xml, or when the resulting name is empty.
        """
        if self.tree is None:
            return None

        expt_root = self._get_experiment_root()
        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        # Check for missing values before normalizing: normpath() cannot
        # take None, and nothing can be derived without both paths.
        if expt_root is None or chip_expt_dir is None:
            return None

        expt_root = os.path.normpath(expt_root)
        experiment_dir = chip_expt_dir.replace(expt_root + os.path.sep, '')
        experiment_dir = experiment_dir.split(os.path.sep)[0]

        if len(experiment_dir) == 0:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_version(self):
        """
        Return the software version recorded in config.xml.

        When SOFTWARE_VERSION matches the ``GERALD.pl,v <number>`` form only
        the dotted number is returned; otherwise the raw text is returned.
        Returns None when there is no tree or no SOFTWARE_VERSION entry.
        """
        if self.tree is None:
            return None
        ga_version = self.tree.findtext(
            'ChipWideRunParameters/SOFTWARE_VERSION')
        if ga_version is None:
            return None
        match = re.match(r"@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
                         ga_version)
        if match:
            return match.group('version')
        return ga_version
    version = property(_get_version)
141
class CASAVA(Alignment):
    """
    Alignment whose run information lives under the Defaults and Software
    nodes of config.xml.
    """
    GERALD = 'Casava'

    def _get_runfolder_name(self):
        """
        Return the last path component of Defaults/EXPT_DIR.

        Returns None when there is no tree, when EXPT_DIR is missing, or
        when the resulting name is empty.
        """
        if self.tree is None:
            return None

        # hiseqs renamed the experiment dir location
        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if defaults_expt_dir is None:
            # os.path.split() cannot take None
            return None
        _, experiment_dir = os.path.split(defaults_expt_dir)

        if len(experiment_dir) == 0:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_version(self):
        """
        Return the Version attribute of the Software node.

        Returns None when there is no tree, no Software node, or no Version
        attribute on it.
        """
        if self.tree is None:
            return None
        hiseq_software_node = self.tree.find('Software')
        if hiseq_software_node is None:
            return None
        return hiseq_software_node.attrib.get('Version')
    version = property(_get_version)
166
167
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Abstract base: every property raises NotImplementedError until a
    subclass overrides the matching ``_get_*`` method and property.
    """
    def __init__(self, gerald, lane_id):
        """
        :param gerald: object the parameters are read from; stored as
                       ``self._gerald``.
        :param lane_id: identifier of the lane; stored as ``self._lane_id``.
        """
        self._gerald = gerald
        self._lane_id = lane_id

    # NotImplementedError is the exception class; the NotImplemented
    # constant is not callable and must not be raised.
    def _get_analysis(self):
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
191
192
class LaneParametersGA(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Values are looked up in the LaneSpecificRunParameters section of the
    owning object's config tree.
    """
    def __init__(self, gerald, lane_id):
        super(LaneParametersGA, self).__init__(gerald, lane_id)

    def __get_attribute(self, xml_tag):
        """
        Return the text stored for this lane under ``xml_tag``.

        Returns None when the section, the tag, or an entry for this lane
        is missing.  Raises RuntimeError when the tag holds more entries
        than LANES_PER_FLOWCELL.
        """
        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
        if subtree is None:
            return None
        container = subtree.find(xml_tag)
        if container is None:
            return None
        # list(element) replaces the deprecated Element.getchildren()
        entries = list(container)
        if len(entries) > LANES_PER_FLOWCELL:
            raise RuntimeError('GERALD config.xml file changed')
        # The lane is the second '_'-separated field of each child tag and
        # is compared to self._lane_id as-is, so the id has to be of the
        # same type as that field (a string).
        lanes = [x.tag.split('_')[1] for x in entries]
        try:
            index = lanes.index(self._lane_id)
        except ValueError:
            return None
        return entries[index].text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        genome = self.__get_attribute('ELAND_GENOME')
        # default to the chipwide parameters if there isn't an
        # entry in the lane specific parameters
        if genome is None:
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # ignore flag value
        if genome == 'Need_to_specify_ELAND_genome_directory':
            genome = None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        read_length = self.__get_attribute('READ_LENGTH')
        # fall back to the chip wide value when the lane has none
        if read_length is None:
            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
        return read_length
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES')
    use_bases = property(_get_use_bases)
240
241
class LaneParametersHiSeq(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Values are read from the single element handed to the constructor.
    """
    def __init__(self, gerald, lane_id, element):
        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
        self.element = element

    def __get_attribute(self, xml_tag):
        """Return the text of the ``xml_tag`` child, or None if absent."""
        node = self.element.find(xml_tag)
        return node.text if node is not None else None

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        value = self.__get_attribute('ELAND_GENOME')
        if value is None:
            # nothing on this element, use the chip wide setting instead
            value = self._gerald._get_chip_attribute('ELAND_GENOME')
        if value == 'Need_to_specify_ELAND_genome_directory':
            # placeholder text, treat it as "not set"
            return None
        return value
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        return self.__get_attribute('READ_LENGTH1')
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES1')
    use_bases = property(_get_use_bases)
279
class LaneSpecificRunParameters(object):
    """
    Provide access to LaneSpecificRunParameters

    Dictionary-like view of per-lane parameter objects.  The mapping is
    built on first use from the owning object's config tree and rebuilt
    only when that object's ``tree`` attribute is replaced.
    """
    def __init__(self, gerald):
        self._gerald = gerald
        # mapping of lane key -> LaneParameters; None until first use
        self._lanes = None
        # the tree self._lanes was built from
        self._parsed_tree = None

    def _initalize_lanes(self):
        """
        build dictionary of LaneParameters
        """
        self._lanes = {}
        tree = self._gerald.tree
        if tree is not None:
            analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
            if analysis is not None:
                self._extract_ga_analysis_type(analysis)
            analysis = tree.find('Projects')
            if analysis is not None:
                self._extract_hiseq_analysis_type(analysis)
        # recorded last so a failed extraction is retried on next access
        self._parsed_tree = tree

    def _get_lanes(self):
        """Return the lane mapping, building it when missing or stale."""
        if self._lanes is None or self._gerald.tree is not self._parsed_tree:
            self._initalize_lanes()
        return self._lanes

    def _extract_ga_analysis_type(self, analysis):
        # according to the pipeline specs I think their fields
        # are sampleName_laneID, with sampleName defaulting to s
        # since laneIDs are constant lets just try using
        # those consistently.
        for element in analysis:
            _sample, lane_id = element.tag.split('_')
            # keyed by integer lane, while the parameter object keeps the
            # textual id exactly as it appears in the tag
            self._lanes[int(lane_id)] = LaneParametersGA(
                                          self._gerald, lane_id)

    def _extract_hiseq_analysis_type(self, analysis):
        """Extract from HiSeq style multiplexed analysis types"""
        for element in analysis:
            name = element.attrib['name']
            self._lanes[name] = LaneParametersHiSeq(self._gerald,
                                                    name,
                                                    element)

    def __iter__(self):
        return iter(self._get_lanes())

    def __getitem__(self, key):
        return self._get_lanes()[key]

    def get(self, key, default=None):
        """Return the parameters stored for ``key``, or ``default``."""
        return self._get_lanes().get(key, default)

    def keys(self):
        return self._get_lanes().keys()

    def values(self):
        return self._get_lanes().values()

    def items(self):
        return self._get_lanes().items()

    def __len__(self):
        return len(self._get_lanes())
345
346
def gerald(pathname):
    """
    Build an alignment object from the directory ``pathname``.

    config.xml in that directory is always parsed.  The kind of object
    returned depends on which summary is found first:

    * Summary.xml or Summary.htm in the directory: a Gerald object with a
      SummaryGA summary and ELAND results.
    * a ``../Data/reports/Summary`` directory: a CASAVA object with a
      SummaryHiSeq summary.

    Raises RuntimeError when none of those summaries exist.
    """
    LOGGER.info("Parsing gerald config.xml")
    pathname = os.path.expanduser(pathname)
    config_pathname = os.path.join(pathname, 'config.xml')
    config_tree = ElementTree.parse(config_pathname).getroot()

    # locate the summary, newest format first
    summary_xml = os.path.join(pathname, 'Summary.xml')
    summary_htm = os.path.join(pathname, 'Summary.htm')
    report_summary = os.path.join(pathname, '..', 'Data',
                                  'reports', 'Summary', )
    if os.path.exists(summary_xml):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.xml")
        g.summary = SummaryGA(summary_xml)
        # parse eland files
        g.eland_results = eland(g.pathname, g)
    elif os.path.exists(summary_htm):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.htm")
        g.summary = SummaryGA(summary_htm)
        # parse eland files
        g.eland_results = eland(g.pathname, g)
    elif os.path.isdir(report_summary):
        g = CASAVA(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing %s" % (report_summary,))
        g.summary = SummaryHiSeq(report_summary)
    else:
        # Without this branch the function fell through to "return g" with
        # g never assigned, hiding the real problem behind a NameError.
        raise RuntimeError(
            "No Summary.xml, Summary.htm or %s found for %s" % (
                report_summary, pathname))

    return g
375
if __name__ == "__main__":
    # Quick manual check: parse the directory named by the first command
    # line argument.
    import sys
    g = gerald(sys.argv[1])
    # To inspect the result: ElementTree.dump(g.get_elements())