Add a software (name) property to firecrest, ipar, bustard, gerald
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 import logging
6 import os
7 import re
8 import stat
9 import time
10
11 from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
12 from htsworkflow.pipelines.eland import eland, ELAND
13
14 from htsworkflow.pipelines.runfolder import \
15    ElementTree, \
16    EUROPEAN_STRPTIME, \
17    LANES_PER_FLOWCELL, \
18    VERSION_RE
19 from htsworkflow.util.ethelp import indent, flatten
20
21 LOGGER = logging.getLogger(__name__)
22
class Alignment(object):
    """
    Capture meaning out of the GERALD directory

    Base class holding the parsed config.xml tree, the summary object and
    the eland results, plus the code to save/restore them as one XML
    element.  Subclasses are expected to provide the ``GERALD`` tag name
    checked by set_elements() and the ``version`` / ``date`` attributes
    read by dump() and the ``time`` property.
    """
    XML_VERSION = 1
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    def __init__(self, xml=None, pathname=None, tree=None):
        """
        :param xml: optional element previously produced by get_elements();
                    when given the object is restored via set_elements()
        :param pathname: directory the alignment results were read from
        :param tree: root element of the parsed config.xml
        """
        self.pathname = pathname
        self.tree = tree

        # parse lane parameters out of the config.xml file
        self.lanes = LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            self.set_elements(xml)

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_chip_attribute(self, value):
        """Return the text of ChipWideRunParameters/<value> from config.xml"""
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # Each line is a single pre-formatted string so the output is the
        # same whether print is a statement or a function.
        # (The first line used to read `'Software:'. self...`, a typo for a
        # comma that raised AttributeError.)
        print('Software: %s' % (self.__class__.__name__,))
        print('Alignment version: %s' % (self.version,))
        print('Run date: %s' % (self.date,))
        print('config.xml: %s' % (self.tree,))
        self.summary.dump()

    def get_elements(self):
        """
        Return an element wrapping config.xml, the summary and (when
        present) the eland results; None if tree or summary is missing.
        """
        if self.tree is None or self.summary is None:
            return None

        # NOTE(review): the container tag is always Gerald.GERALD, while
        # set_elements() checks self.__class__.GERALD -- confirm whether
        # CASAVA objects are meant to round-trip through this pair.
        gerald = ElementTree.Element(Gerald.GERALD,
                                     {'version': u'%d' % Gerald.XML_VERSION})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree):
        """
        Restore tree, summary and eland_results from a saved element.

        :raises ValueError: if the element's tag is not this class's GERALD
        """
        if tree.tag != self.__class__.GERALD:
            raise ValueError('expected GERALD')
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Gerald.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')
        # start from an empty result set; replaced below if one was saved
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == Gerald.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Gerald.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s", element.tag)
90
class Gerald(Alignment):
    """
    Alignment information read from a GERALD style config.xml, where run
    wide settings live under the ChipWideRunParameters element.
    """
    GERALD = 'Gerald'

    def _get_date(self):
        """
        Return the run date: the TIME_STAMP recorded in config.xml if
        present, otherwise the modification time of pathname, otherwise
        (or when no tree is loaded) the current time.
        """
        if self.tree is None:
            return datetime.today()
        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            # '%c' is the locale's date/time representation
            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
            return datetime.fromtimestamp(epochstamp)
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()
    date = property(_get_date)

    def _get_experiment_root(self):
        """Return EXPT_DIR_ROOT from config.xml, or None without a tree"""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the first path component of EXPT_DIR below EXPT_DIR_ROOT.

        Returns None when no tree is loaded, when either setting is
        missing from config.xml, or when the computed name is empty.
        """
        if self.tree is None:
            return None

        expt_root = self._get_experiment_root()
        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        # Check before normpath(): it cannot take None, and without both
        # values there is nothing to compute.
        if expt_root is None or chip_expt_dir is None:
            return None

        expt_root = os.path.normpath(expt_root)
        experiment_dir = chip_expt_dir.replace(expt_root + os.path.sep, '')
        experiment_dir = experiment_dir.split(os.path.sep)[0]
        return experiment_dir or None

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Parse SOFTWARE_VERSION into a (name, version) tuple.

        Recognizes GERALD.pl revision strings and CASAVA-<version>
        strings; returns None for anything else or when no tree is loaded.
        """
        if self.tree is None:
            return None
        ga_version = self.tree.findtext(
            'ChipWideRunParameters/SOFTWARE_VERSION')
        if ga_version is None:
            return None

        gerald = re.match(r"@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
                          ga_version)
        if gerald:
            return ('GERALD', gerald.group('version'))
        casava = re.match(r'CASAVA-(?P<version>\d+[.\d]*)',
                          ga_version)
        if casava:
            return ('CASAVA', casava.group('version'))
        return None

    def _get_software(self):
        """Return name of analysis software package"""
        software_version = self._get_software_version()
        return software_version[0] if software_version is not None else None
    software = property(_get_software)

    def _get_version(self):
        """Return version number of software package"""
        software_version = self._get_software_version()
        return software_version[1] if software_version is not None else None
    version = property(_get_version)
155
class CASAVA(Alignment):
    """
    Alignment information read from a CASAVA (HiSeq) style config.xml,
    which keeps the experiment directory under Defaults/EXPT_DIR and the
    software version as an attribute of the Software element.
    """
    GERALD = 'Casava'

    def _get_runfolder_name(self):
        """
        Return the last path component of Defaults/EXPT_DIR, or None when
        the tree or the setting is missing or the component is empty.
        """
        if self.tree is None:
            return None

        # hiseqs renamed the experiment dir location
        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if defaults_expt_dir is None:
            # os.path.split() cannot take None
            return None
        _, experiment_dir = os.path.split(defaults_expt_dir)
        return experiment_dir or None

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Return the Software element's Version attribute split on '-'
        (a list of strings), or None when it is not available.
        """
        if self.tree is None:
            return None
        hiseq_software_node = self.tree.find('Software')
        if hiseq_software_node is None:
            return None
        software_version = hiseq_software_node.attrib.get('Version', None)
        if software_version is None:
            return None
        return software_version.split('-')

    def _get_software(self):
        """Return name of analysis software package"""
        software_version = self._get_software_version()
        if software_version is None:
            return None
        return software_version[0]
    software = property(_get_software)

    def _get_version(self):
        """Return version number of software package"""
        software_version = self._get_software_version()
        # a Version attribute without a '-' has no version component
        if software_version is None or len(software_version) < 2:
            return None
        return software_version[1]
    version = property(_get_version)
197
198
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Abstract base: it only stores the owning alignment object and the lane
    id.  Every property raises NotImplementedError until a subclass
    overrides its getter.
    """
    def __init__(self, gerald, lane_id):
        """
        :param gerald: alignment object these lane parameters belong to
        :param lane_id: identifier of the lane being described
        """
        self._gerald = gerald
        self._lane_id = lane_id

    # NotImplemented is the comparison sentinel and is not callable;
    # NotImplementedError is the exception meant for abstract methods.
    def _get_analysis(self):
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
222
223
class LaneParametersGA(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Reads per-lane values from the LaneSpecificRunParameters section of the
    owning object's config.xml tree, falling back to the chip wide value
    for the genome and read length.
    """
    def __init__(self, gerald, lane_id):
        super(LaneParametersGA, self).__init__(gerald, lane_id)

    def __get_attribute(self, xml_tag):
        """
        Return the text stored for this lane under
        LaneSpecificRunParameters/<xml_tag>, or None if the section, the
        tag or the lane is not present.

        :raises RuntimeError: if the tag holds more than
                              LANES_PER_FLOWCELL entries
        """
        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
        if subtree is None:
            return None
        container = subtree.find(xml_tag)
        if container is None:
            return None
        # list(element) replaces the deprecated Element.getchildren()
        children = list(container)
        if len(children) > LANES_PER_FLOWCELL:
            raise RuntimeError('GERALD config.xml file changed')
        # child tags look like <sample>_<lane id>; keep the lane id part
        lanes = [child.tag.split('_')[1] for child in children]
        try:
            index = lanes.index(self._lane_id)
        except ValueError:
            return None
        return children[index].text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        genome = self.__get_attribute('ELAND_GENOME')
        # default to the chipwide parameters if there isn't an
        # entry in the lane specific parameters
        if genome is None:
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # ignore flag value
        if genome == 'Need_to_specify_ELAND_genome_directory':
            genome = None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        read_length = self.__get_attribute('READ_LENGTH')
        # default to the chipwide parameters, as for the genome
        if read_length is None:
            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
        return read_length
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES')
    use_bases = property(_get_use_bases)
271
272
class LaneParametersHiSeq(LaneParameters):
    """
    Lane parameter accessors backed by a single XML element.

    Values are looked up as child tags of the element handed to the
    constructor; only the genome falls back to the chip wide setting of
    the owning alignment object.
    """
    def __init__(self, gerald, lane_id, element):
        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
        self.element = element

    def __get_attribute(self, xml_tag):
        """Return the text of child <xml_tag>, or None if there is none"""
        node = self.element.find(xml_tag)
        return node.text if node is not None else None

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        lane_genome = self.__get_attribute('ELAND_GENOME')
        if lane_genome is not None:
            genome = lane_genome
        else:
            # no lane specific entry, use the chip wide one
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # the placeholder value means no genome was chosen
        if genome != 'Need_to_specify_ELAND_genome_directory':
            return genome
        return None
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        return self.__get_attribute('READ_LENGTH1')
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES1')
    use_bases = property(_get_use_bases)
310
class LaneSpecificRunParameters(object):
    """
    Provide access to LaneSpecificRunParameters

    Dictionary-like, lazily built view of the per-lane settings found in
    the owning object's config.xml tree.  The mapping is built on first
    use and rebuilt only if the owner's tree object is replaced.
    """
    def __init__(self, gerald):
        """
        :param gerald: alignment object whose ``tree`` attribute is read
        """
        self._gerald = gerald
        self._lanes = None
        # tree the cached mapping was built from
        self._lanes_tree = None

    def _initalize_lanes(self):
        """
        build dictionary of LaneParameters
        """
        self._lanes = {}
        tree = self._gerald.tree
        self._lanes_tree = tree
        if tree is None:
            # nothing parsed yet: behave as an empty mapping
            return
        analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
        if analysis is not None:
            self._extract_ga_analysis_type(analysis)
        projects = tree.find('Projects')
        if projects is not None:
            self._extract_hiseq_analysis_type(projects)

    def _current_lanes(self):
        """Return the lane dictionary, (re)building it when needed"""
        if self._lanes is None or self._lanes_tree is not self._gerald.tree:
            self._initalize_lanes()
        return self._lanes

    def _extract_ga_analysis_type(self, analysis):
        # according to the pipeline specs I think their fields
        # are sampleName_laneID, with sampleName defaulting to s
        # since laneIDs are constant lets just try using
        # those consistently.
        for element in analysis:
            sample, lane_id = element.tag.split('_')
            # keyed by integer lane; the parameter object keeps the
            # string form it needs to match against the xml tags
            self._lanes[int(lane_id)] = LaneParametersGA(
                                          self._gerald, lane_id)

    def _extract_hiseq_analysis_type(self, analysis):
        """Extract from HiSeq style multiplexed analysis types"""
        for element in analysis:
            name = element.attrib['name']
            self._lanes[name] = LaneParametersHiSeq(self._gerald,
                                                    name,
                                                    element)

    def __iter__(self):
        return iter(self._current_lanes())

    def __getitem__(self, key):
        return self._current_lanes()[key]

    def get(self, key, default=None):
        """Return the parameters for key, or default if key is unknown"""
        return self._current_lanes().get(key, default)

    def keys(self):
        return self._current_lanes().keys()

    def values(self):
        return self._current_lanes().values()

    def items(self):
        return self._current_lanes().items()

    def __len__(self):
        return len(self._current_lanes())
376
377
def gerald(pathname):
    """
    Build an alignment object from the results directory at pathname.

    config.xml is always parsed.  The summary source then decides the
    class: Summary.xml or Summary.htm in the directory gives a Gerald
    object (with eland results loaded), otherwise a
    ../Data/reports/Summary directory gives a CASAVA object.

    :param pathname: directory containing config.xml; ``~`` is expanded
    :raises RuntimeError: if none of the summary locations exist
    """
    LOGGER.info("Parsing gerald config.xml")
    pathname = os.path.expanduser(pathname)
    config_pathname = os.path.join(pathname, 'config.xml')
    config_tree = ElementTree.parse(config_pathname).getroot()

    # GA style summaries, in order of preference
    for summary_name in ('Summary.xml', 'Summary.htm'):
        summary_pathname = os.path.join(pathname, summary_name)
        if os.path.exists(summary_pathname):
            g = Gerald(pathname=pathname, tree=config_tree)
            LOGGER.info("Parsing %s", summary_name)
            g.summary = SummaryGA(summary_pathname)
            # parse eland files
            g.eland_results = eland(g.pathname, g)
            return g

    # HiSeq style summary directory
    report_summary = os.path.join(pathname, '..', 'Data',
                                  'reports', 'Summary', )
    if os.path.isdir(report_summary):
        g = CASAVA(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing %s", report_summary)
        g.summary = SummaryHiSeq(report_summary)
        return g

    # Previously this fell through to `return g` with g unbound.
    raise RuntimeError(
        "No Summary.xml, Summary.htm or %s found for %s" % (
            report_summary, pathname))
406
if __name__ == "__main__":
    # quick test code: parse the directory named by the first command
    # line argument
    import sys
    g = gerald(sys.argv[1])
    #ElementTree.dump(g.get_elements())