Use different tags indicating gerald or casava style base alignment
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 import logging
6 import os
7 import re
8 import stat
9 import time
10
11 from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
12 from htsworkflow.pipelines.eland import eland, ELAND
13
14 from htsworkflow.pipelines.runfolder import \
15    ElementTree, \
16    EUROPEAN_STRPTIME, \
17    LANES_PER_FLOWCELL, \
18    VERSION_RE
19 from htsworkflow.util.ethelp import indent, flatten
20
21 LOGGER = logging.getLogger(__name__)
22
class Alignment(object):
    """
    Capture meaning out of the GERALD directory

    Base class for the alignment readers in this module.  It holds the
    parsed config.xml tree, the summary and the eland results, and knows
    how to serialize them to / restore them from an XML element.

    Subclasses are expected to provide the ``date`` and ``version``
    properties and one-argument ``get_elements()`` / ``set_elements(tree)``
    wrappers that supply their own root tag.
    """
    XML_VERSION = 1
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    def __init__(self, xml=None, pathname=None, tree=None):
        """
        :param xml: previously serialized element to restore state from
        :param pathname: directory the alignment results were read from
        :param tree: root element of the parsed config.xml
        """
        self.pathname = pathname
        self.tree = tree

        # parse lane parameters out of the config.xml file
        self.lanes = LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            # relies on the subclass override that fills in the root tag
            self.set_elements(xml)

    def _get_date(self):
        """Return the mtime of pathname, or the current time without one"""
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_chip_attribute(self, value):
        """Return the text of ChipWideRunParameters/<value>, or None"""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # single-argument print() behaves the same on python 2 and 3
        print('Software: %s' % (self.__class__.__name__,))
        print('Alignment version: %s' % (self.version,))
        print('Run date: %s' % (self.date,))
        print('config.xml: %s' % (self.tree,))
        if self.summary is not None:
            self.summary.dump()

    def get_elements(self, root_tag):
        """
        Serialize this alignment as an element named root_tag.

        Returns None when either the config tree or the summary is missing.
        """
        if self.tree is None or self.summary is None:
            return None

        root = ElementTree.Element(root_tag,
                                   {'version': unicode(self.XML_VERSION)})
        root.append(self.tree)
        root.append(self.summary.get_elements())
        if self.eland_results:
            root.append(self.eland_results.get_elements())
        return root

    def set_elements(self, tree, root_tag):
        """
        Restore state from an element produced by get_elements.

        :param tree: element to read
        :param root_tag: tag the element is required to have
        :raises ValueError: if tree.tag is not root_tag
        """
        if tree.tag != root_tag:
            # report the tag we were asked for; the base class has no
            # GERALD attribute to fall back on
            raise ValueError('expected %s, found %s' % (root_tag, tree.tag))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > self.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')

        run_parameters_tag = self.RUN_PARAMETERS.lower()
        summary_tag = self.SUMMARY.lower()
        eland_tag = ELAND.ELAND.lower()

        # start from an empty result set in case there is no eland element
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == run_parameters_tag:
                self.tree = element
            elif tag == summary_tag:
                self.summary = Summary(xml=element)
            elif tag == eland_tag:
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s" % (element.tag,))
96
class Gerald(Alignment):
    """
    Alignment results serialized under the 'Gerald' root tag.

    Run details are read from the ChipWideRunParameters section of the
    config.xml tree.
    """
    GERALD = 'Gerald'

    def _get_date(self):
        """
        Return the run date.

        Uses ChipWideRunParameters/TIME_STAMP when present, otherwise falls
        back to the base class; returns the current time without a tree.
        """
        if self.tree is None:
            return datetime.today()

        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
            return datetime.fromtimestamp(epochstamp)
        return super(Gerald, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        return super(Gerald, self).get_elements(Gerald.GERALD)

    def set_elements(self, tree):
        return super(Gerald, self).set_elements(tree, Gerald.GERALD)

    def _get_experiment_root(self):
        """Return ChipWideRunParameters/EXPT_DIR_ROOT, or None"""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the first path component of EXPT_DIR below EXPT_DIR_ROOT.

        Returns None when the tree or either directory entry is missing,
        or when the resulting name is empty.
        """
        if self.tree is None:
            return None

        expt_root = self._get_experiment_root()
        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        # normpath cannot handle None, so check before normalizing
        if expt_root is None or chip_expt_dir is None:
            return None

        expt_root = os.path.normpath(expt_root)
        experiment_dir = chip_expt_dir.replace(expt_root + os.path.sep, '')
        experiment_dir = experiment_dir.split(os.path.sep)[0]

        if len(experiment_dir) == 0:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Return a (software name, version) tuple parsed from
        ChipWideRunParameters/SOFTWARE_VERSION, or None if it is missing
        or matches neither the GERALD nor the CASAVA pattern.
        """
        if self.tree is None:
            return None
        ga_version = self.tree.findtext(
            'ChipWideRunParameters/SOFTWARE_VERSION')
        if ga_version is None:
            return None

        gerald = re.match(r"@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
                          ga_version)
        if gerald:
            return ('GERALD', gerald.group('version'))
        casava = re.match(r'CASAVA-(?P<version>\d+[.\d]*)',
                          ga_version)
        if casava:
            return ('CASAVA', casava.group('version'))
        return None

    def _get_software(self):
        """Return name of analysis software package"""
        software_version = self._get_software_version()
        return software_version[0] if software_version is not None else None
    software = property(_get_software)

    def _get_version(self):
        """Return version number of software package"""
        software_version = self._get_software_version()
        return software_version[1] if software_version is not None else None
    version = property(_get_version)
165
class CASAVA(Alignment):
    """
    Alignment results serialized under the 'Casava' root tag.

    The config tree is not required to carry a TIME_STAMP element; one is
    added at construction time when it is missing.
    """
    GERALD = 'Casava'

    def __init__(self, xml=None, pathname=None, tree=None):
        super(CASAVA, self).__init__(xml=xml, pathname=pathname, tree=tree)

        self._add_timestamp()

    def _add_timestamp(self):
        """Manually add a time stamp to CASAVA runs"""
        if self.tree is None:
            return
        if len(self.tree.xpath('TIME_STAMP')) == 0:
            time_stamp = self.date.strftime('%c')
            time_element = ElementTree.Element('TIME_STAMP')
            time_element.text = time_stamp
            self.tree.append(time_element)

    def _get_date(self):
        """
        Return the run date, or None without a tree.

        Uses the TIME_STAMP element when exactly one is present, otherwise
        falls back to the base class.
        """
        if self.tree is None:
            return None
        time_element = self.tree.xpath('TIME_STAMP')
        if len(time_element) == 1:
            return datetime.strptime(time_element[0].text, '%c')
        return super(CASAVA, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        return super(CASAVA, self).get_elements(CASAVA.GERALD)

    def set_elements(self, tree):
        return super(CASAVA, self).set_elements(tree, CASAVA.GERALD)

    def _get_runfolder_name(self):
        """
        Return the last path component of Defaults/EXPT_DIR, or None if
        the tree or the entry is missing or the component is empty.
        """
        if self.tree is None:
            return None

        # hiseqs renamed the experiment dir location
        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if defaults_expt_dir is None:
            return None
        _, experiment_dir = os.path.split(defaults_expt_dir)

        if len(experiment_dir) == 0:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Return the Version attribute of the Software element split on '-',
        or None when the tree, the element or the attribute is missing.
        """
        if self.tree is None:
            return None
        hiseq_software_node = self.tree.find('Software')
        if hiseq_software_node is None:
            return None
        software_version = hiseq_software_node.attrib.get('Version', None)
        if software_version is None:
            return None
        return software_version.split('-')

    def _get_software(self):
        """Return the part of the version string before the first '-'"""
        software_version = self._get_software_version()
        if software_version is None:
            return None
        return software_version[0]
    software = property(_get_software)

    def _get_version(self):
        """Return the second '-' separated field of the version string"""
        software_version = self._get_software_version()
        # a version string without a '-' has no second field
        if software_version is None or len(software_version) < 2:
            return None
        return software_version[1]
    version = property(_get_version)
238
239
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Abstract base: every property raises NotImplementedError until a
    subclass overrides the matching _get_* method and property.
    """
    def __init__(self, gerald, lane_id):
        """
        :param gerald: alignment object the lane belongs to
        :param lane_id: identifier of the lane within that alignment
        """
        self._gerald = gerald
        self._lane_id = lane_id

    # NotImplemented is a comparison sentinel and is not callable;
    # NotImplementedError is the exception meant for abstract methods.
    def _get_analysis(self):
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
263
264
class LaneParametersGA(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Values are read from the LaneSpecificRunParameters section of the
    alignment's config tree, whose child tags end in ``_<lane id>``.
    """
    def __init__(self, gerald, lane_id):
        super(LaneParametersGA, self).__init__(gerald, lane_id)

    def __get_attribute(self, xml_tag):
        """
        Return the text stored for this lane under xml_tag.

        Returns None when the section, the tag, or this lane's entry is
        missing.

        :raises RuntimeError: if the tag holds more than
            LANES_PER_FLOWCELL entries
        """
        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
        if subtree is None:
            return None
        container = subtree.find(xml_tag)
        if container is None:
            return None

        # list(element) replaces the deprecated getchildren()
        children = list(container)
        if len(children) > LANES_PER_FLOWCELL:
            raise RuntimeError('GERALD config.xml file changed')

        # the lane id is the part of the tag after the first underscore
        lanes = [child.tag.split('_')[1] for child in children]
        try:
            index = lanes.index(self._lane_id)
        except ValueError:
            return None
        return children[index].text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        genome = self.__get_attribute('ELAND_GENOME')
        # default to the chipwide parameters if there isn't an
        # entry in the lane specific parameters
        if genome is None:
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # ignore flag value
        if genome == 'Need_to_specify_ELAND_genome_directory':
            genome = None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        read_length = self.__get_attribute('READ_LENGTH')
        # fall back to the chipwide value when the lane has none
        if read_length is None:
            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
        return read_length
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES')
    use_bases = property(_get_use_bases)
312
313
class LaneParametersHiSeq(LaneParameters):
    """
    Lane parameters backed by a single XML element.

    Each value is the text of a direct child of the element handed to
    the constructor.
    """
    def __init__(self, gerald, lane_id, element):
        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
        self.element = element

    def __get_attribute(self, xml_tag):
        """Return the text of child xml_tag, or None if it is absent"""
        node = self.element.find(xml_tag)
        return node.text if node is not None else None

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        lane_genome = self.__get_attribute('ELAND_GENOME')
        if lane_genome is None:
            # nothing lane specific, use the chipwide setting instead
            lane_genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        if lane_genome == 'Need_to_specify_ELAND_genome_directory':
            # placeholder value, treat it as unset
            return None
        return lane_genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        return self.__get_attribute('READ_LENGTH1')
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES1')
    use_bases = property(_get_use_bases)
351
class LaneSpecificRunParameters(object):
    """
    Provide access to LaneSpecificRunParameters

    Read-only, dictionary-like view of the lanes described by an
    alignment's config tree.  The lane table is built on first access and
    rebuilt if the alignment's tree object is replaced afterwards.
    """
    def __init__(self, gerald):
        self._gerald = gerald
        # lane table, and the tree it was built from (None: not built yet)
        self._lanes = None
        self._lanes_tree = None

    def _initalize_lanes(self):
        """
        build dictionary of LaneParameters
        """
        tree = self._gerald.tree
        self._lanes = {}
        self._lanes_tree = tree
        if tree is None:
            # nothing parsed yet; leave the table empty
            return
        analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
        if analysis is not None:
            self._extract_ga_analysis_type(analysis)
        projects = tree.find('Projects')
        if projects is not None:
            self._extract_hiseq_analysis_type(projects)

    def _get_lanes(self):
        """Return the lane table, building it when missing or stale"""
        if self._lanes is None or self._lanes_tree is not self._gerald.tree:
            self._initalize_lanes()
        return self._lanes

    def _extract_ga_analysis_type(self, analysis):
        # according to the pipeline specs I think their fields
        # are sampleName_laneID, with sampleName defaulting to s
        # since laneIDs are constant lets just try using
        # those consistently.
        for element in analysis:
            _, lane_id = element.tag.split('_')
            # keyed by the integer lane number; the parameter object
            # keeps the string form for matching against tag names
            self._lanes[int(lane_id)] = LaneParametersGA(
                                          self._gerald, lane_id)

    def _extract_hiseq_analysis_type(self, analysis):
        """Extract from HiSeq style multiplexed analysis types"""
        for element in analysis:
            name = element.attrib['name']
            self._lanes[name] = LaneParametersHiSeq(self._gerald,
                                                    name,
                                                    element)

    def __iter__(self):
        return iter(self._get_lanes())

    def __getitem__(self, key):
        return self._get_lanes()[key]

    def get(self, key, default=None):
        """Return the lane stored under key, or default if there is none"""
        return self._get_lanes().get(key, default)

    def keys(self):
        return self._get_lanes().keys()

    def values(self):
        return self._get_lanes().values()

    def items(self):
        return self._get_lanes().items()

    def __len__(self):
        return len(self._get_lanes())
417
418
def gerald(pathname):
    """
    Load the alignment results stored in a GERALD directory.

    Parses config.xml, then looks for a summary in this order:
    Summary.xml, Summary.htm, and the ../Data/reports/Summary directory.
    The first two produce a Gerald object, the last a CASAVA object.

    :param pathname: directory containing config.xml ('~' is expanded)
    :returns: Gerald or CASAVA object with summary and eland_results set
    :raises IOError: if none of the summary locations exist
    """
    LOGGER.info("Parsing gerald config.xml")
    pathname = os.path.expanduser(pathname)
    config_pathname = os.path.join(pathname, 'config.xml')
    config_tree = ElementTree.parse(config_pathname).getroot()

    # locate the summary report
    summary_xml = os.path.join(pathname, 'Summary.xml')
    summary_htm = os.path.join(pathname, 'Summary.htm')
    report_summary = os.path.join(pathname, '..', 'Data',
                                  'reports', 'Summary', )
    if os.path.exists(summary_xml):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.xml")
        g.summary = SummaryGA(summary_xml)
    elif os.path.exists(summary_htm):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.htm")
        g.summary = SummaryGA(summary_htm)
    elif os.path.isdir(report_summary):
        g = CASAVA(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing %s" % (report_summary,))
        g.summary = SummaryHiSeq(report_summary)
    else:
        # without this branch the function would fail on an unbound name
        raise IOError("No summary found in %s" % (pathname,))

    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
448
if __name__ == "__main__":
    # quick test code: parse the directory named by the first argument
    import sys
    run_directory = sys.argv[1]
    g = gerald(run_directory)
    #ElementTree.dump(g.get_elements())