Use specific time formatting instead of locale '%c'
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """Provide access to information stored in the GERALD directory.
2 """
3 import collections
4 from datetime import datetime, date
5 import logging
6 import os
7 import re
8 import stat
9 import time
10
11 from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
12 from htsworkflow.pipelines.eland import eland, ELAND
13 from htsworkflow.pipelines.samplekey import SampleKey
14
15 from htsworkflow.pipelines.runfolder import \
16    ElementTree, \
17    EUROPEAN_STRPTIME, \
18    LANES_PER_FLOWCELL, \
19    VERSION_RE
20 from htsworkflow.util.ethelp import indent, flatten
21
22 LOGGER = logging.getLogger(__name__)
23
class Alignment(object):
    """Capture meaning out of the GERALD directory.

    Base class holding the parsed ``config.xml`` tree together with the
    summary and ELAND results.  The subclasses in this module supply the
    ``date`` and ``version`` properties used by ``time`` and ``dump()``,
    and wrap ``get_elements`` / ``set_elements`` with their own root tag.
    """
    XML_VERSION = 1
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    def __init__(self, xml=None, pathname=None, tree=None):
        """Initialise the alignment description.

        :param xml: optional serialized element; when given it is handed
            to ``set_elements`` to restore a previously saved state.
        :param pathname: directory the alignment was read from.
        :param tree: parsed config.xml root element.
        """
        self.pathname = pathname
        self.tree = tree

        # parse lane parameters out of the config.xml file
        self.lanes = LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            # Relies on the subclass override of set_elements that takes
            # only the tree; the base implementation also needs root_tag.
            self.set_elements(xml)

    def _get_date(self):
        """Return the modification time of ``pathname``, or now if unset."""
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()

    def _get_time(self):
        # self.date is provided by the subclasses as a property.
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_chip_attribute(self, value):
        """Return the text of ``ChipWideRunParameters/<value>`` or None."""
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # Single-argument print(...) produces the same output whether it
        # is parsed as the Python 2 statement or the Python 3 function.
        print('Software: %s' % (self.__class__.__name__,))
        print('Alignment version: %s' % (self.version,))
        print('Run date: %s' % (self.date,))
        print('config.xml: %s' % (self.tree,))
        self.summary.dump()

    def get_elements(self, root_tag):
        """Serialize this object under a new ``root_tag`` element.

        Returns None when either the config tree or the summary is missing.
        """
        if self.tree is None or self.summary is None:
            return None

        root = ElementTree.Element(
            root_tag, {'version': u'%d' % (self.XML_VERSION,)})
        root.append(self.tree)
        root.append(self.summary.get_elements())
        if self.eland_results:
            root.append(self.eland_results.get_elements())
        return root

    def set_elements(self, tree, root_tag):
        """Restore state from an element produced by ``get_elements``.

        :param tree: element whose tag must equal ``root_tag``.
        :param root_tag: expected tag name of ``tree``.
        :raises ValueError: if ``tree`` has a different tag.
        """
        if tree.tag != root_tag:
            # Report the tag we were asked for; the base class itself has
            # no GERALD attribute to build the message from.
            raise ValueError('expected %s' % (root_tag,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > self.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')

        run_parameters_tag = self.RUN_PARAMETERS.lower()
        summary_tag = self.SUMMARY.lower()
        eland_tag = ELAND.ELAND.lower()

        # Start from an empty ELAND container; replaced below if the tree
        # carries one.
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == run_parameters_tag:
                self.tree = element
            elif tag == summary_tag:
                self.summary = Summary(xml=element)
            elif tag == eland_tag:
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s", element.tag)
97
class Gerald(Alignment):
    """Alignment described by a GERALD style config.xml."""
    GERALD = 'Gerald'

    def _get_date(self):
        """Return the run date.

        Uses ``ChipWideRunParameters/TIME_STAMP`` when present, otherwise
        falls back to the base class; returns the current time when no
        tree has been loaded.
        """
        if self.tree is None:
            return datetime.today()

        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            # time.strptime without a format expects "%a %b %d %H:%M:%S %Y"
            epochstamp = time.mktime(time.strptime(timestamp))
            return datetime.fromtimestamp(epochstamp)
        return super(Gerald, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        """Serialize this object under a ``Gerald`` root element."""
        return super(Gerald, self).get_elements(Gerald.GERALD)

    def set_elements(self, tree):
        """Restore state from a ``Gerald`` root element."""
        return super(Gerald, self).set_elements(tree, Gerald.GERALD)

    def _get_experiment_root(self):
        """Return ``ChipWideRunParameters/EXPT_DIR_ROOT`` or None."""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """Return the first path component of EXPT_DIR below EXPT_DIR_ROOT.

        Returns None when the tree is missing, when either setting is
        absent from the tree, or when the resulting name is empty.
        """
        if self.tree is None:
            return None

        expt_root = self._get_experiment_root()
        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        # Check before normpath(): it cannot handle None, and without both
        # values there is nothing to derive a runfolder name from.
        if expt_root is None or chip_expt_dir is None:
            return None

        expt_root = os.path.normpath(expt_root)
        experiment_dir = chip_expt_dir.replace(expt_root + os.path.sep, '')
        experiment_dir = experiment_dir.split(os.path.sep)[0]

        if not experiment_dir:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """Return a ``(software, version)`` tuple parsed from the tree.

        Returns None when there is no tree, no SOFTWARE_VERSION entry, or
        the entry matches neither the GERALD nor the CASAVA pattern.
        """
        if self.tree is None:
            return None
        ga_version = self.tree.findtext(
            'ChipWideRunParameters/SOFTWARE_VERSION')
        if ga_version is None:
            return None

        gerald = re.match(r"@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
                          ga_version)
        if gerald:
            return ('GERALD', gerald.group('version'))
        casava = re.match(r'CASAVA-(?P<version>\d+[.\d]*)',
                          ga_version)
        if casava:
            return ('CASAVA', casava.group('version'))
        return None

    def _get_software(self):
        """Return name of analysis software package"""
        software_version = self._get_software_version()
        return software_version[0] if software_version is not None else None
    software = property(_get_software)

    def _get_version(self):
        """Return version number of software package"""
        software_version = self._get_software_version()
        return software_version[1] if software_version is not None else None
    version = property(_get_version)
166
class CASAVA(Alignment):
    """Alignment described by a CASAVA style config.xml."""
    GERALD = 'Casava'
    # Explicit layout used both to write and to read the TIME_STAMP
    # element, so the value written by _add_timestamp can always be parsed
    # back by _get_date regardless of what the locale's '%c' expands to.
    TIME_STAMP_FORMAT = "%a %d %b %Y %I:%M:%S %p"

    def __init__(self, xml=None, pathname=None, tree=None):
        super(CASAVA, self).__init__(xml=xml, pathname=pathname, tree=tree)

        self._add_timestamp()

    def _add_timestamp(self):
        """Manually add a time stamp to CASAVA runs"""
        if self.tree is None:
            return
        if len(self.tree.xpath('TIME_STAMP')) == 0:
            # With no TIME_STAMP present, self.date comes from the base
            # class fallback in _get_date.
            time_stamp = self.date.strftime(self.TIME_STAMP_FORMAT)
            time_element = ElementTree.Element('TIME_STAMP')
            time_element.text = time_stamp
            self.tree.append(time_element)

    def _get_date(self):
        """Return the run date parsed from the TIME_STAMP element.

        Returns None when no tree is loaded; falls back to the base class
        when the tree does not hold exactly one TIME_STAMP element.
        """
        if self.tree is None:
            return None
        time_element = self.tree.xpath('TIME_STAMP')
        if len(time_element) == 1:
            timetuple = time.strptime(
                time_element[0].text.strip(),
                self.TIME_STAMP_FORMAT)
            return datetime(*timetuple[:6])
        return super(CASAVA, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        """Serialize this object under a ``Casava`` root element."""
        return super(CASAVA, self).get_elements(CASAVA.GERALD)

    def set_elements(self, tree):
        """Restore state from a ``Casava`` root element."""
        return super(CASAVA, self).set_elements(tree, CASAVA.GERALD)

    def _get_runfolder_name(self):
        """Return the last path component of ``Defaults/EXPT_DIR``.

        Returns None when the tree or the setting is missing, or when the
        path ends in a separator so that the last component is empty.
        """
        if self.tree is None:
            return None

        # hiseqs renamed the experiment dir location
        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if defaults_expt_dir is None:
            return None
        _, experiment_dir = os.path.split(defaults_expt_dir)

        if not experiment_dir:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """Return the ``Software`` node's Version attribute split on '-'.

        Returns None when the tree, the node, or the attribute is missing.
        """
        if self.tree is None:
            return None
        hiseq_software_node = self.tree.find('Software')
        if hiseq_software_node is None:
            return None
        software_version = hiseq_software_node.attrib.get('Version', None)
        if software_version is None:
            return None
        return software_version.split('-')

    def _get_software(self):
        """Return the part of the version string before the first '-'."""
        software_version = self._get_software_version()
        if software_version is None:
            return None
        return software_version[0]
    software = property(_get_software)

    def _get_version(self):
        """Return the second '-' separated field of the version string."""
        software_version = self._get_software_version()
        # A version string without '-' has no second field to report.
        if software_version is None or len(software_version) < 2:
            return None
        return software_version[1]
    version = property(_get_version)
242
243
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Abstract base: every property raises NotImplementedError until a
    subclass overrides the matching ``_get_*`` method and rebinds the
    property.
    """
    def __init__(self, gerald, lane_id):
        """Remember the owning alignment object and the lane identifier."""
        self._gerald = gerald
        self._lane_id = lane_id

    # NotImplementedError is the exception class; NotImplemented is a
    # comparison sentinel and is not callable.
    def _get_analysis(self):
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
267
268
class LaneParametersGA(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Reads per-lane values from the ``LaneSpecificRunParameters`` section of
    the owning object's config tree.
    """
    def __init__(self, gerald, lane_id):
        super(LaneParametersGA, self).__init__(gerald, lane_id)

    def __get_attribute(self, xml_tag):
        """Return the text stored for this lane under ``xml_tag``.

        Returns None when the section, the tag, or this lane's entry is
        missing.

        :raises RuntimeError: if the tag holds more children than
            LANES_PER_FLOWCELL.
        """
        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
        if subtree is None:
            return None
        container = subtree.find(xml_tag)
        if container is None:
            return None
        children = list(container)
        if len(children) > LANES_PER_FLOWCELL:
            raise RuntimeError('GERALD config.xml file changed')
        # child tags look like <sample>_<lane>; match on the lane part
        lanes = [x.tag.split('_')[1] for x in children]
        try:
            index = lanes.index(self._lane_id)
        except ValueError:
            return None
        return children[index].text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        genome = self.__get_attribute('ELAND_GENOME')
        # default to the chipwide parameters if there isn't an
        # entry in the lane specific parameters
        if genome is None:
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # ignore flag value
        if genome == 'Need_to_specify_ELAND_genome_directory':
            genome = None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        read_length = self.__get_attribute('READ_LENGTH')
        # fall back to the chip wide setting when the lane has none
        if read_length is None:
            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
        return read_length
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES')
    use_bases = property(_get_use_bases)
316
317
class LaneParametersHiSeq(LaneParameters):
    """
    Lane parameter accessor backed by a single XML element.

    Values are looked up as direct children of the ``element`` handed to
    the constructor.
    """
    def __init__(self, gerald, lane_id, element):
        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
        self.element = element

    def __get_attribute(self, xml_tag):
        """Return the text of child ``xml_tag``, or None if it is absent."""
        node = self.element.find(xml_tag)
        return None if node is None else node.text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        """Return the ELAND genome, preferring the element's own value."""
        genome = self.__get_attribute('ELAND_GENOME')
        if genome is None:
            # nothing on the element itself: use the chip wide parameter
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        if genome == 'Need_to_specify_ELAND_genome_directory':
            # placeholder flag value, treat as unset
            return None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        return self.__get_attribute('READ_LENGTH1')
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES1')
    use_bases = property(_get_use_bases)
355
class LaneSpecificRunParameters(collections.MutableMapping):
    """
    Provide access to LaneSpecificRunParameters

    Mapping from SampleKey to LaneParameters objects.  The mapping is
    built from the owning object's config tree the first time it is used.
    Lookups accept either a SampleKey or a value that is wrapped as
    ``SampleKey(lane=value)`` and compared with ``SampleKey.matches``.
    """
    def __init__(self, gerald):
        self._gerald = gerald
        self._lanes = None

    def _ensure_lanes(self):
        """Build the lane dictionary on first use and return it."""
        if self._lanes is None:
            self._initialize_lanes()
        return self._lanes

    def _initialize_lanes(self):
        """
        build dictionary of LaneParameters
        """
        self._lanes = {}
        tree = self._gerald.tree
        analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
        if analysis is not None:
            self._extract_ga_analysis_type(analysis)
        analysis = tree.find('Projects')
        if analysis is not None:
            self._extract_hiseq_analysis_type(analysis)

    def _extract_ga_analysis_type(self, analysis):
        # according to the pipeline specs I think their fields
        # are sampleName_laneID, with sampleName defaulting to s
        # since laneIDs are constant lets just try using
        # those consistently.
        for element in analysis:
            sample, lane_id = element.tag.split('_')
            key = SampleKey(lane=int(lane_id), sample=sample)
            self._lanes[key] = LaneParametersGA(
                                          self._gerald, lane_id)

    def _extract_hiseq_analysis_type(self, analysis):
        """Extract from HiSeq style multiplexed analysis types"""
        for element in analysis:
            name = element.attrib['name']
            key = SampleKey(sample=name)
            self._lanes[key] = LaneParametersHiSeq(self._gerald,
                                                   name,
                                                   element)

    def __iter__(self):
        return iter(self._ensure_lanes())

    def __getitem__(self, key):
        lanes = self._ensure_lanes()
        value = lanes.get(key, None)
        if value is not None:
            return value
        # no exact hit: try a SampleKey based match
        real_key = self._find_key(key)
        if real_key is not None:
            return lanes[real_key]
        raise KeyError("%s not found in %s" % (
            repr(key),
            ",".join((repr(k) for k in lanes.keys()))))

    def __setitem__(self, key, value):
        lanes = self._ensure_lanes()
        if len(lanes) > 100:
            LOGGER.warning(
                "many projects loaded, consider improving dictionary")
        # reuse the stored key when the new one matches an existing entry
        real_key = self._find_key(key)
        if real_key is not None:
            key = real_key
        lanes[key] = value

    def __delitem__(self, key):
        # Deleting a key that matches nothing is silently ignored.
        lanes = self._ensure_lanes()
        if key in lanes:
            del lanes[key]
        else:
            real_key = self._find_key(key)
            if real_key is not None:
                del lanes[real_key]

    def __len__(self):
        return len(self._ensure_lanes())

    def _find_key(self, lookup_key):
        """Return the single stored key matching ``lookup_key``, or None.

        :raises ValueError: if more than one stored key matches.
        """
        if not isinstance(lookup_key, SampleKey):
            lookup_key = SampleKey(lane=lookup_key)

        results = [k for k in self._ensure_lanes() if k.matches(lookup_key)]
        if len(results) > 1:
            errmsg = "Key %s matched multiple keys: %s"
            raise ValueError(errmsg % (str(lookup_key),
                                       ",".join((str(x) for x in results))))

        if len(results) == 1:
            return results[0]
        return None
453
def gerald(pathname):
    """Build an alignment object from the directory ``pathname``.

    Reads ``config.xml`` from the directory, then picks the first summary
    source that exists: ``Summary.xml`` or ``Summary.htm`` in the
    directory (giving a Gerald object), or the
    ``../Data/reports/Summary`` directory (giving a CASAVA object).

    :param pathname: directory to parse; ``~`` is expanded.
    :returns: a Gerald or CASAVA instance with ``summary`` and
        ``eland_results`` filled in.
    :raises IOError: if none of the summary sources exists.
    """
    LOGGER.info("Parsing gerald config.xml")
    pathname = os.path.expanduser(pathname)
    config_pathname = os.path.join(pathname, 'config.xml')
    config_tree = ElementTree.parse(config_pathname).getroot()

    # locate the summary report
    summary_xml = os.path.join(pathname, 'Summary.xml')
    summary_htm = os.path.join(pathname, 'Summary.htm')
    report_summary = os.path.join(pathname, '..', 'Data',
                                  'reports', 'Summary', )
    if os.path.exists(summary_xml):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.xml")
        g.summary = SummaryGA(summary_xml)
    elif os.path.exists(summary_htm):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.htm")
        g.summary = SummaryGA(summary_htm)
    elif os.path.isdir(report_summary):
        g = CASAVA(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing %s" % (report_summary,))
        g.summary = SummaryHiSeq(report_summary)
    else:
        # Without this branch the function would fall through to the
        # return with no object built.
        # NOTE(review): IOError chosen as the type for a missing input
        # file; confirm callers handle it the way they should.
        raise IOError(
            "No Summary.xml, Summary.htm or %s found for %s" % (
                report_summary, pathname))

    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
483
if __name__ == "__main__":
    # Manual smoke test: parse the directory given as the first
    # command line argument.
    import sys
    g = gerald(sys.argv[1])
    #ElementTree.dump(g.get_elements())