Minimal changes needed to get raw data archived for loxcyc.
[htsworkflow.git] / htsworkflow / pipelines / gerald.py
1 """
2 Provide access to information stored in the GERALD directory.
3 """
4 from datetime import datetime, date
5 import logging
6 import os
7 import stat
8 import time
9
10 from htsworkflow.pipelines.summary import Summary
11 from htsworkflow.pipelines.eland import eland, ELAND
12
13 from htsworkflow.pipelines.runfolder import \
14    ElementTree, \
15    EUROPEAN_STRPTIME, \
16    LANES_PER_FLOWCELL, \
17    VERSION_RE
18 from htsworkflow.util.ethelp import indent, flatten
19
20 LOGGER = logging.getLogger(__name__)
21
class Gerald(object):
    """
    Capture meaning out of the GERALD directory

    An instance holds the parsed GERALD ``config.xml`` element tree in
    ``self.tree``, a ``Summary`` object in ``self.summary`` and the ELAND
    results in ``self.eland_results``.  Instances are either filled in by
    the module level ``gerald()`` helper or restored from a previously
    serialized XML element passed as ``xml``.
    """
    XML_VERSION = 1
    # tag names used when (de)serializing with get_elements/set_elements
    GERALD = 'Gerald'
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    class LaneParameters(object):
        """
        Make it easy to access elements of LaneSpecificRunParameters from python

        Each property looks up the entry for one lane underneath the
        ``LaneSpecificRunParameters`` element of the owning Gerald's tree.
        """
        def __init__(self, gerald, lane_id):
            """
            :param gerald: the Gerald instance whose tree is queried
            :param lane_id: lane identifier as it appears after the last
                            underscore of the per-lane tag (e.g. '1' in s_1)
            """
            self._gerald = gerald
            self._lane_id = lane_id

        def __get_attribute(self, xml_tag):
            """
            Return the text stored for this lane under xml_tag.

            Returns None when the tree, the LaneSpecificRunParameters
            section, the requested tag, or the entry for this lane is
            missing.

            :raises RuntimeError: if the container holds more children than
                                  LANES_PER_FLOWCELL
            """
            tree = self._gerald.tree
            if tree is None:
                return None
            subtree = tree.find('LaneSpecificRunParameters')
            if subtree is None:
                return None
            container = subtree.find(xml_tag)
            if container is None:
                return None
            # list(element) replaces Element.getchildren(), which newer
            # ElementTree versions no longer provide.
            children = list(container)
            if len(children) > LANES_PER_FLOWCELL:
                raise RuntimeError('GERALD config.xml file changed')
            wanted = str(self._lane_id)
            for child in children:
                # tags look like sampleName_laneID; split on the last
                # underscore so a sample name may itself contain one
                if child.tag.rsplit('_', 1)[-1] == wanted:
                    return child.text
            return None

        def _get_analysis(self):
            return self.__get_attribute('ANALYSIS')
        analysis = property(_get_analysis)

        def _get_eland_genome(self):
            genome = self.__get_attribute('ELAND_GENOME')
            # default to the chipwide parameters if there isn't an
            # entry in the lane specific parameters
            if genome is None:
                genome = self._gerald._get_chip_attribute('ELAND_GENOME')
            # ignore flag value
            if genome == 'Need_to_specify_ELAND_genome_directory':
                genome = None
            return genome
        eland_genome = property(_get_eland_genome)

        def _get_read_length(self):
            read_length = self.__get_attribute('READ_LENGTH')
            # fall back to the chip wide value when the lane has none
            if read_length is None:
                read_length = self._gerald._get_chip_attribute('READ_LENGTH')
            return read_length
        read_length = property(_get_read_length)

        def _get_use_bases(self):
            return self.__get_attribute('USE_BASES')
        use_bases = property(_get_use_bases)

    class LaneSpecificRunParameters(object):
        """
        Provide access to LaneSpecificRunParameters

        Behaves like a read-only mapping from integer lane number to
        LaneParameters.  The mapping is built on first use and rebuilt
        whenever the owning Gerald's ``tree`` attribute is rebound to a
        different object.
        """
        def __init__(self, gerald):
            self._gerald = gerald
            # None means "not built yet"; see _ensure_lanes
            self._lanes = None
            # the tree object the current mapping was built from
            self._lanes_tree = None

        def _initalize_lanes(self):
            """
            build dictionary of LaneParameters
            """
            tree = self._gerald.tree
            self._lanes = {}
            self._lanes_tree = tree
            if tree is None:
                return
            analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
            if analysis is None:
                return
            # according to the pipeline specs I think their fields
            # are sampleName_laneID, with sampleName defaulting to s
            # since laneIDs are constant lets just try using
            # those consistently.
            for element in analysis:
                lane_id = element.tag.rsplit('_', 1)[-1]
                self._lanes[int(lane_id)] = Gerald.LaneParameters(
                                              self._gerald, lane_id)

        def _ensure_lanes(self):
            """
            Return the lane dictionary, (re)building it only when needed.
            """
            # Gerald creates this object before its tree is assigned, so
            # the cache must notice when the tree attribute is replaced.
            if self._lanes is None or self._lanes_tree is not self._gerald.tree:
                self._initalize_lanes()
            return self._lanes

        def __getitem__(self, key):
            return self._ensure_lanes()[key]

        def __contains__(self, key):
            return key in self._ensure_lanes()

        def __iter__(self):
            return iter(self._ensure_lanes())

        def get(self, key, default=None):
            """
            Return the LaneParameters for key, or default if key is unknown.
            """
            return self._ensure_lanes().get(key, default)

        def keys(self):
            return self._ensure_lanes().keys()

        def values(self):
            return self._ensure_lanes().values()

        def items(self):
            return self._ensure_lanes().items()

        def __len__(self):
            return len(self._ensure_lanes())

    def __init__(self, xml=None):
        """
        :param xml: optional element previously produced by get_elements();
                    when given the object is restored from it.
        """
        self.pathname = None
        self.tree = None

        # parse lane parameters out of the config.xml file
        self.lanes = Gerald.LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            self.set_elements(xml)

    def _get_date(self):
        """
        Return the run date as a datetime.

        Uses ChipWideRunParameters/TIME_STAMP when present, otherwise the
        modification time of self.pathname, otherwise the current time.
        """
        if self.tree is None:
            return datetime.today()
        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            # NOTE(review): '%c' is the locale's date/time representation,
            # so parsing depends on the active locale.
            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
            return datetime.fromtimestamp(epochstamp)
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()
    date = property(_get_date)

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_experiment_root(self):
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the runfolder name recorded in config.xml, or None.

        The last path component of Defaults/EXPT_DIR is used when that
        entry is present and non-empty.  Otherwise the first component of
        ChipWideRunParameters/EXPT_DIR, taken relative to EXPT_DIR_ROOT,
        is returned.
        """
        if self.tree is None:
            return None

        default_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if default_dir:
            name = os.path.split(default_dir)[1]
            if name:
                return name

        # Fall back to the chip wide entry instead of discarding it.
        chip_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        if not chip_dir:
            return None
        root = self._get_experiment_root()
        if root:
            # make sure the root ends with a separator before stripping it
            root = os.path.join(root, '')
            if chip_dir.startswith(root):
                chip_dir = chip_dir[len(root):]
        name = chip_dir.split(os.path.sep)[0]
        if len(name) == 0:
            return None
        return name
    runfolder_name = property(_get_runfolder_name)

    def _get_version(self):
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
    version = property(_get_version)

    def _get_chip_attribute(self, value):
        """
        Return the text of ChipWideRunParameters/<value>, or None.
        """
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # single-argument print(...) behaves the same as a statement
        # and as a function call
        print('Gerald version: %s' % (self.version,))
        print('Gerald run date: %s' % (self.date,))
        print('Gerald config.xml: %s' % (self.tree,))
        if self.summary is not None:
            self.summary.dump()

    def get_elements(self):
        """
        Serialize this object to an element tagged Gerald.GERALD.

        Returns None unless both the config tree and the summary are set.
        """
        if self.tree is None or self.summary is None:
            return None

        gerald = ElementTree.Element(Gerald.GERALD,
                                     {'version': u'%d' % (Gerald.XML_VERSION,)})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree):
        """
        Restore this object from an element produced by get_elements().

        :raises ValueError: if the root tag is not Gerald.GERALD
        """
        if tree.tag != Gerald.GERALD:
            raise ValueError('expected GERALD')
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Gerald.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')
        # start from an empty result set in case no ELAND element is present
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == Gerald.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Gerald.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s", element.tag)
236
def gerald(pathname):
    """
    Build a Gerald object from the GERALD directory at pathname.

    Parses ``config.xml`` from the directory, then a summary file, then
    the eland results.  The summary is taken from the first of these that
    exists: ``Summary.xml``, ``Summary.htm``; if neither exists the path
    ``../Data/Status_Files/Summary.htm`` (relative to the directory) is
    handed to Summary without checking for its existence.

    :param pathname: path to the GERALD directory; a leading ``~`` is
                     expanded
    :return: the populated Gerald instance
    """
    g = Gerald()
    g.pathname = os.path.expanduser(pathname)
    LOGGER.info("Parsing gerald config.xml")
    config_pathname = os.path.join(g.pathname, 'config.xml')
    g.tree = ElementTree.parse(config_pathname).getroot()

    # parse Summary.htm file
    summary_xml = os.path.join(g.pathname, 'Summary.xml')
    summary_htm = os.path.join(g.pathname, 'Summary.htm')
    status_files_summary = os.path.join(g.pathname, '..', 'Data', 'Status_Files', 'Summary.htm')
    if os.path.exists(summary_xml):
        LOGGER.info("Parsing Summary.xml")
        summary_pathname = summary_xml
    elif os.path.exists(summary_htm):
        LOGGER.info("Parsing Summary.htm")
        summary_pathname = summary_htm
    else:
        LOGGER.info("Parsing %s", status_files_summary)
        summary_pathname = status_files_summary
    g.summary = Summary(summary_pathname)
    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
262
if __name__ == "__main__":
    # Quick manual check: parse the GERALD directory given as the first
    # command line argument.
    import sys
    g = gerald(sys.argv[1])
    #ElementTree.dump(g.get_elements())