"""Provide access to information stored in the GERALD directory."""
from datetime import datetime, date
import logging
import os
import re
import stat
import time

from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
from htsworkflow.pipelines.eland import eland, ELAND
from htsworkflow.pipelines.runfolder import \
    ElementTree, \
    LANES_PER_FLOWCELL
from htsworkflow.util.ethelp import indent, flatten
# Module-level logger shared by every class and function in this file.
LOGGER = logging.getLogger(__name__)
class Alignment(object):
    """
    Capture meaning out of the GERALD directory

    Base class holding the parsed config.xml tree, the summary object and
    the eland results.  Subclasses are expected to provide ``GERALD`` (the
    root tag of their serialized form) plus ``date`` and ``version``
    properties; this base class only reads them.
    """
    XML_VERSION = 1
    RUN_PARAMETERS = 'RunParameters'
    SUMMARY = 'Summary'

    def __init__(self, xml=None, pathname=None, tree=None):
        """
        :param xml: previously serialized element to restore state from
        :param pathname: directory the alignment results were read from
        :param tree: root element of the parsed config.xml
        """
        self.pathname = pathname
        self.tree = tree

        # parse lane parameters out of the config.xml file
        self.lanes = LaneSpecificRunParameters(self)

        self.summary = None
        self.eland_results = None

        if xml is not None:
            # subclasses override set_elements() to supply their root tag
            self.set_elements(xml)

    def _get_date(self):
        """Return the mtime of ``pathname``, or the current time without one."""
        if self.pathname is not None:
            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
            return datetime.fromtimestamp(epochstamp)
        return datetime.today()

    def _get_time(self):
        return time.mktime(self.date.timetuple())
    time = property(_get_time, doc='return run time as seconds since epoch')

    def _get_chip_attribute(self, value):
        """Return the text of ChipWideRunParameters/<value>, or None."""
        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))

    def dump(self):
        """
        Debugging function, report current object
        """
        # Single-argument print() produces the same text on Python 2 and 3.
        print('Software: %s' % (self.__class__.__name__,))
        print('Alignment version: %s' % (self.version,))
        print('Run date: %s' % (self.date,))
        print('config.xml: %s' % (self.tree,))
        if self.summary is not None:
            self.summary.dump()

    def get_elements(self, root_tag):
        """
        Serialize this object under a new element named ``root_tag``.

        Returns None when either the config tree or the summary is missing.
        """
        if self.tree is None or self.summary is None:
            return None

        gerald = ElementTree.Element(
            root_tag, {'version': str(Alignment.XML_VERSION)})
        gerald.append(self.tree)
        gerald.append(self.summary.get_elements())
        if self.eland_results:
            gerald.append(self.eland_results.get_elements())
        return gerald

    def set_elements(self, tree, root_tag):
        """
        Restore state from an element produced by get_elements().

        :raises ValueError: if ``tree`` is not rooted at ``root_tag``
        """
        if tree.tag != root_tag:
            raise ValueError('expected %s' % (root_tag,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Alignment.XML_VERSION:
            LOGGER.warning('XML tree is a higher version than this class')
        self.eland_results = ELAND()
        for element in list(tree):
            tag = element.tag.lower()
            if tag == Alignment.RUN_PARAMETERS.lower():
                self.tree = element
            elif tag == Alignment.SUMMARY.lower():
                self.summary = Summary(xml=element)
            elif tag == ELAND.ELAND.lower():
                self.eland_results = ELAND(xml=element)
            else:
                LOGGER.warning("Unrecognized tag %s" % (element.tag,))
class Gerald(Alignment):
    """Alignment information read from a GERALD config.xml tree."""
    GERALD = 'Gerald'

    def _get_date(self):
        """
        Return the run date.

        Uses ChipWideRunParameters/TIME_STAMP when present, otherwise falls
        back to the base class; with no tree at all returns the current time.
        """
        if self.tree is None:
            return datetime.today()

        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
        if timestamp is not None:
            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
            return datetime.fromtimestamp(epochstamp)
        return super(Gerald, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        return super(Gerald, self).get_elements(Gerald.GERALD)

    def set_elements(self, tree):
        return super(Gerald, self).set_elements(tree, Gerald.GERALD)

    def _get_experiment_root(self):
        """Return ChipWideRunParameters/EXPT_DIR_ROOT, or None."""
        if self.tree is None:
            return None
        return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')

    def _get_runfolder_name(self):
        """
        Return the first path component of EXPT_DIR below EXPT_DIR_ROOT.

        Returns None when the tree or either path is missing, or when the
        resulting component is empty.
        """
        if self.tree is None:
            return None

        expt_root = self._get_experiment_root()
        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
        # Check for missing values before normpath(), which rejects None.
        if expt_root is None or chip_expt_dir is None:
            return None

        expt_root = os.path.normpath(expt_root)
        experiment_dir = chip_expt_dir.replace(expt_root + os.path.sep, '')
        experiment_dir = experiment_dir.split(os.path.sep)[0]

        if not experiment_dir:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Return a (package, version) tuple parsed from SOFTWARE_VERSION.

        Returns None when the tree or the field is missing, or when the
        value matches neither the GERALD nor the CASAVA pattern.
        """
        if self.tree is None:
            return None
        ga_version = self.tree.findtext(
            'ChipWideRunParameters/SOFTWARE_VERSION')
        if ga_version is None:
            return None

        gerald = re.match(r"@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
                          ga_version)
        if gerald:
            return ('GERALD', gerald.group('version'))
        casava = re.match(r'CASAVA-(?P<version>\d+[.\d]*)',
                          ga_version)
        if casava:
            return ('CASAVA', casava.group('version'))
        return None

    def _get_software(self):
        """Return name of analysis software package"""
        software_version = self._get_software_version()
        return software_version[0] if software_version is not None else None
    software = property(_get_software)

    def _get_version(self):
        """Return version number of software package"""
        software_version = self._get_software_version()
        return software_version[1] if software_version is not None else None
    version = property(_get_version)
class CASAVA(Alignment):
    """Alignment information read from a CASAVA (HiSeq) config.xml tree."""
    GERALD = 'Casava'

    def __init__(self, xml=None, pathname=None, tree=None):
        super(CASAVA, self).__init__(xml=xml, pathname=pathname, tree=tree)

        self._add_timestamp()

    def _add_timestamp(self):
        """Manually add a time stamp to CASAVA runs"""
        if self.tree is None:
            return
        # findall() on a bare tag name selects the same direct children as
        # the xpath() call, and is available on both lxml and the stdlib.
        if len(self.tree.findall('TIME_STAMP')) == 0:
            time_stamp = self.date.strftime('%c')
            time_element = ElementTree.Element('TIME_STAMP')
            time_element.text = time_stamp
            self.tree.append(time_element)

    def _get_date(self):
        """
        Return the run date.

        Uses the single TIME_STAMP child when there is exactly one,
        otherwise defers to the base class.
        """
        if self.tree is None:
            # NOTE(review): mirrors Gerald._get_date for a missing tree;
            # confirm this matches the intended CASAVA behaviour.
            return datetime.today()
        time_element = self.tree.findall('TIME_STAMP')
        if len(time_element) == 1:
            return datetime.strptime(time_element[0].text, '%c')
        return super(CASAVA, self)._get_date()
    date = property(_get_date)

    def get_elements(self):
        tree = super(CASAVA, self).get_elements(CASAVA.GERALD)
        return tree

    def set_elements(self, tree):
        return super(CASAVA, self).set_elements(tree, CASAVA.GERALD)

    def _get_runfolder_name(self):
        """Return the last path component of Defaults/EXPT_DIR, or None."""
        if self.tree is None:
            return None

        # hiseqs renamed the experiment dir location
        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
        if defaults_expt_dir is None:
            return None
        _, experiment_dir = os.path.split(defaults_expt_dir)

        if not experiment_dir:
            return None
        return experiment_dir

    runfolder_name = property(_get_runfolder_name)

    def _get_software_version(self):
        """
        Return the Software/@Version attribute split on '-'.

        Returns None when the tree, the Software node or the attribute
        is missing.
        """
        if self.tree is None:
            return None
        hiseq_software_node = self.tree.find('Software')
        if hiseq_software_node is None:
            return None
        software_version = hiseq_software_node.attrib.get('Version', None)
        if software_version is None:
            return None
        return software_version.split('-')

    def _get_software(self):
        """Return the first '-'-separated field of the version string."""
        software_version = self._get_software_version()
        if software_version is None:
            return None
        return software_version[0]
    software = property(_get_software)

    def _get_version(self):
        """Return the second '-'-separated field of the version string."""
        software_version = self._get_software_version()
        # A version string without '-' has no second field to report.
        if software_version is None or len(software_version) < 2:
            return None
        return software_version[1]
    version = property(_get_version)
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Abstract base: every property raises NotImplementedError until a
    subclass overrides the corresponding ``_get_*`` method and property.
    """
    def __init__(self, gerald, lane_id):
        """
        :param gerald: the alignment object that owns this lane
        :param lane_id: identifier of the lane within that object
        """
        self._gerald = gerald
        self._lane_id = lane_id

    def _get_analysis(self):
        # NotImplemented is a comparison sentinel, not an exception class;
        # NotImplementedError is the one that can be raised.
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
class LaneParametersGA(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Reads per-lane values from the LaneSpecificRunParameters section of the
    owning object's tree, where each parameter element holds one child per
    lane whose tag ends in ``_<lane id>``.
    """
    def __init__(self, gerald, lane_id):
        super(LaneParametersGA, self).__init__(gerald, lane_id)

    def __get_attribute(self, xml_tag):
        """
        Return the text recorded for this lane under ``xml_tag``.

        Returns None when the section, the parameter or this lane's entry
        is absent.

        :raises RuntimeError: if the parameter has more children than
            LANES_PER_FLOWCELL
        """
        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
        if subtree is None:
            return None
        container = subtree.find(xml_tag)
        if container is None:
            return None
        children = list(container)
        if len(children) > LANES_PER_FLOWCELL:
            raise RuntimeError('GERALD config.xml file changed')
        lanes = [x.tag.split('_')[1] for x in children]
        try:
            index = lanes.index(self._lane_id)
        except ValueError:
            return None
        element = container[index]
        return element.text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        genome = self.__get_attribute('ELAND_GENOME')
        # default to the chipwide parameters if there isn't an
        # entry in the lane specific parameters
        if genome is None:
            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # ignore flag value
        if genome == 'Need_to_specify_ELAND_genome_directory':
            genome = None
        return genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        read_length = self.__get_attribute('READ_LENGTH')
        if read_length is None:
            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
        # NOTE(review): integer conversion is reconstructed from a damaged
        # listing; confirm callers expect an int here.
        if read_length is None:
            return None
        return int(read_length)
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES')
    use_bases = property(_get_use_bases)
class LaneParametersHiSeq(LaneParameters):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    Each instance wraps one element and answers every lookup from that
    element's children.
    """
    def __init__(self, gerald, lane_id, element):
        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
        self.element = element

    def __get_attribute(self, xml_tag):
        """Return the text of the ``xml_tag`` child, or None if absent."""
        node = self.element.find(xml_tag)
        return None if node is None else node.text

    def _get_analysis(self):
        return self.__get_attribute('ANALYSIS')
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        lane_genome = self.__get_attribute('ELAND_GENOME')
        # fall back to the chip-wide value when the lane has no entry
        if lane_genome is None:
            lane_genome = self._gerald._get_chip_attribute('ELAND_GENOME')
        # the placeholder value means no genome was configured
        if lane_genome == 'Need_to_specify_ELAND_genome_directory':
            return None
        return lane_genome
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        return self.__get_attribute('READ_LENGTH1')
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        return self.__get_attribute('USE_BASES1')
    use_bases = property(_get_use_bases)
class LaneSpecificRunParameters(object):
    """
    Provide access to LaneSpecificRunParameters

    Behaves like a read-only mapping of lane key to LaneParameters object.
    The mapping is built on first access and then reused.
    """
    def __init__(self, gerald):
        self._gerald = gerald
        # None means "not built yet"; see _get_lanes()
        self._lanes = None

    def _get_lanes(self):
        """Return the lane dictionary, building it on first use."""
        if self._lanes is None:
            if self._gerald.tree is None:
                # Nothing to parse yet.  Do not cache the empty result, so
                # a tree attached later is still picked up.
                return {}
            self._initalize_lanes()
        return self._lanes

    def _initalize_lanes(self):
        """
        build dictionary of LaneParameters
        """
        self._lanes = {}
        tree = self._gerald.tree
        analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
        if analysis is not None:
            self._extract_ga_analysis_type(analysis)
        analysis = tree.find('Projects')
        if analysis is not None:
            self._extract_hiseq_analysis_type(analysis)

    def _extract_ga_analysis_type(self, analysis):
        # according to the pipeline specs I think their fields
        # are sampleName_laneID, with sampleName defaulting to s
        # since laneIDs are constant lets just try using
        # those consistently.
        for element in analysis:
            sample, lane_id = element.tag.split('_')
            self._lanes[int(lane_id)] = LaneParametersGA(
                                            self._gerald, lane_id)

    def _extract_hiseq_analysis_type(self, analysis):
        """Extract from HiSeq style multiplexed analysis types"""
        for element in analysis:
            name = element.attrib['name']
            # NOTE(review): the middle argument was lost from the listing;
            # the element's name is assumed to be the lane id.
            self._lanes[name] = LaneParametersHiSeq(self._gerald,
                                                    name,
                                                    element)

    def __iter__(self):
        return iter(self._get_lanes())

    def __getitem__(self, key):
        return self._get_lanes()[key]

    def get(self, key, default=None):
        """Return the lane for ``key``, or ``default`` if there is none."""
        return self._get_lanes().get(key, default)

    def keys(self):
        return list(self._get_lanes().keys())

    def values(self):
        return list(self._get_lanes().values())

    def items(self):
        return list(self._get_lanes().items())

    def __len__(self):
        return len(self._get_lanes())
def gerald(pathname):
    """
    Build an alignment object from a GERALD or CASAVA results directory.

    Parses ``config.xml`` in ``pathname``, then picks the reader from the
    first summary report found: ``Summary.xml``, then ``Summary.htm``
    (both Gerald), then the ``../Data/reports/Summary`` directory (CASAVA).

    :param pathname: results directory; ``~`` is expanded
    :returns: a Gerald or CASAVA instance with ``summary`` and
        ``eland_results`` filled in
    :raises IOError: if none of the summary reports exist
    """
    LOGGER.info("Parsing gerald config.xml")
    pathname = os.path.expanduser(pathname)
    config_pathname = os.path.join(pathname, 'config.xml')
    config_tree = ElementTree.parse(config_pathname).getroot()

    # parse Summary.htm file
    summary_xml = os.path.join(pathname, 'Summary.xml')
    summary_htm = os.path.join(pathname, 'Summary.htm')
    report_summary = os.path.join(pathname, '..', 'Data',
                                  'reports', 'Summary', )
    if os.path.exists(summary_xml):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.xml")
        g.summary = SummaryGA(summary_xml)
    elif os.path.exists(summary_htm):
        g = Gerald(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing Summary.htm")
        g.summary = SummaryGA(summary_htm)
    elif os.path.isdir(report_summary):
        g = CASAVA(pathname=pathname, tree=config_tree)
        LOGGER.info("Parsing %s" % (report_summary,))
        g.summary = SummaryHiSeq(report_summary)
    else:
        # Without this branch the function would fail later on an
        # unbound local instead of naming the missing files.
        raise IOError(
            "Unable to find a summary report in %s" % (pathname,))

    # parse eland files
    g.eland_results = eland(g.pathname, g)
    return g
if __name__ == "__main__":
    # quick test: parse the results directory named on the command line
    import sys
    g = gerald(sys.argv[1])
    #ElementTree.dump(g.get_elements())