2 Provide access to information stored in the GERALD directory.
4 from datetime import datetime, date
11 from htsworkflow.pipelines.summary import Summary, SummaryGA, SummaryHiSeq
12 from htsworkflow.pipelines.eland import eland, ELAND
14 from htsworkflow.pipelines.runfolder import \
19 from htsworkflow.util.ethelp import indent, flatten
21 LOGGER = logging.getLogger(__name__)
23 class Alignment(object):
25 Capture meaning out of the GERALD directory
28 RUN_PARAMETERS='RunParameters'
31 def __init__(self, xml=None, pathname=None, tree=None):
32 self.pathname = pathname
35 # parse lane parameters out of the config.xml file
36 self.lanes = LaneSpecificRunParameters(self)
39 self.eland_results = None
42 self.set_elements(xml)
45 return time.mktime(self.date.timetuple())
46 time = property(_get_time, doc='return run time as seconds since epoch')
def _get_chip_attribute(self, value):
    """
    Look up one chip-wide run parameter in the parsed config tree.

    value is the tag name of the child of ChipWideRunParameters to read.
    Returns whatever self.tree.findtext() yields for that path.
    """
    chip_path = 'ChipWideRunParameters/%s' % (value,)
    return self.tree.findtext(chip_path)
53 Debugging function, report current object
55 print 'Software:'. self.__class__.__name__
56 print 'Alignment version:', self.version
57 print 'Run date:', self.date
58 print 'config.xml:', self.tree
61 def get_elements(self):
62 if self.tree is None or self.summary is None:
65 gerald = ElementTree.Element(Gerald.GERALD,
66 {'version': unicode(Gerald.XML_VERSION)})
67 gerald.append(self.tree)
68 gerald.append(self.summary.get_elements())
69 if self.eland_results:
70 gerald.append(self.eland_results.get_elements())
73 def set_elements(self, tree):
74 if tree.tag != self.__class__.GERALD:
75 raise ValueError('expected GERALD')
76 xml_version = int(tree.attrib.get('version', 0))
77 if xml_version > Gerald.XML_VERSION:
78 LOGGER.warn('XML tree is a higher version than this class')
79 self.eland_results = ELAND()
80 for element in list(tree):
81 tag = element.tag.lower()
82 if tag == Gerald.RUN_PARAMETERS.lower():
84 elif tag == Gerald.SUMMARY.lower():
85 self.summary = Summary(xml=element)
86 elif tag == ELAND.ELAND.lower():
87 self.eland_results = ELAND(xml=element)
89 LOGGER.warn("Unrecognized tag %s" % (element.tag,))
91 class Gerald(Alignment):
96 return datetime.today()
97 timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
98 if timestamp is not None:
99 epochstamp = time.mktime(time.strptime(timestamp, '%c'))
100 return datetime.fromtimestamp(epochstamp)
101 if self.pathname is not None:
102 epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
103 return datetime.fromtimestamp(epochstamp)
104 return datetime.today()
105 date = property(_get_date)
107 def _get_experiment_root(self):
108 if self.tree is None:
110 return self.tree.findtext('ChipWideRunParameters/EXPT_DIR_ROOT')
112 def _get_runfolder_name(self):
113 if self.tree is None:
116 expt_root = os.path.normpath(self._get_experiment_root())
117 chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
119 if expt_root is not None and chip_expt_dir is not None:
120 experiment_dir = chip_expt_dir.replace(expt_root+os.path.sep, '')
121 experiment_dir = experiment_dir.split(os.path.sep)[0]
123 if experiment_dir is None or len(experiment_dir) == 0:
125 return experiment_dir
127 runfolder_name = property(_get_runfolder_name)
129 def _get_software_version(self):
130 if self.tree is None:
132 ga_version = self.tree.findtext(
133 'ChipWideRunParameters/SOFTWARE_VERSION')
134 if ga_version is not None:
135 gerald = re.match("@.*GERALD.pl,v (?P<version>\d+(\.\d+)+)",
138 return ('GERALD', gerald.group('version'))
139 casava = re.match('CASAVA-(?P<version>\d+[.\d]*)',
142 return ('CASAVA', casava.group('version'))
def _get_software(self):
    """
    Return name of analysis software package

    The name is the first item of what _get_software_version() returns;
    None is passed through when that call returns None.
    """
    name_and_version = self._get_software_version()
    if name_and_version is None:
        return None
    return name_and_version[0]
software = property(_get_software)
def _get_version(self):
    """
    Return version number of software package

    The version is the second item of what _get_software_version() returns;
    None is passed through when that call returns None.
    """
    name_and_version = self._get_software_version()
    if name_and_version is None:
        return None
    return name_and_version[1]
version = property(_get_version)
156 class CASAVA(Alignment):
159 def _get_runfolder_name(self):
160 if self.tree is None:
163 # hiseqs renamed the experiment dir location
164 defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
165 _, experiment_dir = os.path.split(defaults_expt_dir)
167 if experiment_dir is None or len(experiment_dir) == 0:
169 return experiment_dir
171 runfolder_name = property(_get_runfolder_name)
173 def _get_software_version(self):
174 if self.tree is None:
176 if self.tree is None:
178 hiseq_software_node = self.tree.find('Software')
179 software_version = hiseq_software_node.attrib.get('Version',None)
180 if software_version is None:
182 return software_version.split('-')
184 def _get_software(self):
185 software_version = self._get_software_version()
186 if software_version is None:
188 return software_version[0]
189 software = property(_get_software)
191 def _get_version(self):
192 software_version = self._get_software_version()
193 if software_version is None:
195 return software_version[1]
196 version = property(_get_version)
class LaneParameters(object):
    """
    Make it easy to access elements of LaneSpecificRunParameters from python

    This is the abstract base: it only stores the owning gerald object and
    the lane id.  Each property getter raises NotImplementedError and must
    be overridden (together with its property) by a concrete subclass.
    """
    def __init__(self, gerald, lane_id):
        """
        Remember which gerald object and which lane this instance describes.
        """
        self._gerald = gerald
        self._lane_id = lane_id

    # NotImplemented is a comparison sentinel and is not callable, so
    # raising NotImplemented("...") fails with TypeError.  The exception
    # type for abstract members is NotImplementedError.

    def _get_analysis(self):
        """Abstract getter for the analysis property."""
        raise NotImplementedError("abstract class")
    analysis = property(_get_analysis)

    def _get_eland_genome(self):
        """Abstract getter for the eland_genome property."""
        raise NotImplementedError("abstract class")
    eland_genome = property(_get_eland_genome)

    def _get_read_length(self):
        """Abstract getter for the read_length property."""
        raise NotImplementedError("abstract class")
    read_length = property(_get_read_length)

    def _get_use_bases(self):
        """Abstract getter for the use_bases property."""
        raise NotImplementedError("abstract class")
    use_bases = property(_get_use_bases)
224 class LaneParametersGA(LaneParameters):
226 Make it easy to access elements of LaneSpecificRunParameters from python
228 def __init__(self, gerald, lane_id):
229 super(LaneParametersGA, self).__init__(gerald, lane_id)
231 def __get_attribute(self, xml_tag):
232 subtree = self._gerald.tree.find('LaneSpecificRunParameters')
233 container = subtree.find(xml_tag)
234 if container is None:
236 if len(container.getchildren()) > LANES_PER_FLOWCELL:
237 raise RuntimeError('GERALD config.xml file changed')
238 lanes = [x.tag.split('_')[1] for x in container.getchildren()]
240 index = lanes.index(self._lane_id)
241 except ValueError, e:
243 element = container[index]
245 def _get_analysis(self):
246 return self.__get_attribute('ANALYSIS')
247 analysis = property(_get_analysis)
249 def _get_eland_genome(self):
250 genome = self.__get_attribute('ELAND_GENOME')
251 # default to the chipwide parameters if there isn't an
252 # entry in the lane specific paramaters
254 genome = self._gerald._get_chip_attribute('ELAND_GENOME')
256 if genome == 'Need_to_specify_ELAND_genome_directory':
259 eland_genome = property(_get_eland_genome)
261 def _get_read_length(self):
262 read_length = self.__get_attribute('READ_LENGTH')
263 if read_length is None:
264 read_length = self._gerald._get_chip_attribute('READ_LENGTH')
266 read_length = property(_get_read_length)
268 def _get_use_bases(self):
269 return self.__get_attribute('USE_BASES')
270 use_bases = property(_get_use_bases)
273 class LaneParametersHiSeq(LaneParameters):
275 Make it easy to access elements of LaneSpecificRunParameters from python
277 def __init__(self, gerald, lane_id, element):
278 super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
279 self.element = element
281 def __get_attribute(self, xml_tag):
282 container = self.element.find(xml_tag)
283 if container is None:
285 return container.text
287 def _get_analysis(self):
288 return self.__get_attribute('ANALYSIS')
289 analysis = property(_get_analysis)
291 def _get_eland_genome(self):
292 genome = self.__get_attribute('ELAND_GENOME')
293 # default to the chipwide parameters if there isn't an
294 # entry in the lane specific paramaters
296 genome = self._gerald._get_chip_attribute('ELAND_GENOME')
298 if genome == 'Need_to_specify_ELAND_genome_directory':
301 eland_genome = property(_get_eland_genome)
303 def _get_read_length(self):
304 return self.__get_attribute('READ_LENGTH1')
305 read_length = property(_get_read_length)
307 def _get_use_bases(self):
308 return self.__get_attribute('USE_BASES1')
309 use_bases = property(_get_use_bases)
311 class LaneSpecificRunParameters(object):
313 Provide access to LaneSpecificRunParameters
315 def __init__(self, gerald):
316 self._gerald = gerald
319 def _initalize_lanes(self):
321 build dictionary of LaneParameters
324 tree = self._gerald.tree
325 analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
326 if analysis is not None:
327 self._extract_ga_analysis_type(analysis)
328 analysis = tree.find('Projects')
329 if analysis is not None:
330 self._extract_hiseq_analysis_type(analysis)
def _extract_ga_analysis_type(self, analysis):
    """
    Register one LaneParametersGA per child element of analysis.

    Each child tag is split on '_' into exactly two pieces; the second
    piece is used as the lane id.  The entry is stored in self._lanes
    under the integer form of that id, while the LaneParametersGA object
    itself receives the id as the original string.
    """
    # according to the pipeline specs I think their fields
    # are sampleName_laneID, with sampleName defaulting to s
    # since laneIDs are constant lets just try using
    # those consistently.
    for child in analysis:
        _sample_name, lane_text = child.tag.split('_')
        lane_params = LaneParametersGA(self._gerald, lane_text)
        self._lanes[int(lane_text)] = lane_params
342 def _extract_hiseq_analysis_type(self, analysis):
343 """Extract from HiSeq style multiplexed analysis types"""
344 for element in analysis:
345 name = element.attrib['name']
346 self._lanes[name] = LaneParametersHiSeq(self._gerald,
351 return self._lanes.iterkeys()
def __getitem__(self, key):
    """
    Return the lane parameters stored under key.

    _initalize_lanes() is run first whenever self._lane is None.  Lookup
    errors from self._lanes propagate unchanged.
    """
    # NOTE(review): the guard reads ``_lane`` while the data lives in
    # ``_lanes`` -- confirm which attribute __init__ actually sets.
    needs_setup = self._lane is None
    if needs_setup:
        self._initalize_lanes()
    lanes = self._lanes
    return lanes[key]
def get(self, key, default=None):
    """
    Return the lane parameters stored under key, or default if absent.

    _initalize_lanes() is run first whenever self._lane is None.
    default is optional and falls back to None, mirroring dict.get().
    """
    # NOTE(review): the guard reads ``_lane`` while the data lives in
    # ``_lanes`` -- confirm which attribute __init__ actually sets.
    if self._lane is None:
        self._initalize_lanes()
    # Hand the caller's default through; it used to be replaced by None.
    return self._lanes.get(key, default)
361 if self._lane is None:
362 self._initalize_lanes()
363 return self._lanes.keys()
365 if self._lane is None:
366 self._initalize_lanes()
367 return self._lanes.values()
369 if self._lane is None:
370 self._initalize_lanes()
371 return self._lanes.items()
373 if self._lane is None:
374 self._initalize_lanes()
375 return len(self._lanes)
378 def gerald(pathname):
379 LOGGER.info("Parsing gerald config.xml")
380 pathname = os.path.expanduser(pathname)
381 config_pathname = os.path.join(pathname, 'config.xml')
382 config_tree = ElementTree.parse(config_pathname).getroot()
384 # parse Summary.htm file
385 summary_xml = os.path.join(pathname, 'Summary.xml')
386 summary_htm = os.path.join(pathname, 'Summary.htm')
387 report_summary = os.path.join(pathname, '..', 'Data',
388 'reports', 'Summary', )
389 if os.path.exists(summary_xml):
390 g = Gerald(pathname = pathname, tree=config_tree)
391 LOGGER.info("Parsing Summary.xml")
392 g.summary = SummaryGA(summary_xml)
393 g.eland_results = eland(g.pathname, g)
394 elif os.path.exists(summary_htm):
395 g = Gerald(pathname=pathname, tree=config_tree)
396 LOGGER.info("Parsing Summary.htm")
397 g.summary = SummaryGA(summary_htm)
398 g.eland_results = eland(g.pathname, g)
399 elif os.path.isdir(report_summary):
400 g = CASAVA(pathname=pathname, tree=config_tree)
401 LOGGER.info("Parsing %s" % (report_summary,))
402 g.summary = SummaryHiSeq(report_summary)
407 if __name__ == "__main__":
410 g = gerald(sys.argv[1])
411 #ElementTree.dump(g.get_elements())