From: Diane Trout Date: Sat, 2 Jun 2012 00:20:24 +0000 (-0700) Subject: Split lane parameters into separate classes for GA & HiSeq config files. X-Git-Tag: v0.5.5~20 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=93f27d07d8fedc44b6e05f037d52f7f63eb748aa Split lane parameters into separate classes for GA & HiSeq config files. Also tweak the tests for a different example flowcell --- diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index e13ac2a..087edd4 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -28,112 +28,12 @@ class Gerald(object): RUN_PARAMETERS='RunParameters' SUMMARY='Summary' - class LaneParameters(object): - """ - Make it easy to access elements of LaneSpecificRunParameters from python - """ - def __init__(self, gerald, lane_id): - self._gerald = gerald - self._lane_id = lane_id - - def __get_attribute(self, xml_tag): - subtree = self._gerald.tree.find('LaneSpecificRunParameters') - container = subtree.find(xml_tag) - if container is None: - return None - if len(container.getchildren()) > LANES_PER_FLOWCELL: - raise RuntimeError('GERALD config.xml file changed') - lanes = [x.tag.split('_')[1] for x in container.getchildren()] - try: - index = lanes.index(self._lane_id) - except ValueError, e: - return None - element = container[index] - return element.text - def _get_analysis(self): - return self.__get_attribute('ANALYSIS') - analysis = property(_get_analysis) - - def _get_eland_genome(self): - genome = self.__get_attribute('ELAND_GENOME') - # default to the chipwide parameters if there isn't an - # entry in the lane specific paramaters - if genome is None: - genome = self._gerald._get_chip_attribute('ELAND_GENOME') - # ignore flag value - if genome == 'Need_to_specify_ELAND_genome_directory': - genome = None - return genome - eland_genome = property(_get_eland_genome) - - def _get_read_length(self): - read_length = 
self.__get_attribute('READ_LENGTH') - if read_length is None: - read_length = self._gerald._get_chip_attribute('READ_LENGTH') - return read_length - read_length = property(_get_read_length) - - def _get_use_bases(self): - return self.__get_attribute('USE_BASES') - use_bases = property(_get_use_bases) - - class LaneSpecificRunParameters(object): - """ - Provide access to LaneSpecificRunParameters - """ - def __init__(self, gerald): - self._gerald = gerald - self._lane = None - - def _initalize_lanes(self): - """ - build dictionary of LaneParameters - """ - self._lanes = {} - tree = self._gerald.tree - analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') - if analysis is None: - return - # according to the pipeline specs I think their fields - # are sampleName_laneID, with sampleName defaulting to s - # since laneIDs are constant lets just try using - # those consistently. - for element in analysis: - sample, lane_id = element.tag.split('_') - self._lanes[int(lane_id)] = Gerald.LaneParameters( - self._gerald, lane_id) - - def __getitem__(self, key): - if self._lane is None: - self._initalize_lanes() - return self._lanes[key] - def get(self, key, default): - if self._lane is None: - self._initalize_lanes() - return self._lanes.get(key, None) - def keys(self): - if self._lane is None: - self._initalize_lanes() - return self._lanes.keys() - def values(self): - if self._lane is None: - self._initalize_lanes() - return self._lanes.values() - def items(self): - if self._lane is None: - self._initalize_lanes() - return self._lanes.items() - def __len__(self): - if self._lane is None: - self._initalize_lanes() - return len(self._lanes) - def __init__(self, xml=None): self.pathname = None self.tree = None # parse lane parameters out of the config.xml file - self.lanes = Gerald.LaneSpecificRunParameters(self) + self.lanes = LaneSpecificRunParameters(self) self.summary = None self.eland_results = None @@ -240,6 +140,186 @@ class Gerald(object): else: 
LOGGER.warn("Unrecognized tag %s" % (element.tag,)) + +class LaneParameters(object): + """ + Make it easy to access elements of LaneSpecificRunParameters from python + """ + def __init__(self, gerald, lane_id): + self._gerald = gerald + self._lane_id = lane_id + + def _get_analysis(self): + raise NotImplemented("abstract class") + analysis = property(_get_analysis) + + def _get_eland_genome(self): + raise NotImplemented("abstract class") + eland_genome = property(_get_eland_genome) + + def _get_read_length(self): + raise NotImplemented("abstract class") + read_length = property(_get_read_length) + + def _get_use_bases(self): + raise NotImplemented("abstract class") + use_bases = property(_get_use_bases) + + +class LaneParametersGA(LaneParameters): + """ + Make it easy to access elements of LaneSpecificRunParameters from python + """ + def __init__(self, gerald, lane_id): + super(LaneParametersGA, self).__init__(gerald, lane_id) + + def __get_attribute(self, xml_tag): + subtree = self._gerald.tree.find('LaneSpecificRunParameters') + container = subtree.find(xml_tag) + if container is None: + return None + if len(container.getchildren()) > LANES_PER_FLOWCELL: + raise RuntimeError('GERALD config.xml file changed') + lanes = [x.tag.split('_')[1] for x in container.getchildren()] + try: + index = lanes.index(self._lane_id) + except ValueError, e: + return None + element = container[index] + return element.text + def _get_analysis(self): + return self.__get_attribute('ANALYSIS') + analysis = property(_get_analysis) + + def _get_eland_genome(self): + genome = self.__get_attribute('ELAND_GENOME') + # default to the chipwide parameters if there isn't an + # entry in the lane specific paramaters + if genome is None: + genome = self._gerald._get_chip_attribute('ELAND_GENOME') + # ignore flag value + if genome == 'Need_to_specify_ELAND_genome_directory': + genome = None + return genome + eland_genome = property(_get_eland_genome) + + def _get_read_length(self): + read_length = 
self.__get_attribute('READ_LENGTH') + if read_length is None: + read_length = self._gerald._get_chip_attribute('READ_LENGTH') + return read_length + read_length = property(_get_read_length) + + def _get_use_bases(self): + return self.__get_attribute('USE_BASES') + use_bases = property(_get_use_bases) + + +class LaneParametersHiSeq(LaneParameters): + """ + Make it easy to access elements of LaneSpecificRunParameters from python + """ + def __init__(self, gerald, lane_id, element): + super(LaneParametersHiSeq, self).__init__(gerald, lane_id) + self.element = element + + def __get_attribute(self, xml_tag): + container = self.element.find(xml_tag) + if container is None: + return None + return container.text + + def _get_analysis(self): + return self.__get_attribute('ANALYSIS') + analysis = property(_get_analysis) + + def _get_eland_genome(self): + genome = self.__get_attribute('ELAND_GENOME') + # default to the chipwide parameters if there isn't an + # entry in the lane specific paramaters + if genome is None: + genome = self._gerald._get_chip_attribute('ELAND_GENOME') + # ignore flag value + if genome == 'Need_to_specify_ELAND_genome_directory': + genome = None + return genome + eland_genome = property(_get_eland_genome) + + def _get_read_length(self): + return self.__get_attribute('READ_LENGTH1') + read_length = property(_get_read_length) + + def _get_use_bases(self): + return self.__get_attribute('USE_BASES1') + use_bases = property(_get_use_bases) + +class LaneSpecificRunParameters(object): + """ + Provide access to LaneSpecificRunParameters + """ + def __init__(self, gerald): + self._gerald = gerald + self._lane = None + + def _initalize_lanes(self): + """ + build dictionary of LaneParameters + """ + self._lanes = {} + tree = self._gerald.tree + analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') + if analysis is not None: + self._extract_ga_analysis_type(analysis) + analysis = tree.find('Projects') + if analysis is not None: + 
self._extract_hiseq_analysis_type(analysis) + + def _extract_ga_analysis_type(self, analysis): + # according to the pipeline specs I think their fields + # are sampleName_laneID, with sampleName defaulting to s + # since laneIDs are constant lets just try using + # those consistently. + for element in analysis: + sample, lane_id = element.tag.split('_') + self._lanes[int(lane_id)] = LaneParametersGA( + self._gerald, lane_id) + + def _extract_hiseq_analysis_type(self, analysis): + """Extract from HiSeq style multiplexed analysis types""" + for element in analysis: + name = element.attrib['name'] + self._lanes[name] = LaneParametersHiSeq(self._gerald, + name, + element) + + def __iter__(self): + return self._lanes.iterkeys() + def __getitem__(self, key): + if self._lane is None: + self._initalize_lanes() + return self._lanes[key] + def get(self, key, default): + if self._lane is None: + self._initalize_lanes() + return self._lanes.get(key, None) + def keys(self): + if self._lane is None: + self._initalize_lanes() + return self._lanes.keys() + def values(self): + if self._lane is None: + self._initalize_lanes() + return self._lanes.values() + def items(self): + if self._lane is None: + self._initalize_lanes() + return self._lanes.items() + def __len__(self): + if self._lane is None: + self._initalize_lanes() + return len(self._lanes) + + def gerald(pathname): g = Gerald() g.pathname = os.path.expanduser(pathname) diff --git a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py index ed225bf..9b7d3b9 100644 --- a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py +++ b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py @@ -95,38 +95,28 @@ class RunfolderTests(unittest.TestCase): # make_gerald_config. 
# the first None is to offset the genomes list to be 1..9 # instead of pythons default 0..8 - genomes = [None, - '/g/mm9', - '/g/mm9', - '/g/elegans190', - '/g/arabidopsis01222004', - '/g/mm9', - '/g/mm9', - '/g/mm9', - '/g/mm9', ] - # test lane specific parameters from gerald config file - for i in range(1,9): - cur_lane = g.lanes[i] - self.failUnlessEqual(cur_lane.analysis, 'eland_extended') - self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) - self.failUnlessEqual(cur_lane.read_length, '37') - self.failUnlessEqual(cur_lane.use_bases, 'Y'*37) - - # I want to be able to use a simple iterator - for l in g.lanes.values(): - self.failUnlessEqual(l.analysis, 'eland_extended') - self.failUnlessEqual(l.read_length, '37') - self.failUnlessEqual(l.use_bases, 'Y'*37) + + undetermined = g.lanes['Undetermined_indices'] + self.failUnlessEqual(undetermined.analysis, 'none') + self.failUnlessEqual(undetermined.read_length, None) + self.failUnlessEqual(undetermined.use_bases, None) + + project = g.lanes['12383'] + self.failUnlessEqual(project.analysis, 'eland_extended') + self.failUnlessEqual(project.eland_genome, '/g/hg18/chromosomes/') + self.failUnlessEqual(project.read_length, '49') + self.failUnlessEqual(project.use_bases, 'y'*49+'n') # test data extracted from summary file clusters = [None, - (281331, 11169), (203841, 13513), - (220889, 15653), (137294, 14666), - (129388, 14525), (262092, 10751), - (185754, 13503), (233765, 9537),] + (3878755, 579626.0), (3920639, 1027332.4), + (5713049, 876187.3), (5852907, 538640.6), + (4006751, 1265247.4), (5678021, 627070.7), + (1854131, 429053.2), (4777517, 592904.0), + ] - self.failUnlessEqual(len(g.summary), 1) + self.failUnlessEqual(len(g.summary), 2) for i in range(1,9): summary_lane = g.summary[0][i] self.failUnlessEqual(summary_lane.cluster, clusters[i]) diff --git a/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml b/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml index 2b7f3af..18b5714 
100644 --- a/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml +++ b/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml @@ -195,7 +195,7 @@ eland_extended fileName - /mmjggl/nicodemus/data01/genomes/hg18/chromosomes/ + /g/hg18/chromosomes/ *.fa 32 1