Split lane parameters into separate classes for GA & HiSeq config files.
author Diane Trout <diane@caltech.edu>
Sat, 2 Jun 2012 00:20:24 +0000 (17:20 -0700)
committer Diane Trout <diane@caltech.edu>
Sat, 2 Jun 2012 00:20:24 +0000 (17:20 -0700)
Also tweak the tests for a different example flowcell

htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/test/test_runfolder_rta1_12.py
htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml

index e13ac2ab22354a0a4d026b2e2e3d303e1f4214f9..087edd43a167a7ef8a9de585bfbfce72de42785b 100644 (file)
@@ -28,112 +28,12 @@ class Gerald(object):
     RUN_PARAMETERS='RunParameters'
     SUMMARY='Summary'
 
-    class LaneParameters(object):
-        """
-        Make it easy to access elements of LaneSpecificRunParameters from python
-        """
-        def __init__(self, gerald, lane_id):
-            self._gerald = gerald
-            self._lane_id = lane_id
-
-        def __get_attribute(self, xml_tag):
-            subtree = self._gerald.tree.find('LaneSpecificRunParameters')
-            container = subtree.find(xml_tag)
-            if container is None:
-                return None
-            if len(container.getchildren()) > LANES_PER_FLOWCELL:
-                raise RuntimeError('GERALD config.xml file changed')
-            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
-            try:
-                index = lanes.index(self._lane_id)
-            except ValueError, e:
-                return None
-            element = container[index]
-            return element.text
-        def _get_analysis(self):
-            return self.__get_attribute('ANALYSIS')
-        analysis = property(_get_analysis)
-
-        def _get_eland_genome(self):
-            genome = self.__get_attribute('ELAND_GENOME')
-            # default to the chipwide parameters if there isn't an
-            # entry in the lane specific paramaters
-            if genome is None:
-                genome = self._gerald._get_chip_attribute('ELAND_GENOME')
-            # ignore flag value
-            if genome == 'Need_to_specify_ELAND_genome_directory':
-                genome = None
-            return genome
-        eland_genome = property(_get_eland_genome)
-
-        def _get_read_length(self):
-            read_length = self.__get_attribute('READ_LENGTH')
-            if read_length is None:
-                read_length = self._gerald._get_chip_attribute('READ_LENGTH')
-            return read_length
-        read_length = property(_get_read_length)
-
-        def _get_use_bases(self):
-            return self.__get_attribute('USE_BASES')
-        use_bases = property(_get_use_bases)
-
-    class LaneSpecificRunParameters(object):
-        """
-        Provide access to LaneSpecificRunParameters
-        """
-        def __init__(self, gerald):
-            self._gerald = gerald
-            self._lane = None
-
-        def _initalize_lanes(self):
-            """
-            build dictionary of LaneParameters
-            """
-            self._lanes = {}
-            tree = self._gerald.tree
-            analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
-            if analysis is None:
-                return
-            # according to the pipeline specs I think their fields
-            # are sampleName_laneID, with sampleName defaulting to s
-            # since laneIDs are constant lets just try using
-            # those consistently.
-            for element in analysis:
-                sample, lane_id = element.tag.split('_')
-                self._lanes[int(lane_id)] = Gerald.LaneParameters(
-                                              self._gerald, lane_id)
-
-        def __getitem__(self, key):
-            if self._lane is None:
-                self._initalize_lanes()
-            return self._lanes[key]
-        def get(self, key, default):
-            if self._lane is None:
-                self._initalize_lanes()
-            return self._lanes.get(key, None)
-        def keys(self):
-            if self._lane is None:
-                self._initalize_lanes()
-            return self._lanes.keys()
-        def values(self):
-            if self._lane is None:
-                self._initalize_lanes()
-            return self._lanes.values()
-        def items(self):
-            if self._lane is None:
-                self._initalize_lanes()
-            return self._lanes.items()
-        def __len__(self):
-            if self._lane is None:
-                self._initalize_lanes()
-            return len(self._lanes)
-
     def __init__(self, xml=None):
         self.pathname = None
         self.tree = None
 
         # parse lane parameters out of the config.xml file
-        self.lanes = Gerald.LaneSpecificRunParameters(self)
+        self.lanes = LaneSpecificRunParameters(self)
 
         self.summary = None
         self.eland_results = None
@@ -240,6 +140,186 @@ class Gerald(object):
             else:
                 LOGGER.warn("Unrecognized tag %s" % (element.tag,))
 
+
+class LaneParameters(object):
+    """
+    Abstract interface to the run parameters of a single lane.
+
+    Subclasses supply the ``analysis``, ``eland_genome``, ``read_length``
+    and ``use_bases`` properties; on this base class every one of them
+    raises NotImplementedError.
+    """
+    def __init__(self, gerald, lane_id):
+        """
+        Remember where the parameters come from.
+
+        :param gerald: object handed in by the caller; stored unchanged
+                       for the subclasses to query
+        :param lane_id: identifier of the lane these parameters describe
+        """
+        self._gerald = gerald
+        self._lane_id = lane_id
+
+    # NOTE: NotImplemented is a non-callable constant meant for rich
+    # comparisons; calling it raises TypeError.  NotImplementedError is
+    # the exception that marks an abstract method.
+    def _get_analysis(self):
+        """Abstract getter backing the ``analysis`` property."""
+        raise NotImplementedError("abstract class")
+    analysis = property(_get_analysis)
+
+    def _get_eland_genome(self):
+        """Abstract getter backing the ``eland_genome`` property."""
+        raise NotImplementedError("abstract class")
+    eland_genome = property(_get_eland_genome)
+
+    def _get_read_length(self):
+        """Abstract getter backing the ``read_length`` property."""
+        raise NotImplementedError("abstract class")
+    read_length = property(_get_read_length)
+
+    def _get_use_bases(self):
+        """Abstract getter backing the ``use_bases`` property."""
+        raise NotImplementedError("abstract class")
+    use_bases = property(_get_use_bases)
+
+
+class LaneParametersGA(LaneParameters):
+    """
+    Lane parameters read from the LaneSpecificRunParameters section of
+    the GERALD config tree (GA style config files).
+    """
+    def __init__(self, gerald, lane_id):
+        """
+        :param gerald: object exposing ``tree`` (the parsed config) and
+                       ``_get_chip_attribute``
+        :param lane_id: lane identifier, compared against the suffix of
+                        each child tag (presumably ``sample_laneID``)
+        """
+        super(LaneParametersGA, self).__init__(gerald, lane_id)
+
+    def __get_attribute(self, xml_tag):
+        """
+        Return the text of this lane's entry under ``xml_tag``.
+
+        Returns None when the section, the tag, or an entry for this
+        lane is missing.  Raises RuntimeError when the tag holds more
+        than LANES_PER_FLOWCELL entries.
+        """
+        subtree = self._gerald.tree.find('LaneSpecificRunParameters')
+        if subtree is None:
+            return None
+        container = subtree.find(xml_tag)
+        if container is None:
+            return None
+        # list(element) replaces Element.getchildren(), which is
+        # deprecated and no longer exists in Python 3.9+
+        children = list(container)
+        if len(children) > LANES_PER_FLOWCELL:
+            raise RuntimeError('GERALD config.xml file changed')
+        for child in children:
+            # the lane id is whatever follows the last underscore
+            if child.tag.rsplit('_', 1)[-1] == self._lane_id:
+                return child.text
+        return None
+
+    def _get_analysis(self):
+        """Return the lane's ANALYSIS entry, or None."""
+        return self.__get_attribute('ANALYSIS')
+    analysis = property(_get_analysis)
+
+    def _get_eland_genome(self):
+        """Return the lane's ELAND genome, or None when unspecified."""
+        genome = self.__get_attribute('ELAND_GENOME')
+        # default to the chipwide parameters if there isn't an
+        # entry in the lane specific parameters
+        if genome is None:
+            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
+        # ignore flag value
+        if genome == 'Need_to_specify_ELAND_genome_directory':
+            genome = None
+        return genome
+    eland_genome = property(_get_eland_genome)
+
+    def _get_read_length(self):
+        """Return the lane's READ_LENGTH, else the chipwide value."""
+        read_length = self.__get_attribute('READ_LENGTH')
+        if read_length is None:
+            read_length = self._gerald._get_chip_attribute('READ_LENGTH')
+        return read_length
+    read_length = property(_get_read_length)
+
+    def _get_use_bases(self):
+        """Return the lane's USE_BASES entry, or None."""
+        return self.__get_attribute('USE_BASES')
+    use_bases = property(_get_use_bases)
+
+
+class LaneParametersHiSeq(LaneParameters):
+    """
+    Lane parameters backed by a single XML element (HiSeq style config
+    files); the element is one of the children of ``Projects``.
+    """
+    def __init__(self, gerald, lane_id, element):
+        """
+        :param gerald: object exposing ``_get_chip_attribute``
+        :param lane_id: identifier stored by the base class
+        :param element: XML element whose children hold the parameters
+        """
+        super(LaneParametersHiSeq, self).__init__(gerald, lane_id)
+        self.element = element
+
+    def __get_attribute(self, xml_tag):
+        """Return the text of the ``xml_tag`` child, or None if absent."""
+        node = self.element.find(xml_tag)
+        return None if node is None else node.text
+
+    def _get_analysis(self):
+        """Return the ANALYSIS entry, or None."""
+        return self.__get_attribute('ANALYSIS')
+    analysis = property(_get_analysis)
+
+    def _get_eland_genome(self):
+        """Return the ELAND genome, or None when unspecified."""
+        genome = self.__get_attribute('ELAND_GENOME')
+        if genome is None:
+            # nothing on this element: use the chipwide setting
+            genome = self._gerald._get_chip_attribute('ELAND_GENOME')
+        if genome == 'Need_to_specify_ELAND_genome_directory':
+            # placeholder flag value, not a real genome path
+            return None
+        return genome
+    eland_genome = property(_get_eland_genome)
+
+    def _get_read_length(self):
+        """Return the READ_LENGTH1 entry, or None."""
+        return self.__get_attribute('READ_LENGTH1')
+    read_length = property(_get_read_length)
+
+    def _get_use_bases(self):
+        """Return the USE_BASES1 entry, or None."""
+        return self.__get_attribute('USE_BASES1')
+    use_bases = property(_get_use_bases)
+
+class LaneSpecificRunParameters(object):
+    """
+    Dictionary-like, lazily built view of the per-lane run parameters.
+
+    Entries found under LaneSpecificRunParameters/ANALYSIS are keyed by
+    integer lane number; entries found under Projects are keyed by the
+    value of their ``name`` attribute.
+    """
+    def __init__(self, gerald):
+        """
+        :param gerald: object exposing ``tree``, the parsed config.  The
+                       tree is only read on first access, so it may be
+                       attached after this object is created.
+        """
+        self._gerald = gerald
+        # parsed lanes, and the tree they were parsed from; the cache is
+        # rebuilt when the owner's tree is replaced
+        self._lanes = None
+        self._parsed_tree = None
+
+    def _initalize_lanes(self):
+        """
+        build dictionary of LaneParameters
+        """
+        tree = self._gerald.tree
+        # look up before touching the cache so a missing tree fails
+        # without leaving a half-built dictionary behind
+        ga_analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+        projects = tree.find('Projects')
+        self._lanes = {}
+        if ga_analysis is not None:
+            self._extract_ga_analysis_type(ga_analysis)
+        if projects is not None:
+            self._extract_hiseq_analysis_type(projects)
+        # only mark the cache valid once parsing finished cleanly
+        self._parsed_tree = tree
+
+    def _extract_ga_analysis_type(self, analysis):
+        """Extract from GA style per-lane analysis entries"""
+        # according to the pipeline specs I think their fields
+        # are sampleName_laneID, with sampleName defaulting to s
+        # since laneIDs are constant lets just try using
+        # those consistently.
+        for element in analysis:
+            # split on the last underscore only, so a sample name that
+            # itself contains '_' does not break the unpacking
+            lane_id = element.tag.rsplit('_', 1)[-1]
+            self._lanes[int(lane_id)] = LaneParametersGA(
+                                          self._gerald, lane_id)
+
+    def _extract_hiseq_analysis_type(self, analysis):
+        """Extract from HiSeq style multiplexed analysis types"""
+        for element in analysis:
+            name = element.attrib['name']
+            self._lanes[name] = LaneParametersHiSeq(self._gerald,
+                                                    name,
+                                                    element)
+
+    def _current_lanes(self):
+        """Return the lane dictionary, parsing the tree when needed."""
+        if (self._lanes is None
+                or self._parsed_tree is not self._gerald.tree):
+            self._initalize_lanes()
+        return self._lanes
+
+    def __iter__(self):
+        """Iterate over the lane keys."""
+        return iter(self._current_lanes())
+
+    def __getitem__(self, key):
+        """Return the parameters for ``key``; raises KeyError if unknown."""
+        return self._current_lanes()[key]
+
+    def get(self, key, default=None):
+        """Return the parameters for ``key``, or ``default`` if unknown."""
+        return self._current_lanes().get(key, default)
+
+    def keys(self):
+        """Return the lane keys."""
+        return self._current_lanes().keys()
+
+    def values(self):
+        """Return the lane parameter objects."""
+        return self._current_lanes().values()
+
+    def items(self):
+        """Return the (key, lane parameters) pairs."""
+        return self._current_lanes().items()
+
+    def __len__(self):
+        """Return the number of lanes found."""
+        return len(self._current_lanes())
+
+
 def gerald(pathname):
     g = Gerald()
     g.pathname = os.path.expanduser(pathname)
index ed225bfd36b9ca4dbf5cf618e04eddd50d3af1af..9b7d3b9169b1cb0a66e70f614222a1b8795dfb8e 100644 (file)
@@ -95,38 +95,28 @@ class RunfolderTests(unittest.TestCase):
         # make_gerald_config.
         # the first None is to offset the genomes list to be 1..9
         # instead of pythons default 0..8
-        genomes = [None,
-                   '/g/mm9',
-                   '/g/mm9',
-                   '/g/elegans190',
-                   '/g/arabidopsis01222004',
-                   '/g/mm9',
-                   '/g/mm9',
-                   '/g/mm9',
-                   '/g/mm9', ]
-
         # test lane specific parameters from gerald config file
-        for i in range(1,9):
-            cur_lane = g.lanes[i]
-            self.failUnlessEqual(cur_lane.analysis, 'eland_extended')
-            self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
-            self.failUnlessEqual(cur_lane.read_length, '37')
-            self.failUnlessEqual(cur_lane.use_bases, 'Y'*37)
-
-        # I want to be able to use a simple iterator
-        for l in g.lanes.values():
-          self.failUnlessEqual(l.analysis, 'eland_extended')
-          self.failUnlessEqual(l.read_length, '37')
-          self.failUnlessEqual(l.use_bases, 'Y'*37)
+
+        undetermined = g.lanes['Undetermined_indices']
+        self.failUnlessEqual(undetermined.analysis, 'none')
+        self.failUnlessEqual(undetermined.read_length, None)
+        self.failUnlessEqual(undetermined.use_bases, None)
+
+        project = g.lanes['12383']
+        self.failUnlessEqual(project.analysis, 'eland_extended')
+        self.failUnlessEqual(project.eland_genome, '/g/hg18/chromosomes/')
+        self.failUnlessEqual(project.read_length, '49')
+        self.failUnlessEqual(project.use_bases, 'y'*49+'n')
 
         # test data extracted from summary file
         clusters = [None,
-                    (281331, 11169), (203841, 13513),
-                    (220889, 15653), (137294, 14666),
-                    (129388, 14525), (262092, 10751),
-                    (185754, 13503), (233765, 9537),]
+                    (3878755,  579626.0), (3920639, 1027332.4),
+                    (5713049,  876187.3), (5852907,  538640.6),
+                    (4006751, 1265247.4), (5678021,  627070.7),
+                    (1854131,  429053.2), (4777517,  592904.0),
+                   ]
 
-        self.failUnlessEqual(len(g.summary), 1)
+        self.failUnlessEqual(len(g.summary), 2)
         for i in range(1,9):
             summary_lane = g.summary[0][i]
             self.failUnlessEqual(summary_lane.cluster, clusters[i])
index 2b7f3afc1557be7ed4dcd4522d881764ff64409f..18b571470ea56e27a886dcd903481388bfdb8206 100644 (file)
     <Project name="12383">
       <ANALYSIS>eland_extended</ANALYSIS>
       <CHROM_NAME_SOURCE>fileName</CHROM_NAME_SOURCE>
-      <ELAND_GENOME>/mmjggl/nicodemus/data01/genomes/hg18/chromosomes/</ELAND_GENOME>
+      <ELAND_GENOME>/g/hg18/chromosomes/</ELAND_GENOME>
       <ELAND_GENOME_MASK>*.fa</ELAND_GENOME_MASK>
       <ELAND_SEED_LENGTH1>32</ELAND_SEED_LENGTH1>
       <READS>1</READS>