Fix bugs introduced by the improved HiSeq runfolder scanning.
author Diane Trout <diane@caltech.edu>
Tue, 15 May 2012 22:10:08 +0000 (15:10 -0700)
committer Diane Trout <diane@caltech.edu>
Tue, 15 May 2012 22:10:08 +0000 (15:10 -0700)
This makes more progress toward analyzing a HiSeq runfolder, but
is currently lacking the ability to process the Aligned reads.

This does now seem to extract the base call processing information
from the Unaligned tree of HiSeq runs.

Also, all the tests for the previous runfolder versions pass again. (Some
of the logic I introduced was a bit off for guessing what type of
runfolder it is.)

htsworkflow/pipelines/bustard.py
htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/test/simulate_runfolder.py
htsworkflow/pipelines/test/test_runfolder_rta1_12.py
htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm [new file with mode: 0755]

index 5ebde3901cea490dedb3c1d4067d1e2f01edab8b..acdc60ea7a511c3a30386beaf0c9b6d10e777f0c 100644 (file)
@@ -219,6 +219,25 @@ class Bustard(object):
         if xml is not None:
             self.set_elements(xml)
 
+    def update_attributes_from_pathname(self):
+        """Update version, date, user from bustard directory names
+        Obviously this wont work for BaseCalls or later runfolders
+        """
+        if self.pathname is None:
+            raise ValueError(
+                "Set pathname before calling update_attributes_from_pathname")
+        path, name = os.path.split(self.pathname)
+
+        if not re.match('bustard', name, re.IGNORECASE):
+            return
+
+        groups = name.split("_")
+        version = re.search(VERSION_RE, groups[0])
+        self.version = version.group(1)
+        t = time.strptime(groups[1], EUROPEAN_STRPTIME)
+        self.date = date(*t[0:3])
+        self.user = groups[2]
+
     def _get_time(self):
         if self.date is None:
             return None
@@ -290,47 +309,92 @@ def bustard(pathname):
     :Return:
       Fully initialized Bustard object.
     """
-    b = Bustard()
     pathname = os.path.abspath(pathname)
+    bustard_filename = os.path.join(pathname, 'config.xml')
+    demultiplexed_filename = os.path.join(pathname,
+                                          'DemultiplexedBustardConfig.xml')
+
+    if os.path.exists(demultiplexed_filename):
+        b = bustard_from_hiseq(pathname, demultiplexed_filename)
+    elif os.path.exists(bustard_filename):
+        b = bustard_from_ga2(pathname, bustard_filename)
+    else:
+        b = bustard_from_ga1(pathname)
+
+    return b
+
+def bustard_from_ga1(pathname):
+    """Initialize bustard class from ga1 era runfolders.
+    """
     path, name = os.path.split(pathname)
+
     groups = name.split("_")
-    if groups[0].lower().startswith('bustard'):
-        version = re.search(VERSION_RE, groups[0])
-        b.version = version.group(1)
-        t = time.strptime(groups[1], EUROPEAN_STRPTIME)
-        b.date = date(*t[0:3])
-        b.user = groups[2]
-    elif groups[0] == 'BaseCalls':
-        # stub values
-        b.version = None
-        b.date = None
-        b.user = None
+    if len(groups) < 3:
+        msg = "Not enough information to create attributes"\
+              " from directory name: %s"
+        LOGGER.error(msg % (self.pathname,))
+        return None
 
+    b = Bustard()
     b.pathname = pathname
-    bustard_config_filename = os.path.join(pathname, 'config.xml')
-    paramfiles = glob(os.path.join(pathname, "params?.xml"))
-    for paramfile in paramfiles:
-        phasing = Phasing(paramfile)
-        assert (phasing.lane >= 1 and phasing.lane <= 8)
-        b.phasing[phasing.lane] = phasing
+    b.update_attributes_from_pathname()
+    version = re.search(VERSION_RE, groups[0])
+    b.version = version.group(1)
+    t = time.strptime(groups[1], EUROPEAN_STRPTIME)
+    b.date = date(*t[0:3])
+    b.user = groups[2]
+
     # I only found these in Bustard1.9.5/1.9.6 directories
     if b.version in ('1.9.5', '1.9.6'):
         # at least for our runfolders for 1.9.5 and 1.9.6 matrix[1-8].txt are always the same
         crosstalk_file = os.path.join(pathname, "matrix1.txt")
         b.crosstalk = CrosstalkMatrix(crosstalk_file)
+
+    add_phasing(b)
+    return b
+
+
+def bustard_from_ga2(pathname, config_filename):
+    """Initialize bustard class from ga2-era runfolder
+    Actually I don't quite remember if it is exactly the GA2s, but
+    its after the original runfolder style and before the HiSeq.
+    """
     # for version 1.3.2 of the pipeline the bustard version number went down
     # to match the rest of the pipeline. However there's now a nifty
     # new (useful) bustard config file.
-    elif os.path.exists(bustard_config_filename):
-        bustard_config_root = ElementTree.parse(bustard_config_filename)
-        b.bustard_config = bustard_config_root.getroot()
-        b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname, b.bustard_config)
-        software = bustard_config_root.find('*/Software')
-        b.version = software.attrib['Version']
-        #b.version = software.attrib['Name'] + "-" + software.attrib['Version']
+
+    # stub values
+    b = Bustard()
+    b.pathname = pathname
+    b.update_attributes_from_pathname()
+    bustard_config_root = ElementTree.parse(config_filename)
+    b.bustard_config = bustard_config_root.getroot()
+    b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname,
+                                                       b.bustard_config)
+    software = bustard_config_root.find('*/Software')
+    b.version = software.attrib['Version']
+    add_phasing(b)
 
     return b
 
+def bustard_from_hiseq(pathname, config_filename):
+    b = Bustard()
+    b.pathname = pathname
+    bustard_config_root = ElementTree.parse(config_filename)
+    b.bustard_config = bustard_config_root.getroot()
+    software = bustard_config_root.find('*/Software')
+    b.version = software.attrib['Version']
+    add_phasing(b)
+    return b
+
+def add_phasing(bustard_obj):
+    paramfiles = glob(os.path.join(bustard_obj.pathname,
+                                   "params?.xml"))
+    for paramfile in paramfiles:
+        phasing = Phasing(paramfile)
+        assert (phasing.lane >= 1 and phasing.lane <= 8)
+        bustard_obj.phasing[phasing.lane] = phasing
+
 def fromxml(tree):
     """
     Reconstruct a htsworkflow.pipelines.Bustard object from an xml block
index 9d32a5bfd741739edb4d685c7828180f69ed798d..feb03e646e5a741cb9145a2d3a19487958aea67f 100644 (file)
@@ -167,29 +167,35 @@ class Gerald(object):
         if self.tree is None:
             return None
 
-        root = self._get_experiment_root()
-        if root is None:
-            root = ''
-        else:
-            root = os.path.join(root,'')
-
-        experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
-        if experiment_dir is not None:
-            experiment_dir = experiment_dir.replace(root, '')
-        experiment_dir = self.tree.findtext('Defaults/EXPT_DIR')
-        if experiment_dir is not None:
-            _, experiment_dir = os.path.split(experiment_dir)
+        expt_root = os.path.normpath(self._get_experiment_root())
+        chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
+        # hiseqs renamed the experiment dir location
+        defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR')
+
+        experiment_dir = None
+        if defaults_expt_dir is not None:
+            _, experiment_dir = os.path.split(defaults_expt_dir)
+        elif expt_root is not None and chip_expt_dir is not None:
+            experiment_dir = chip_expt_dir.replace(expt_root+os.path.sep, '')
+            experiment_dir = experiment_dir.split(os.path.sep)[0]
+
         if experiment_dir is None or len(experiment_dir) == 0:
             return None
+        return experiment_dir
 
-        dirnames = experiment_dir.split(os.path.sep)
-        return dirnames[0]
     runfolder_name = property(_get_runfolder_name)
 
     def _get_version(self):
         if self.tree is None:
             return None
-        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
+        ga_version = self.tree.findtext(
+            'ChipWideRunParameters/SOFTWARE_VERSION')
+        if ga_version is not None:
+            return ga_version
+        hiseq_software_node = self.tree.find('Software')
+        hiseq_version = hiseq_software_node.attrib['Version']
+        return hiseq_version
+
     version = property(_get_version)
 
     def _get_chip_attribute(self, value):
index 3bfb928d763f442faa4b4b852b8683efabaae93f..9edba3113b1a8d097993d2795a83b9c56acc1ae2 100644 (file)
@@ -186,34 +186,41 @@ def get_runs(runfolder, flowcell_id=None):
         for bustard_pathname in bustard_dirs:
             LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
             b = bustard.bustard(bustard_pathname)
-            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
-            LOGGER.info("Looking for gerald directories in %s" % (pathname,))
-            for gerald_pathname in glob(gerald_glob):
-                LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
-                try:
-                    g = gerald.gerald(gerald_pathname)
-                    p = PipelineRun(runfolder, flowcell_id)
-                    p.datadir = datadir
-                    p.image_analysis = image_analysis
-                    p.bustard = b
-                    p.gerald = g
-                    runs.append(p)
-                except IOError, e:
-                    LOGGER.error("Ignoring " + str(e))
-
-            aligned_glob = os.path.join(runfolder, 'Aligned*')
-            for aligned in glob(aligned_glob):
-                LOGGER.info("Found aligned directory %s" % (aligned,))
-                try:
-                    g = gerald.gerald(aligned)
-                    p = PipelineRun(runfolder, flowcell_id)
-                    p.datadir = datadir
-                    p.image_analysis = image_analysis
-                    p.bustard = b
-                    p.gerald = g
-                    runs.append(p)
-                except IOError, e:
-                    LOGGER.error("Ignoring " + str(e))
+            build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)
+
+            build_aligned_runs(image_analysis, runs, b, datadir, runfolder)
+
+    def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
+        gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+        LOGGER.info("Looking for gerald directories in %s" % (pathname,))
+        for gerald_pathname in glob(gerald_glob):
+            LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
+            try:
+                g = gerald.gerald(gerald_pathname)
+                p = PipelineRun(runfolder, flowcell_id)
+                p.datadir = datadir
+                p.image_analysis = image_analysis
+                p.bustard = b
+                p.gerald = g
+                runs.append(p)
+            except IOError, e:
+                LOGGER.error("Ignoring " + str(e))
+
+
+    def build_aligned_runs(image_analysis, runs, b, datadir, runfolder):
+        aligned_glob = os.path.join(runfolder, 'Aligned*')
+        for aligned in glob(aligned_glob):
+            LOGGER.info("Found aligned directory %s" % (aligned,))
+            try:
+                g = gerald.gerald(aligned)
+                p = PipelineRun(runfolder, flowcell_id)
+                p.datadir = datadir
+                p.image_analysis = image_analysis
+                p.bustard = b
+                p.gerald = g
+                runs.append(p)
+            except IOError, e:
+                LOGGER.error("Ignoring " + str(e))
 
     datadir = os.path.join(runfolder, 'Data')
 
@@ -229,7 +236,7 @@ def get_runs(runfolder, flowcell_id=None):
             )
         else:
             scan_post_image_analysis(
-                runs, runfolder, image_analysis, firecrest_pathname
+                runs, runfolder, datadir, image_analysis, firecrest_pathname
             )
     # scan for IPAR directories
     ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
index 42cf421587b227bc466b8248e4b58124a775ac65..aae60786914a81dc0b121d1805a524e91d0316cb 100644 (file)
@@ -67,9 +67,9 @@ def make_unaligned_config_1_12(unaligned_dir):
         (os.path.join(TESTDATA_DIR, 'demultiplex_1.12.4.2.xml'),
          os.path.join(unaligned_dir, 'DemultiplexConfig.xml')),
         (os.path.join(TESTDATA_DIR, 'demultiplexed_bustard_1.12.4.2.xml'),
-         os.path.join(unaligned_dir, 'DemultiplexConfig.xml')),
+         os.path.join(unaligned_dir, 'DemultiplexedBustardConfig.xml')),
         (os.path.join(TESTDATA_DIR, 'demultiplexed_summary_1.12.4.2.xml'),
-         os.path.join(unaligned_dir, 'DemultiplexConfig.xml')),
+         os.path.join(unaligned_dir, 'DemultiplexedBustardSummary.xml')),
     ]
     for src, dest in demultiplex_pairs:
         shutil.copy(src, dest)
@@ -357,6 +357,10 @@ def make_summary_casava1_7_xml(gerald_dir):
     destination = os.path.join(gerald_dir, 'Summary.xml')
     shutil.copy(source, destination)
 
+def make_summary_rta1_12(status_dir):
+    source = os.path.join(TESTDATA_DIR, 'Summary-rta1_12.htm')
+    destination = os.path.join(status_dir, 'Summary.htm')
+    shutil.copy(source, destination)
 
 def make_eland_results(gerald_dir):
     eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
index 927cf61072f038eb626f4a07deecd937dbbd71b6..70c11bface5ecc3c680d40acc5e8c6c8818e3737 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from datetime import datetime, date
+import logging
 import os
 import tempfile
 import shutil
@@ -33,6 +34,10 @@ def make_runfolder(obj=None):
 
     intensities_dir = make_rta_intensities_1_12(data_dir)
 
+    status_dir = os.path.join(data_dir, 'Status_Files')
+    os.mkdir(status_dir)
+    make_summary_rta1_12(status_dir)
+
     basecalls_dir = make_rta_basecalls_1_12(intensities_dir)
     make_matrix_dir_rta_1_12(basecalls_dir)
 
@@ -70,7 +75,7 @@ class RunfolderTests(unittest.TestCase):
     def test_bustard(self):
         """Construct a bustard object"""
         b = bustard.bustard(self.bustard_dir)
-        self.failUnlessEqual(b.version, '1.8.70.0')
+        self.failUnlessEqual(b.version, '1.12.4.2')
         self.failUnlessEqual(b.date,    None)
         self.failUnlessEqual(b.user,    None)
         self.failUnlessEqual(len(b.phasing), 0)
@@ -85,13 +90,10 @@ class RunfolderTests(unittest.TestCase):
         # need to update gerald and make tests for it
         g = gerald.gerald(self.gerald_dir)
 
-        self.failUnlessEqual(g.version,
-            '@(#) Id: GERALD.pl,v 1.171 2008/05/19 17:36:14 mzerara Exp')
-        self.failUnlessEqual(g.date, datetime(2009,2,22,21,15,59))
+        self.failUnlessEqual(g.version, 'CASAVA-1.8.1')
         self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
         self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
 
-
         # list of genomes, matches what was defined up in
         # make_gerald_config.
         # the first None is to offset the genomes list to be 1..9
@@ -283,9 +285,6 @@ def suite():
     return unittest.makeSuite(RunfolderTests,'test')
 
 if __name__ == "__main__":
-    #unittest.main(defaultTest="suite")
-    class Test(object): pass
-    t = Test()
-    make_runfolder(t)
-    print ('path ' + t.runfolder_dir)
+    logging.basicConfig(level=logging.WARN)
+    unittest.main(defaultTest="suite")
 
diff --git a/htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm b/htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm
new file mode 100755 (executable)
index 0000000..b11632d
--- /dev/null
@@ -0,0 +1,53 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r
+<head>\r
+       <link rel="stylesheet" href="./Status.css" type="text/css" media="screen"/>\r
+       <script src="./Status.js" type="text/javascript"></script>\r
+       <script src="./Summary.js" type="text/javascript"></script>\r
+       <title>Status</title>\r
+</head>\r
\r
+<body>\r
+       <table style="border:none"><tr>\r
+               <td><img alt="Refresh" title="Refresh" name="RefreshBtn" height="25" width="25" src="./RefreshBtn/r1.png"  onmouseout="this.src='./RefreshBtn/r1.png'" onmouseover="this.src='./RefreshBtn/r2.png'" onmousedown="this.src='./RefreshBtn/r3.png'" onmouseup="this.src='./RefreshBtn/r2.png'" onclick="window.location.reload(true)" />&#160;</td>\r
+               <td id="StatUpdateDiv" >\r
+                       <table class="StatusTable"><tr>\r
+                               <th>Total:</th><td>0</td><th>Extracted:</th><td>0</td><th>Called:</th><td>0</td>\r
+                               <th>Scored:</th><td>0</td><th>Copied:</th><td>0</td><th>Err.Rated:</th><td>0</td>\r
+                       </tr></table></td> \r
+               <td id="RunNameTD">Loading Run Data ...</td>\r
+       </tr></table>\r
+       <script type="text/javascript">\r
+               loadXSLTable('../reports/StatusUpdate.xml', './StatusUpdate.xsl', 'StatUpdateDiv');\r
+               runName = xslTransform(loadXMLDoc('../reports/Status.xml'), loadXMLDoc('./RunName.xsl'));\r
+               document.getElementById("RunNameTD").innerHTML = runName;\r
+               document.title = runName + " Status";\r
+       </script>\r
+<ul id="tabmenu">\r
+       <li><a href="./RunInfo.htm">Run Info</a></li>\r
+       <li><a href="../Status.htm">Tile Status</a></li>\r
+       <li><a href="./Charts.htm">Charts</a></li>\r
+       <li><a class="selected" href="./Summary.htm">Summary</a></li>\r
+  <li><a class="space">Plots:</a></li>\r
+       <li><a href="./ByLane.htm">Cluster Density</a></li>\r
+       <li><a href="./ByCycle.htm">Data By Cycle</a></li>\r
+</ul>\r
\r
+<div id="container">\r
+       <div id="SumTbl1" ><p> No data available yet.</p></div>\r
+       <div id="SumTbl2"></div>\r
+       <div id="SumTbl3"></div>\r
+       <div id="SumTbl4"></div>\r
+       <div id="SumTbl5"></div>\r
+       <div id="SumTbl6"></div>\r
+       <div id="SumTbl7"></div>\r
+       <div id="SumTbl8"></div>\r
+       <div id="SumTbl9"></div>\r
+       <div id="SumTbl10"></div>\r
+       <div id="SumTbl11"></div>\r
+       <script type="text/javascript">loadSumTable()</script>\r
+</div>\r
\r
+</body>\r
+</html>\r
+\r