For pipeline 1.1rc1 or 1.3.2, look for the matrix files in the bustard dir
authorDiane Trout <diane@caltech.edu>
Sat, 28 Mar 2009 02:17:22 +0000 (02:17 +0000)
committerDiane Trout <diane@caltech.edu>
Sat, 28 Mar 2009 02:17:22 +0000 (02:17 +0000)
also if the bustard config.xml file is present, check to see if the
matrix file was forced in there.

htsworkflow/pipelines/bustard.py
htsworkflow/pipelines/firecrest.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/test/simulate_runfolder.py
htsworkflow/pipelines/test/test_runfolder_ipar130.py
htsworkflow/pipelines/test/testdata/bustard-config132.xml [new file with mode: 0644]
scripts/srf

index a10c909afc5f8f84f53a91f903289454fa3e58a8..a8eb4c7b7b3d3b6875b2013a5c6737cf40477f83 100644 (file)
@@ -4,12 +4,14 @@ Extract configuration from Illumina Bustard Directory.
 This includes the version number, run date, bustard executable parameters, and 
 phasing estimates. 
 """
+from copy import copy
 from datetime import date
 from glob import glob
 import logging
 import os
-import time
 import re
+import sys
+import time
 
 from htsworkflow.pipelines.runfolder import \
    ElementTree, \
@@ -19,6 +21,8 @@ from htsworkflow.pipelines.runfolder import \
 # make epydoc happy
 __docformat__ = "restructuredtext en"
 
+LANE_LIST = range(1,9)
+
 class Phasing(object):
     PHASING = 'Phasing'
     PREPHASING = 'Prephasing'
@@ -44,10 +48,13 @@ class Phasing(object):
 
     def get_elements(self):
         root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
+        root.tail = os.linesep
         phasing = ElementTree.SubElement(root, Phasing.PHASING)
         phasing.text = str(self.phasing)
+        phasing.tail = os.linesep
         prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
         prephasing.text = str(self.prephasing)
+        prephasing.tail = os.linesep
         return root
 
     def set_elements(self, tree):
@@ -62,8 +69,112 @@ class Phasing(object):
             elif element.tag == Phasing.PREPHASING:
                 self.prephasing = float(element.text)
 
+class CrosstalkMatrix(object):
+    CROSSTALK = "MatrixElements"
+    BASE = 'Base'
+    ELEMENT = 'Element'
+
+    def __init__(self, fromfile=None, xml=None):
+        self.base = {}
+
+        if fromfile is not None:
+            self._initialize_from_file(fromfile)
+        elif xml is not None:
+            self.set_elements(xml)
+
+    def _initialize_from_file(self, pathname):
+        data = open(pathname).readlines()
+        auto_header = '# Auto-generated frequency response matrix'
+        if data[0].strip() != auto_header or len(data) != 9:
+            raise RuntimeError("matrix file %s is unusual" % (pathname,))
+        # skip over lines 1,2,3,4 which contain the 4 bases
+        self.base['A'] = [ float(v) for v in data[5].split() ]
+        self.base['C'] = [ float(v) for v in data[6].split() ]
+        self.base['G'] = [ float(v) for v in data[7].split() ]
+        self.base['T'] = [ float(v) for v in data[8].split() ]
+
+    def get_elements(self):
+        root = ElementTree.Element(CrosstalkMatrix.CROSSTALK)
+        root.tail = os.linesep
+        base_order = ['A','C','G','T']
+        for b in base_order:
+            base_element = ElementTree.SubElement(root, CrosstalkMatrix.BASE)
+            base_element.text = b
+            base_element.tail = os.linesep
+        for b in base_order:
+            for value in self.base[b]:
+                crosstalk_value = ElementTree.SubElement(root, CrosstalkMatrix.ELEMENT)
+                crosstalk_value.text = unicode(value)
+                crosstalk_value.tail = os.linesep
+
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag != CrosstalkMatrix.CROSSTALK:
+            raise ValueError('Invalid run-xml exptected '+CrosstalkMatrix.CROSSTALK)
+        base_order = []
+        current_base = None
+        current_index = 0
+        for element in tree.getchildren():
+            # read in the order of the bases
+            if element.tag == 'Base':
+                base_order.append(element.text)
+            elif element.tag == 'Element':
+                # we're done reading bases, now its just the 4x4 matrix
+                # written out as a list of elements
+                # if this is the first element, make a copy of the list
+                # to play with and initialize an empty list for the current base
+                if current_base is None:
+                    current_base = copy(base_order)
+                    self.base[current_base[0]] = []
+                # we found (probably) 4 bases go to the next base
+                if current_index == len(base_order):
+                    current_base.pop(0)
+                    current_index = 0
+                    self.base[current_base[0]] = []
+                value = float(element.text)
+                self.base[current_base[0]].append(value)
+
+                current_index += 1
+            else:
+                raise RuntimeError("Unrecognized tag in run xml: %s" %(element.tag,))
+
+def crosstalk_matrix_from_bustard_config(bustard_path, bustard_config_tree):
+    """
+    Analyze the bustard config file and try to find the crosstalk matrix.
+    """
+    bustard_run = bustard_config_tree[0]
+    if bustard_run.tag != 'Run':
+        raise RuntimeError('Expected Run tag, got %s' % (bustard_run.tag,))
+
+    call_parameters = bustard_run.find('BaseCallParameters')
+    if call_parameters is None:
+        raise RuntimeError('Missing BaseCallParameters section')
+
+    matrix = call_parameters.find('Matrix')
+    if matrix is None:
+        raise RuntimeError('Expected to find Matrix in Bustard BaseCallParameters')
+
+    matrix_auto_flag = int(matrix.find('AutoFlag').text)
+    matrix_auto_lane = int(matrix.find('AutoLane').text)
+
+    if matrix_auto_flag:
+        # we estimated the matrix from something in this run.
+        # though we don't really care which lane it was
+        matrix_path = os.path.join(bustard_path, 'Matrix', 's_02_matrix.txt')
+        matrix = CrosstalkMatrix(matrix_path)
+    else:
+        # the matrix was provided
+        matrix_elements = call_parameters.find('MatrixElements')
+        if matrix_elements is None:
+            raise RuntimeError('Expected to find MatrixElements in Bustard BaseCallParameters')
+        matrix = CrosstalkMatrix(xml=matrix_elements)
+
+    print "matrix:", matrix
+    return matrix
+
 class Bustard(object):
-    XML_VERSION = 1
+    XML_VERSION = 2
 
     # Xml Tags
     BUSTARD = 'Bustard'
@@ -71,13 +182,16 @@ class Bustard(object):
     DATE = 'run_time'
     USER = 'user'
     PARAMETERS = 'Parameters'
+    BUSTARD_CONFIG = 'BaseCallAnalysis'
 
     def __init__(self, xml=None):
         self.version = None
         self.date = date.today()
         self.user = None
         self.phasing = {}
+        self.crosstalk = {}
         self.pathname = None
+        self.bustard_config = None
 
         if xml is not None:
             self.set_elements(xml)
@@ -87,12 +201,8 @@ class Bustard(object):
     time = property(_get_time, doc='return run time as seconds since epoch')
 
     def dump(self):
-        print "Bustard version:", self.version
-        print "Run date", self.date
-        print "user:", self.user
-        for lane, tree in self.phasing.items():
-            print lane
-            print tree
+        #print ElementTree.tostring(self.get_elements())
+        ElementTree.dump(self.get_elements())
 
     def get_elements(self):
         root = ElementTree.Element('Bustard', 
@@ -104,8 +214,13 @@ class Bustard(object):
         user = ElementTree.SubElement(root, Bustard.USER)
         user.text = self.user
         params = ElementTree.SubElement(root, Bustard.PARAMETERS)
-        for p in self.phasing.values():
-            params.append(p.get_elements())
+
+        for lane in LANE_LIST:
+            params.append(self.phasing[lane].get_elements())
+            #params.append(self.crosstalk[lane].get_elements())
+        
+        if self.bustard_config is not None:
+            root.append(self.bustard_config)
         return root
 
     def set_elements(self, tree):
@@ -125,11 +240,11 @@ class Bustard(object):
                 for param in element:
                     p = Phasing(xml=param)
                     self.phasing[p.lane] = p
+            elif element.tag == Bustard.BUSTARD_CONFIG:
+                self.bustard_config = element
             else:
                 raise ValueError("Unrecognized tag: %s" % (element.tag,))
         
-
-
 def bustard(pathname):
     """
     Construct a Bustard object by analyzing an Illumina Bustard directory.
@@ -141,6 +256,7 @@ def bustard(pathname):
       Fully initialized Bustard object.
     """
     b = Bustard()
+    pathname = os.path.abspath(pathname)
     path, name = os.path.split(pathname)
     groups = name.split("_")
     version = re.search(VERSION_RE, groups[0])
@@ -149,11 +265,26 @@ def bustard(pathname):
     b.date = date(*t[0:3])
     b.user = groups[2]
     b.pathname = pathname
+    bustard_config_filename = os.path.join(pathname, 'config.xml')
+    print bustard_config_filename
     paramfiles = glob(os.path.join(pathname, "params?.xml"))
     for paramfile in paramfiles:
         phasing = Phasing(paramfile)
         assert (phasing.lane >= 1 and phasing.lane <= 8)
         b.phasing[phasing.lane] = phasing
+    # I only found these in Bustard1.9.5/1.9.6 directories
+    if b.version in ('1.9.5', '1.9.6'):
+        # at least for our runfolders for 1.9.5 and 1.9.6 matrix[1-8].txt are always the same
+        crosstalk_file = os.path.join(pathname, "matrix1.txt")
+        b.crosstalk = CrosstalkMatrix(crosstalk_file)
+    # for version 1.3.2 of the pipeline the bustard version number went down
+    # to match the rest of the pipeline. However there's now a nifty
+    # new (useful) bustard config file.
+    elif os.path.exists(bustard_config_filename):
+        bustard_config_root = ElementTree.parse(bustard_config_filename)
+        b.bustard_config = bustard_config_root.getroot()
+        b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname, b.bustard_config)
+
     return b
 
 def fromxml(tree):
@@ -163,3 +294,33 @@ def fromxml(tree):
     b = Bustard()
     b.set_elements(tree)
     return b
+
+def make_cmdline_parser():
+    from optparse import OptionParser
+    parser = OptionParser('%prog: bustard_directory')
+    return parser
+
+def main(cmdline):
+    parser = make_cmdline_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    for bustard_dir in args:
+        print u'analyzing bustard directory: ' + unicode(bustard_dir)
+        bustard_object = bustard(bustard_dir)
+        bustard_object.dump() 
+
+        bustard_object2 = Bustard(xml=bustard_object.get_elements())
+        print ('-------------------------------------')
+        bustard_object2.dump()
+        print ('=====================================')
+        b1_tree = bustard_object.get_elements()
+        b1 = ElementTree.tostring(b1_tree).split(os.linesep)
+        b2_tree = bustard_object2.get_elements()
+        b2 = ElementTree.tostring(b2_tree).split(os.linesep)
+        for line1, line2 in zip(b1, b2):
+            if b1 != b2:
+                print "b1: ", b1
+                print "b2: ", b2
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
index 847f608bcb5be87832266037dfc9350d1b61b128..bcac6edd668810dc0bed9d7286751a71d157b074 100644 (file)
@@ -11,6 +11,7 @@ fromxml
 """
 
 from datetime import date
+from glob import glob
 import os
 import re
 import time
@@ -117,12 +118,19 @@ def firecrest(pathname):
     # username
     f.user = groups[3]
 
+    bustard_pattern = os.path.join(pathname, 'Bustard*')
     # should I parse this deeper than just stashing the 
     # contents of the matrix file?
     matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
-    if not os.path.exists(matrix_pathname):
+    if os.path.exists(matrix_pathname):
+        # this is for firecrest < 1.3.2
+        f.matrix = open(matrix_pathname, 'r').read()
+    elif glob(bustard_pattern) > 0:
+        f.matrix = None
+        # there are runs here. Bustard should save the matrix.
+    else:
         return None
-    f.matrix = open(matrix_pathname, 'r').read()
+
     return f
 
 def fromxml(tree):
index e1072400716b1bb3affd5c65158baf63701feadb..b8cbc56b1902aaf7b9d7f790b4dba71929c6d9c9 100644 (file)
@@ -321,6 +321,8 @@ def extract_results(runs, output_base_dir=None):
       # save run file
       r.save(cycle_dir)
 
+      return
+
       # Copy Summary.htm
       summary_path = os.path.join(r.gerald.pathname, 'Summary.htm')
       if os.path.exists(summary_path):
index 53c7301483eae6b009a3dd92b4af8c29d94c260b..455d8c1b73bdb537372d710c50f4ca3f5ef980a3 100644 (file)
@@ -15,14 +15,14 @@ def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
     os.mkdir(firecrest_dir)
     return firecrest_dir
     
-def make_ipar_dir(data_dir):
+def make_ipar_dir(data_dir, version='1.01'):
     """
     Construct an artificial ipar parameter file and directory
     """
     ipar1_01_file = os.path.join(TESTDATA_DIR, 'IPAR1.01.params')
     shutil.copy(ipar1_01_file, os.path.join(data_dir, '.params'))
 
-    ipar_dir = os.path.join(data_dir, 'IPAR_1.01')
+    ipar_dir = os.path.join(data_dir, 'IPAR_%s' % (version,))
     if not os.path.exists(ipar_dir):
       os.mkdir(ipar_dir)
     return ipar_dir
@@ -44,6 +44,11 @@ def make_flowcell_id(runfolder_dir, flowcell_id=None):
     f.write(config)
     f.close()
 
+def make_bustard_config132(gerald_dir):
+    source = os.path.join(TESTDATA_DIR, 'bustard-config132.xml')
+    destination = os.path.join(gerald_dir, 'config.xml')
+    shutil.copy(source, destination)
+
 def make_matrix(matrix_filename):
     contents = """# Auto-generated frequency response matrix
 > A
index 23e5e9dd7c9a8c9ffebf44e9a135ae732cf75ea7..efa6fe1e97409947d251ba6bed110684793f2742 100644 (file)
@@ -29,12 +29,13 @@ def make_runfolder(obj=None):
     data_dir = os.path.join(runfolder_dir, 'Data')
     os.mkdir(data_dir)
 
-    ipar_dir = make_ipar_dir(data_dir)
+    ipar_dir = make_ipar_dir(data_dir, '1.30')
 
     bustard_dir = os.path.join(ipar_dir,
                                'Bustard1.3.2_15-03-2008_diane')
     os.mkdir(bustard_dir)
     make_phasing_params(bustard_dir)
+    make_bustard_config132(bustard_dir)
 
     gerald_dir = os.path.join(bustard_dir,
                               'GERALD_15-03-2008_diane')
@@ -94,6 +95,27 @@ class RunfolderTests(unittest.TestCase):
         self.failUnlessEqual(b.user,    'diane')
         self.failUnlessEqual(len(b.phasing), 8)
         self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
+        self.failUnlessEqual(b.crosstalk.base.keys(), ['A','C','T','G'])
+
+        self.failUnlessAlmostEqual(b.crosstalk.base['A'][0],  1.27)
+        self.failUnlessAlmostEqual(b.crosstalk.base['A'][1],  0.20999999999999)
+        self.failUnlessAlmostEqual(b.crosstalk.base['A'][2], -0.02)
+        self.failUnlessAlmostEqual(b.crosstalk.base['A'][3], -0.03)
+
+        self.failUnlessAlmostEqual(b.crosstalk.base['C'][0],  0.57)
+        self.failUnlessAlmostEqual(b.crosstalk.base['C'][1],  0.58)
+        self.failUnlessAlmostEqual(b.crosstalk.base['C'][2], -0.01)
+        self.failUnlessAlmostEqual(b.crosstalk.base['C'][3], -0.01)
+
+        self.failUnlessAlmostEqual(b.crosstalk.base['T'][0], -0.02)
+        self.failUnlessAlmostEqual(b.crosstalk.base['T'][1], -0.02)
+        self.failUnlessAlmostEqual(b.crosstalk.base['T'][2],  0.80)
+        self.failUnlessAlmostEqual(b.crosstalk.base['T'][3],  1.07)
+
+        self.failUnlessAlmostEqual(b.crosstalk.base['G'][0], -0.03)
+        self.failUnlessAlmostEqual(b.crosstalk.base['G'][1], -0.04)
+        self.failUnlessAlmostEqual(b.crosstalk.base['G'][2],  1.51)
+        self.failUnlessAlmostEqual(b.crosstalk.base['G'][3], -0.02)
 
         xml = b.get_elements()
         b2 = bustard.Bustard(xml=xml)
diff --git a/htsworkflow/pipelines/test/testdata/bustard-config132.xml b/htsworkflow/pipelines/test/testdata/bustard-config132.xml
new file mode 100644 (file)
index 0000000..2820451
--- /dev/null
@@ -0,0 +1,129 @@
+<?xml version="1.0"?>
+<BaseCallAnalysis>
+  <Run Name="Bustard1.3.2_25-03-2009_diane">
+    <BaseCallParameters>
+      <ChastityThreshold>0.600000</ChastityThreshold>
+      <Matrix Path="/home/diane/gec/090206_HWI-EAS229_0090_311AFAAXX/Data/C1-152_Firecrest1.9.5_14-02-2009_diane/Matrix/s_4_matrix.txt">
+        <AutoFlag>0</AutoFlag>
+        <AutoLane>0</AutoLane>
+        <Cycle>2</Cycle>
+        <CycleOffset>1</CycleOffset>
+        <FirstCycle>1</FirstCycle>
+        <LastCycle>37</LastCycle>
+        <Read>1</Read>
+      </Matrix>
+      <MatrixElements Path="/home/diane/gec/090206_HWI-EAS229_0090_311AFAAXX/Data/C1-152_Firecrest1.9.5_14-02-2009_diane/Matrix/s_4_matrix.txt">
+        <Base>A</Base>
+        <Base>C</Base>
+        <Base>G</Base>
+        <Base>T</Base>
+        <Element>1.270000</Element>
+        <Element>0.210000</Element>
+        <Element>-0.020000</Element>
+        <Element>-0.030000</Element>
+        <Element>0.570000</Element>
+        <Element>0.580000</Element>
+        <Element>-0.010000</Element>
+        <Element>-0.010000</Element>
+        <Element>-0.030000</Element>
+        <Element>-0.040000</Element>
+        <Element>1.510000</Element>
+        <Element>-0.020000</Element>
+        <Element>-0.020000</Element>
+        <Element>-0.020000</Element>
+        <Element>0.800000</Element>
+        <Element>1.070000</Element>
+      </MatrixElements>
+      <Phasing Path="">
+        <AutoFlag>1</AutoFlag>
+        <AutoLane>3</AutoLane>
+        <Cycle>1</Cycle>
+        <CycleOffset>0</CycleOffset>
+        <FirstCycle>1</FirstCycle>
+        <LastCycle>37</LastCycle>
+        <Read>1</Read>
+      </Phasing>
+      <PhasingRestarts />
+      <PrbCompression>none</PrbCompression>
+      <PureBases>25</PureBases>
+      <QhgCompression>none</QhgCompression>
+      <SeqCompression>none</SeqCompression>
+      <Sig2Compression>gzip</Sig2Compression>
+      <SmtFilter>failed-chastity</SmtFilter>
+      <SmtRelation>le</SmtRelation>
+      <SmtThreshold>1.000000</SmtThreshold>
+    </BaseCallParameters>
+    <Cycles First="1" Last="37" Number="37" />
+    <Input Path="C1-37_Firecrest1.3.2_25-03-2009_diane" />
+    <RunParameters>
+      <AutoCycleFlag>1</AutoCycleFlag>
+      <BasecallFlag>1</BasecallFlag>
+      <Compression>gzip</Compression>
+      <CompressionSuffix>.gz</CompressionSuffix>
+      <Deblocked>0</Deblocked>
+      <DebugFlag>0</DebugFlag>
+      <FirstRunOnlyFlag>0</FirstRunOnlyFlag>
+      <ImagingReads Index="1">
+        <FirstCycle>1</FirstCycle>
+        <LastCycle>37</LastCycle>
+        <RunFolder>/zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX</RunFolder>
+      </ImagingReads>
+      <Instrument>HWI-EAS229</Instrument>
+      <IterativeMatrixFlag>0</IterativeMatrixFlag>
+      <MakeFlag>1</MakeFlag>
+      <MaxCycle>-1</MaxCycle>
+      <MinCycle>-1</MinCycle>
+      <Prb></Prb>
+      <Reads Index="1">
+        <FirstCycle>1</FirstCycle>
+        <LastCycle>37</LastCycle>
+        <RunFolder>/zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX</RunFolder>
+      </Reads>
+      <RunFolder>/zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX</RunFolder>
+      <RunFolderDate>090307</RunFolderDate>
+      <RunFolderId>97</RunFolderId>
+      <SelectedFolders />
+      <SelectedTiles />
+      <Sig2></Sig2>
+    </RunParameters>
+    <Software Name="Bustard" Version="1.3.2" />
+    <TileSelection>
+      <Lane Index="1">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="2">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="3">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="4">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="5">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="6">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="7">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+      <Lane Index="8">
+        <Sample>s</Sample>
+        <TileRange Max="100" Min="1" />
+      </Lane>
+    </TileSelection>
+    <Time>
+      <Start>25-03-09 15:23:10 PDT</Start>
+    </Time>
+    <User Name="diane" />
+  </Run>
+</BaseCallAnalysis>
index 4f487ea9d1106e30804fda38576b0431a6160c2b..09a2155995ed73f1031980d9ca806b0de58fe76f 100644 (file)
@@ -30,15 +30,15 @@ def make_commands(run_name, lanes, site_name, destdir):
     dest_path = os.path.join(destdir, destname)
     seq_pattern = 's_%d_*_seq.txt' % (lane,)
 
-    #cmd = ['solexa2srf', 
-    #       '-N', name_prefix,
-    #       '-n', '%3x:%3y', 
-    #       '-o', dest_path, 
-    #       seq_pattern]
-    cmd = ['illumina2srf', 
-           '-v1.0',
-           '-o', dest_path,
+    cmd = ['solexa2srf', 
+           '-N', name_prefix,
+           '-n', '%3x:%3y', 
+           '-o', dest_path, 
            seq_pattern]
+    #cmd = ['illumina2srf', 
+    #       '-v1.0',
+    #       '-o', dest_path,
+    #       seq_pattern]
 
     cmd_list.append(" ".join(cmd))
   return cmd_list