From: Diane Trout Date: Sat, 28 Mar 2009 02:17:22 +0000 (+0000) Subject: For pipeline 1.1rc1 or 1.3.2, look for the matrix files in the bustard dir X-Git-Tag: 0.2.0.2~3 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=7ea58f404c983b85a493155db3fa973b450e033e For pipeline 1.1rc1 or 1.3.2, look for the matrix files in the bustard dir also if the bustard config.xml file is present, check to see if the matrix file was forced in there. --- diff --git a/htsworkflow/pipelines/bustard.py b/htsworkflow/pipelines/bustard.py index a10c909..a8eb4c7 100644 --- a/htsworkflow/pipelines/bustard.py +++ b/htsworkflow/pipelines/bustard.py @@ -4,12 +4,14 @@ Extract configuration from Illumina Bustard Directory. This includes the version number, run date, bustard executable parameters, and phasing estimates. """ +from copy import copy from datetime import date from glob import glob import logging import os -import time import re +import sys +import time from htsworkflow.pipelines.runfolder import \ ElementTree, \ @@ -19,6 +21,8 @@ from htsworkflow.pipelines.runfolder import \ # make epydoc happy __docformat__ = "restructuredtext en" +LANE_LIST = range(1,9) + class Phasing(object): PHASING = 'Phasing' PREPHASING = 'Prephasing' @@ -44,10 +48,13 @@ class Phasing(object): def get_elements(self): root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)}) + root.tail = os.linesep phasing = ElementTree.SubElement(root, Phasing.PHASING) phasing.text = str(self.phasing) + phasing.tail = os.linesep prephasing = ElementTree.SubElement(root, Phasing.PREPHASING) prephasing.text = str(self.prephasing) + prephasing.tail = os.linesep return root def set_elements(self, tree): @@ -62,8 +69,112 @@ class Phasing(object): elif element.tag == Phasing.PREPHASING: self.prephasing = float(element.text) +class CrosstalkMatrix(object): + CROSSTALK = "MatrixElements" + BASE = 'Base' + ELEMENT = 'Element' + + def __init__(self, fromfile=None, xml=None): + self.base = {} + + if fromfile is not None: + self._initialize_from_file(fromfile) + elif xml is not None: + self.set_elements(xml) + + def _initialize_from_file(self, pathname): + data = open(pathname).readlines() + auto_header = '# Auto-generated frequency response matrix' + if data[0].strip() != auto_header or len(data) != 9: + raise RuntimeError("matrix file %s is unusual" % (pathname,)) + # skip over lines 1,2,3,4 which contain the 4 bases + self.base['A'] = [ float(v) for v in data[5].split() ] + self.base['C'] = [ float(v) for v in data[6].split() ] + self.base['G'] = [ float(v) for v in data[7].split() ] + self.base['T'] = [ float(v) for v in data[8].split() ] + + def get_elements(self): + root = ElementTree.Element(CrosstalkMatrix.CROSSTALK) + root.tail = os.linesep + base_order = ['A','C','G','T'] + for b in base_order: + base_element = ElementTree.SubElement(root, CrosstalkMatrix.BASE) + base_element.text = b + base_element.tail = os.linesep + for b in base_order: + for value in self.base[b]: + crosstalk_value = ElementTree.SubElement(root, CrosstalkMatrix.ELEMENT) + crosstalk_value.text = unicode(value) + crosstalk_value.tail = os.linesep + + return root + + def set_elements(self, tree): + if tree.tag != CrosstalkMatrix.CROSSTALK: + raise ValueError('Invalid run-xml exptected '+CrosstalkMatrix.CROSSTALK) + base_order = [] + current_base = None + current_index = 0 + for element in tree.getchildren(): + # read in the order of the bases + if element.tag == 'Base': + base_order.append(element.text) + elif element.tag == 'Element': + # we're done reading bases, now its just the 4x4 matrix + # written out as a list of elements + # if this is the first element, make a copy of the list + # to play with and initialize an empty list for the current base + if current_base is None: + current_base = copy(base_order) + self.base[current_base[0]] = [] + # we found (probably) 4 bases go to the next base + if current_index == len(base_order): + current_base.pop(0) + current_index = 0 + self.base[current_base[0]] = [] + value = float(element.text) + self.base[current_base[0]].append(value) + + current_index += 1 + else: + raise RuntimeError("Unrecognized tag in run xml: %s" %(element.tag,)) + +def crosstalk_matrix_from_bustard_config(bustard_path, bustard_config_tree): + """ + Analyze the bustard config file and try to find the crosstalk matrix. + """ + bustard_run = bustard_config_tree[0] + if bustard_run.tag != 'Run': + raise RuntimeError('Expected Run tag, got %s' % (bustard_run.tag,)) + + call_parameters = bustard_run.find('BaseCallParameters') + if call_parameters is None: + raise RuntimeError('Missing BaseCallParameters section') + + matrix = call_parameters.find('Matrix') + if matrix is None: + raise RuntimeError('Expected to find Matrix in Bustard BaseCallParameters') + + matrix_auto_flag = int(matrix.find('AutoFlag').text) + matrix_auto_lane = int(matrix.find('AutoLane').text) + + if matrix_auto_flag: + # we estimated the matrix from something in this run. + # though we don't really care which lane it was + matrix_path = os.path.join(bustard_path, 'Matrix', 's_02_matrix.txt') + matrix = CrosstalkMatrix(matrix_path) + else: + # the matrix was provided + matrix_elements = call_parameters.find('MatrixElements') + if matrix_elements is None: + raise RuntimeError('Expected to find MatrixElements in Bustard BaseCallParameters') + matrix = CrosstalkMatrix(xml=matrix_elements) + + print "matrix:", matrix + return matrix + class Bustard(object): - XML_VERSION = 1 + XML_VERSION = 2 # Xml Tags BUSTARD = 'Bustard' @@ -71,13 +182,16 @@ class Bustard(object): DATE = 'run_time' USER = 'user' PARAMETERS = 'Parameters' + BUSTARD_CONFIG = 'BaseCallAnalysis' def __init__(self, xml=None): self.version = None self.date = date.today() self.user = None self.phasing = {} + self.crosstalk = {} self.pathname = None + self.bustard_config = None if xml is not None: self.set_elements(xml) @@ -87,12 +201,8 @@ class Bustard(object): time = property(_get_time, doc='return run time as seconds since epoch') def dump(self): - print "Bustard version:", self.version - print "Run date", self.date - print "user:", self.user - for lane, tree in self.phasing.items(): - print lane - print tree + #print ElementTree.tostring(self.get_elements()) + ElementTree.dump(self.get_elements()) def get_elements(self): root = ElementTree.Element('Bustard', @@ -104,8 +214,13 @@ class Bustard(object): user = ElementTree.SubElement(root, Bustard.USER) user.text = self.user params = ElementTree.SubElement(root, Bustard.PARAMETERS) - for p in self.phasing.values(): - params.append(p.get_elements()) + + for lane in LANE_LIST: + params.append(self.phasing[lane].get_elements()) + #params.append(self.crosstalk[lane].get_elements()) + + if self.bustard_config is not None: + root.append(self.bustard_config) return root def set_elements(self, tree): @@ -125,11 +240,11 @@ class Bustard(object): for param in element: p = Phasing(xml=param) self.phasing[p.lane] = p + elif element.tag == Bustard.BUSTARD_CONFIG: + self.bustard_config = element else: raise ValueError("Unrecognized tag: %s" % (element.tag,)) - - def bustard(pathname): """ Construct a Bustard object by analyzing an Illumina Bustard directory. @@ -141,6 +256,7 @@ def bustard(pathname): Fully initialized Bustard object. """ b = Bustard() + pathname = os.path.abspath(pathname) path, name = os.path.split(pathname) groups = name.split("_") version = re.search(VERSION_RE, groups[0]) @@ -149,11 +265,26 @@ def bustard(pathname): b.date = date(*t[0:3]) b.user = groups[2] b.pathname = pathname + bustard_config_filename = os.path.join(pathname, 'config.xml') + print bustard_config_filename paramfiles = glob(os.path.join(pathname, "params?.xml")) for paramfile in paramfiles: phasing = Phasing(paramfile) assert (phasing.lane >= 1 and phasing.lane <= 8) b.phasing[phasing.lane] = phasing + # I only found these in Bustard1.9.5/1.9.6 directories + if b.version in ('1.9.5', '1.9.6'): + # at least for our runfolders for 1.9.5 and 1.9.6 matrix[1-8].txt are always the same + crosstalk_file = os.path.join(pathname, "matrix1.txt") + b.crosstalk = CrosstalkMatrix(crosstalk_file) + # for version 1.3.2 of the pipeline the bustard version number went down + # to match the rest of the pipeline. However there's now a nifty + # new (useful) bustard config file. + elif os.path.exists(bustard_config_filename): + bustard_config_root = ElementTree.parse(bustard_config_filename) + b.bustard_config = bustard_config_root.getroot() + b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname, b.bustard_config) + return b def fromxml(tree): @@ -163,3 +294,33 @@ def fromxml(tree): b = Bustard() b.set_elements(tree) return b + +def make_cmdline_parser(): + from optparse import OptionParser + parser = OptionParser('%prog: bustard_directory') + return parser + +def main(cmdline): + parser = make_cmdline_parser() + opts, args = parser.parse_args(cmdline) + + for bustard_dir in args: + print u'analyzing bustard directory: ' + unicode(bustard_dir) + bustard_object = bustard(bustard_dir) + bustard_object.dump() + + bustard_object2 = Bustard(xml=bustard_object.get_elements()) + print ('-------------------------------------') + bustard_object2.dump() + print ('=====================================') + b1_tree = bustard_object.get_elements() + b1 = ElementTree.tostring(b1_tree).split(os.linesep) + b2_tree = bustard_object2.get_elements() + b2 = ElementTree.tostring(b2_tree).split(os.linesep) + for line1, line2 in zip(b1, b2): + if b1 != b2: + print "b1: ", b1 + print "b2: ", b2 + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/htsworkflow/pipelines/firecrest.py b/htsworkflow/pipelines/firecrest.py index 847f608..bcac6ed 100644 --- a/htsworkflow/pipelines/firecrest.py +++ b/htsworkflow/pipelines/firecrest.py @@ -11,6 +11,7 @@ fromxml """ from datetime import date +from glob import glob import os import re import time @@ -117,12 +118,19 @@ def firecrest(pathname): # username f.user = groups[3] + bustard_pattern = os.path.join(pathname, 'Bustard*') # should I parse this deeper than just stashing the # contents of the matrix file? matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') - if not os.path.exists(matrix_pathname): + if os.path.exists(matrix_pathname): + # this is for firecrest < 1.3.2 + f.matrix = open(matrix_pathname, 'r').read() + elif glob(bustard_pattern) > 0: + f.matrix = None + # there are runs here. Bustard should save the matrix. + else: return None - f.matrix = open(matrix_pathname, 'r').read() + return f def fromxml(tree): diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index e107240..b8cbc56 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -321,6 +321,8 @@ def extract_results(runs, output_base_dir=None): # save run file r.save(cycle_dir) + return + # Copy Summary.htm summary_path = os.path.join(r.gerald.pathname, 'Summary.htm') if os.path.exists(summary_path): diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index 53c7301..455d8c1 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -15,14 +15,14 @@ def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37): os.mkdir(firecrest_dir) return firecrest_dir -def make_ipar_dir(data_dir): +def make_ipar_dir(data_dir, version='1.01'): """ Construct an artificial ipar parameter file and directory """ ipar1_01_file = os.path.join(TESTDATA_DIR, 'IPAR1.01.params') shutil.copy(ipar1_01_file, os.path.join(data_dir, '.params')) - ipar_dir = os.path.join(data_dir, 'IPAR_1.01') + ipar_dir = os.path.join(data_dir, 'IPAR_%s' % (version,)) if not os.path.exists(ipar_dir): os.mkdir(ipar_dir) return ipar_dir @@ -44,6 +44,11 @@ def make_flowcell_id(runfolder_dir, flowcell_id=None): f.write(config) f.close() +def make_bustard_config132(gerald_dir): + source = os.path.join(TESTDATA_DIR, 'bustard-config132.xml') + destination = os.path.join(gerald_dir, 'config.xml') + shutil.copy(source, destination) + def make_matrix(matrix_filename): contents = """# Auto-generated frequency response matrix > A diff --git a/htsworkflow/pipelines/test/test_runfolder_ipar130.py b/htsworkflow/pipelines/test/test_runfolder_ipar130.py index 23e5e9d..efa6fe1 100644 --- a/htsworkflow/pipelines/test/test_runfolder_ipar130.py +++ b/htsworkflow/pipelines/test/test_runfolder_ipar130.py @@ -29,12 +29,13 @@ def make_runfolder(obj=None): data_dir = os.path.join(runfolder_dir, 'Data') os.mkdir(data_dir) - ipar_dir = make_ipar_dir(data_dir) + ipar_dir = make_ipar_dir(data_dir, '1.30') bustard_dir = os.path.join(ipar_dir, 'Bustard1.3.2_15-03-2008_diane') os.mkdir(bustard_dir) make_phasing_params(bustard_dir) + make_bustard_config132(bustard_dir) gerald_dir = os.path.join(bustard_dir, 'GERALD_15-03-2008_diane') @@ -94,6 +95,27 @@ class RunfolderTests(unittest.TestCase): self.failUnlessEqual(b.user, 'diane') self.failUnlessEqual(len(b.phasing), 8) self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + self.failUnlessEqual(b.crosstalk.base.keys(), ['A','C','T','G']) + + self.failUnlessAlmostEqual(b.crosstalk.base['A'][0], 1.27) + self.failUnlessAlmostEqual(b.crosstalk.base['A'][1], 0.20999999999999) + self.failUnlessAlmostEqual(b.crosstalk.base['A'][2], -0.02) + self.failUnlessAlmostEqual(b.crosstalk.base['A'][3], -0.03) + + self.failUnlessAlmostEqual(b.crosstalk.base['C'][0], 0.57) + self.failUnlessAlmostEqual(b.crosstalk.base['C'][1], 0.58) + self.failUnlessAlmostEqual(b.crosstalk.base['C'][2], -0.01) + self.failUnlessAlmostEqual(b.crosstalk.base['C'][3], -0.01) + + self.failUnlessAlmostEqual(b.crosstalk.base['T'][0], -0.02) + self.failUnlessAlmostEqual(b.crosstalk.base['T'][1], -0.02) + self.failUnlessAlmostEqual(b.crosstalk.base['T'][2], 0.80) + self.failUnlessAlmostEqual(b.crosstalk.base['T'][3], 1.07) + + self.failUnlessAlmostEqual(b.crosstalk.base['G'][0], -0.03) + self.failUnlessAlmostEqual(b.crosstalk.base['G'][1], -0.04) + self.failUnlessAlmostEqual(b.crosstalk.base['G'][2], 1.51) + self.failUnlessAlmostEqual(b.crosstalk.base['G'][3], -0.02) xml = b.get_elements() b2 = bustard.Bustard(xml=xml) diff --git a/htsworkflow/pipelines/test/testdata/bustard-config132.xml b/htsworkflow/pipelines/test/testdata/bustard-config132.xml new file mode 100644 index 0000000..2820451 --- /dev/null +++ b/htsworkflow/pipelines/test/testdata/bustard-config132.xml @@ -0,0 +1,129 @@ + + + + + 0.600000 + + 0 + 0 + 2 + 1 + 1 + 37 + 1 + + + A + C + G + T + 1.270000 + 0.210000 + -0.020000 + -0.030000 + 0.570000 + 0.580000 + -0.010000 + -0.010000 + -0.030000 + -0.040000 + 1.510000 + -0.020000 + -0.020000 + -0.020000 + 0.800000 + 1.070000 + + + 1 + 3 + 1 + 0 + 1 + 37 + 1 + + + none + 25 + none + none + gzip + failed-chastity + le + 1.000000 + + + + + 1 + 1 + gzip + .gz + 0 + 0 + 0 + + 1 + 37 + /zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX + + HWI-EAS229 + 0 + 1 + -1 + -1 + + + 1 + 37 + /zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX + + /zfs1/wold-lab/diane/sequencer/090307_HWI-EAS229_0097_30U0BAAXX + 090307 + 97 + + + + + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + + + + diff --git a/scripts/srf b/scripts/srf index 4f487ea..09a2155 100644 --- a/scripts/srf +++ b/scripts/srf @@ -30,15 +30,15 @@ def make_commands(run_name, lanes, site_name, destdir): dest_path = os.path.join(destdir, destname) seq_pattern = 's_%d_*_seq.txt' % (lane,) - #cmd = ['solexa2srf', - # '-N', name_prefix, - # '-n', '%3x:%3y', - # '-o', dest_path, - # seq_pattern] - cmd = ['illumina2srf', - '-v1.0', - '-o', dest_path, + cmd = ['solexa2srf', + '-N', name_prefix, + '-n', '%3x:%3y', + '-o', dest_path, seq_pattern] + #cmd = ['illumina2srf', + # '-v1.0', + # '-o', dest_path, + # seq_pattern] cmd_list.append(" ".join(cmd)) return cmd_list