From: Diane Trout Date: Thu, 8 Jul 2010 22:15:31 +0000 (+0000) Subject: Update srf2named_fastq to try to detect if the srf file is CNF1 or CNF4 X-Git-Tag: 0.4.4~9 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=4646ec51031de63b65196cb7f967b3fb25c6628c Update srf2named_fastq to try to detect if the srf file is CNF1 or CNF4 and figure out the correct option to pass to srf2fastq. --- diff --git a/scripts/srf2named_fastq.py b/scripts/srf2named_fastq.py index ebf3323..7bfc4da 100755 --- a/scripts/srf2named_fastq.py +++ b/scripts/srf2named_fastq.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import logging +import mmap from optparse import OptionParser import os from subprocess import Popen, PIPE @@ -19,7 +20,7 @@ def main(cmdline=None): opts, args = parser.parse_args(cmdline) if len(args) != 1: - parser.error("Requires one argument") + parser.error("Requires one argument, got: %s" % (str(args))) if opts.verbose: logging.basicConfig(level=logging.INFO) @@ -39,7 +40,7 @@ def main(cmdline=None): # open the srf, fastq, or compressed fastq if is_srf(args[0]): - source = srf_open(args[0], opts.cnf1) + source = srf_open(args[0]) else: source = autoopen(args[0]) @@ -55,9 +56,7 @@ def make_parser(): file can be either a fastq file or a srf file. You can also force the flowcell ID to be added to the header.""") - parser.add_option('-c','--cnf1',default=False, action="store_true", - help="pass -c to srf2fastq, needed for calibrated quality values" - ) + parser.add_option('--force', default=False, action="store_true", help="overwrite existing files.") parser.add_option('--flowcell', default=None, @@ -81,7 +80,7 @@ def srf_open(filename, cnf1=False): """ cmd = ['srf2fastq'] - if cnf1: + if is_cnf1(filename): cmd.append('-c') cmd.append(filename) @@ -185,6 +184,31 @@ def is_srf(filename): f.close() return header == "SSRF" +def is_cnf1(filename): + """ + Brute force detection if a SRF file is using CNF1/CNF4 records + """ + max_header = 1024 ** 2 + PROGRAM_ID = 'PROGRAM_ID\000' + + if not is_srf(filename): + raise ValueError("%s must be a srf file" % (filename,)) + + fd = os.open(filename, os.O_RDONLY) + f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + # alas the max search length requires python 2.6+ + program_id_location = f.find(PROGRAM_ID, 0) #, max_header) + program_header_start = program_id_location+len(PROGRAM_ID) + next_null = f.find('\000', program_header_start) #, max_header) + program_id_header = f[program_header_start:next_null] + f.close() + os.close(fd) + + if program_id_header == "solexa2srf v1.4": + return False + else: + return True + def open_write(filename, force=False): """ Open a file, but throw an exception if it already exists diff --git a/test/test_srf2fastq.py b/test/test_srf2fastq.py index f29cc70..83412c4 100644 --- a/test/test_srf2fastq.py +++ b/test/test_srf2fastq.py @@ -111,6 +111,27 @@ IIIIB+++ self.failUnlessEqual(lines1[2].rstrip(), '+') self.failUnlessEqual(lines1[3].rstrip(), '@IIIB+++') + def test_is_srf(self): + cnf4_srf = 'woldlab_070829_USI-EAS44_0017_FC11055_1.srf' + cnf4_path = os.path.join(_module_path, cnf4_srf) + cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' + cnf1_path = os.path.join(_module_path, cnf1_srf) + + is_srf = srf2named_fastq.is_srf + self.failUnlessEqual(is_srf(__file__), False) + self.failUnlessEqual(is_srf(cnf4_path), True) + self.failUnlessEqual(is_srf(cnf1_path), True) + + def test_is_cnf1(self): + cnf4_srf = 'woldlab_070829_USI-EAS44_0017_FC11055_1.srf' + cnf4_path = os.path.join(_module_path, cnf4_srf) + cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' + cnf1_path = os.path.join(_module_path, cnf1_srf) + + is_cnf1 = srf2named_fastq.is_cnf1 + self.failUnlessRaises(ValueError, is_cnf1, __file__) + self.failUnlessEqual(is_cnf1(cnf4_path), False) + self.failUnlessEqual(is_cnf1(cnf1_path), True) def suite(): return unittest.makeSuite(testSrf2Fastq,'test') diff --git a/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf b/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf new file mode 100644 index 0000000..dd802a7 Binary files /dev/null and b/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf differ diff --git a/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf b/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf new file mode 100644 index 0000000..b5d145f Binary files /dev/null and b/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf differ