Update srf2named_fastq to try to detect if the srf file is CNF1 or CNF4
authorDiane Trout <diane@caltech.edu>
Thu, 8 Jul 2010 22:15:31 +0000 (22:15 +0000)
committerDiane Trout <diane@caltech.edu>
Thu, 8 Jul 2010 22:15:31 +0000 (22:15 +0000)
and figure out the correct option to pass to srf2fastq.

scripts/srf2named_fastq.py
test/test_srf2fastq.py
test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf [new file with mode: 0644]
test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf [new file with mode: 0644]

index ebf33232a1c126a0ff1f7d0e7e3ce3d3bb1179fa..7bfc4dac828297a12936f5d5bb41bf330c7a8ed7 100755 (executable)
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import logging
+import mmap
 from optparse import OptionParser
 import os
 from subprocess import Popen, PIPE
@@ -19,7 +20,7 @@ def main(cmdline=None):
     opts, args = parser.parse_args(cmdline)
 
     if len(args) != 1:
-        parser.error("Requires one argument")
+        parser.error("Requires one argument, got: %s" % (str(args)))
 
     if opts.verbose:
         logging.basicConfig(level=logging.INFO)
@@ -39,7 +40,7 @@ def main(cmdline=None):
     
     # open the srf, fastq, or compressed fastq
     if is_srf(args[0]):
-        source = srf_open(args[0], opts.cnf1)
+        source = srf_open(args[0])
     else:
         source = autoopen(args[0])
 
@@ -55,9 +56,7 @@ def make_parser():
 
 file can be either a fastq file or a srf file.
 You can also force the flowcell ID to be added to the header.""")
-    parser.add_option('-c','--cnf1',default=False, action="store_true",
-      help="pass -c to srf2fastq, needed for calibrated quality values"
-    )
+
     parser.add_option('--force', default=False, action="store_true",
                       help="overwrite existing files.")
     parser.add_option('--flowcell', default=None,
@@ -81,7 +80,7 @@ def srf_open(filename, cnf1=False):
     """
     
     cmd = ['srf2fastq']
-    if cnf1:
+    if is_cnf1(filename):
         cmd.append('-c')
     cmd.append(filename)
       
@@ -185,6 +184,31 @@ def is_srf(filename):
     f.close()
     return header == "SSRF"
 
+def is_cnf1(filename):
+    """
+    Brute force detection if a SRF file is using CNF1/CNF4 records
+    """
+    max_header = 1024 ** 2
+    PROGRAM_ID = 'PROGRAM_ID\000'
+
+    if not is_srf(filename):
+        raise ValueError("%s must be a srf file" % (filename,))
+
+    fd = os.open(filename, os.O_RDONLY)
+    f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+    # alas the max search length requires python 2.6+
+    program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
+    program_header_start = program_id_location+len(PROGRAM_ID)
+    next_null = f.find('\000', program_header_start) #, max_header)
+    program_id_header = f[program_header_start:next_null]
+    f.close()
+    os.close(fd)
+
+    if program_id_header == "solexa2srf v1.4":
+        return False
+    else:
+        return True
+
 def open_write(filename, force=False):
     """
     Open a file, but throw an exception if it already exists
index f29cc7048ce205fe52c5e841b8bf2d1dfd46bfc0..83412c4b766e4a3a96688263d4b3a23fa72155f0 100644 (file)
@@ -111,6 +111,27 @@ IIIIB+++
         self.failUnlessEqual(lines1[2].rstrip(), '+')
         self.failUnlessEqual(lines1[3].rstrip(), '@IIIB+++')
 
+    def test_is_srf(self):        
+        cnf4_srf = 'woldlab_070829_USI-EAS44_0017_FC11055_1.srf'
+        cnf4_path = os.path.join(_module_path, cnf4_srf)
+        cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
+        cnf1_path = os.path.join(_module_path, cnf1_srf)
+        
+        is_srf = srf2named_fastq.is_srf
+        self.failUnlessEqual(is_srf(__file__), False)
+        self.failUnlessEqual(is_srf(cnf4_path), True)
+        self.failUnlessEqual(is_srf(cnf1_path), True)
+
+    def test_is_cnf1(self):        
+        cnf4_srf = 'woldlab_070829_USI-EAS44_0017_FC11055_1.srf'
+        cnf4_path = os.path.join(_module_path, cnf4_srf)
+        cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
+        cnf1_path = os.path.join(_module_path, cnf1_srf)
+        
+        is_cnf1 = srf2named_fastq.is_cnf1
+        self.failUnlessRaises(ValueError, is_cnf1, __file__)
+        self.failUnlessEqual(is_cnf1(cnf4_path), False)
+        self.failUnlessEqual(is_cnf1(cnf1_path), True)
 
 def suite():
     return unittest.makeSuite(testSrf2Fastq,'test')
diff --git a/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf b/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf
new file mode 100644 (file)
index 0000000..dd802a7
Binary files /dev/null and b/test/woldlab_070829_USI-EAS44_0017_FC11055_1.srf differ
diff --git a/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf b/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf
new file mode 100644 (file)
index 0000000..b5d145f
Binary files /dev/null and b/test/woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf differ