Add a simple utility to convert qseq to fastq files.
authorDiane Trout <diane@caltech.edu>
Tue, 1 Dec 2009 01:41:40 +0000 (01:41 +0000)
committerDiane Trout <diane@caltech.edu>
Tue, 1 Dec 2009 01:41:40 +0000 (01:41 +0000)
It'll probably morph into a more complex utility in the near future.

scripts/qseq2fastq [new file with mode: 0644]

diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq
new file mode 100644 (file)
index 0000000..56589ee
--- /dev/null
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+import os
+from optparse import OptionParser
+import numpy
+import sys
+
+def make_parser():
+    parser = OptionParser()
+    parser.add_option('-o', '--output', help='output fastq file', default=None)
+    parser.add_option('-s', '--slice',
+                      help='specify python slice, e.g. 0:75, 0:-1',
+                      default=None)
+    return parser
+
+def parse_slice(slice_text):
+    if slice_text is None or len(slice_text) == 0:
+        return slice(None)
+        
+    slice_data = []
+    for element in slice_text.split(':'):
+        if len(element) == 0:
+            element = None
+        else:
+            element = int(element)
+        slice_data.append(element)
+
+    return slice(*slice_data)
+        
+def qseq2fastq(destination, qseqs, trim=None):
+    for q in qseqs:
+        for line in open(q):
+            # parse line
+            record = line.strip().split('\t')
+            machine_name = record[0]
+            run_number = record[1]
+            lane_number = record[2]
+            tile = record[3]
+            x = record[4]
+            y = record[5]
+            index = record[6]
+            read = record[7]
+            sequence = record[8]
+            # Illumina scores are Phred + 64
+            # Fastq scores are Phread + 33
+            # the following code grabs the string, converts to short ints and
+            # subtracts 31 (64-33) to convert between the two score formats.
+            # The numpy solution is twice as fast as some of my other
+            # ideas for the conversion.
+            # sorry about the uglyness in changing from character, to 8-bit int
+            # and back to a character array
+            quality = numpy.asarray(record[9],'c')
+            quality.dtype = numpy.uint8
+            quality -= 31
+            quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+            
+            pass_qc = record[10]
+
+            destination.write('@%s_%s:%s:%s:%s:%s pf=%s%s' % ( \
+                machine_name,
+                run_number,
+                lane_number,
+                tile,
+                x,
+                y,
+                pass_qc,
+                os.linesep))
+            destination.write(sequence[trim])
+            destination.write(os.linesep)
+            destination.write('+')
+            destination.write(os.linesep)
+            destination.write(quality[trim].tostring())
+            destination.write(os.linesep)
+        
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.output is not None:
+        dest = open(opts.output, 'w')
+    else:
+        dest = sys.stdout
+
+    subseq = parse_slice(opts.slice)
+
+    qseq2fastq(dest, args, subseq)
+    
+    return 0
+
+if __name__ == "__main__":
+    main()