Modify qseq2fastq to also read from compressed tar files containing qseq files
[htsworkflow.git] / scripts / qseq2fastq
old mode 100644 (file)
new mode 100755 (executable)
index 56589ee..2f3d7ef
@@ -1,35 +1,15 @@
 #!/usr/bin/env python
 
+from glob import glob
 import os
 from optparse import OptionParser
 import numpy
 import sys
+import tarfile
 
-def make_parser():
-    parser = OptionParser()
-    parser.add_option('-o', '--output', help='output fastq file', default=None)
-    parser.add_option('-s', '--slice',
-                      help='specify python slice, e.g. 0:75, 0:-1',
-                      default=None)
-    return parser
-
-def parse_slice(slice_text):
-    if slice_text is None or len(slice_text) == 0:
-        return slice(None)
-        
-    slice_data = []
-    for element in slice_text.split(':'):
-        if len(element) == 0:
-            element = None
-        else:
-            element = int(element)
-        slice_data.append(element)
-
-    return slice(*slice_data)
-        
-def qseq2fastq(destination, qseqs, trim=None):
-    for q in qseqs:
-        for line in open(q):
+def qseq2fastq(destination, qseqs, trim=None, pf=False):
+    for qstream in qseqs:
+        for line in qstream:
             # parse line
             record = line.strip().split('\t')
             machine_name = record[0]
@@ -40,7 +20,7 @@ def qseq2fastq(destination, qseqs, trim=None):
             y = record[5]
             index = record[6]
             read = record[7]
-            sequence = record[8]
+            sequence = record[8].replace('.','N')
             # Illumina scores are Phred + 64
             # Fastq scores are Phread + 33
             # the following code grabs the string, converts to short ints and
@@ -53,17 +33,22 @@ def qseq2fastq(destination, qseqs, trim=None):
             quality.dtype = numpy.uint8
             quality -= 31
             quality.dtype = '|S1' # I'd like to know what the real numpy char type is
-            
-            pass_qc = record[10]
 
-            destination.write('@%s_%s:%s:%s:%s:%s pf=%s%s' % ( \
+            if pf:
+                pass_qc = record[10]
+                pass_qc_msg = " pf=%s" % (pass_qc)
+            else:
+                pass_qc_msg = ""
+                
+            destination.write('@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
                 machine_name,
                 run_number,
                 lane_number,
                 tile,
                 x,
                 y,
-                pass_qc,
+                read,
+                pass_qc_msg,
                 os.linesep))
             destination.write(sequence[trim])
             destination.write(os.linesep)
@@ -71,11 +56,54 @@ def qseq2fastq(destination, qseqs, trim=None):
             destination.write(os.linesep)
             destination.write(quality[trim].tostring())
             destination.write(os.linesep)
+
+def file_generator(pattern_list):
+    for pattern in pattern_list:
+        for filename in glob(pattern):
+            yield open(filename,'r')
+
+def tarfile_generator(tarfilename):
+    archive = tarfile.open(tarfilename,'r|*')
+    for tarinfo in archive:
+        yield archive.extractfile(tarinfo)
+    
+def make_parser():
+    usage = "%prog: [options] *_qseq.txt"
+    parser = OptionParser(usage)
+    parser.add_option('-i', '--infile', default=None,
+      help='source tar file (if reading from an archive instead of a directory)')
+    parser.add_option('-o', '--output', help='output fastq file', default=None)
+    parser.add_option('-s', '--slice',
+                      help='specify python slice, e.g. 0:75, 0:-1',
+                      default=None)
+    parser.add_option('--pf', help="report pass filter flag", default=False,
+                      action="store_true")
+    return parser
+
+def parse_slice(slice_text):
+    if slice_text is None or len(slice_text) == 0:
+        return slice(None)
         
+    slice_data = []
+    for element in slice_text.split(':'):
+        if len(element) == 0:
+            element = None
+        else:
+            element = int(element)
+        slice_data.append(element)
+
+    return slice(*slice_data)
+        
+
 def main(cmdline=None):
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
+    if opts.infile is not None:
+        qseq_generator = tarfile_generator(opts.infile)
+    else:
+        qseq_generator = file_generator(args)
+        
     if opts.output is not None:
         dest = open(opts.output, 'w')
     else:
@@ -83,7 +111,7 @@ def main(cmdline=None):
 
     subseq = parse_slice(opts.slice)
 
-    qseq2fastq(dest, args, subseq)
+    qseq2fastq(dest, qseq_generator, subseq, opts.pf)
     
     return 0