Add support for generating fasta files in addition to fastq files

[htsworkflow.git] / scripts / qseq2fastq
diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq

old mode 100644 (file)

new mode 100755 (executable)

index 4262898..6967b1e
--- a/scripts/qseq2fastq
+++ b/scripts/qseq2fastq
@@ -1,17 +1,120 @@
  #!/usr/bin/env python
  
+from glob import glob
  import os
  from optparse import OptionParser
  import numpy
  import sys
+import tarfile
  
-def make_parser():
-    parser = OptionParser()
-    parser.add_option('-o', '--output', help='output fastq file', default=None)
-    parser.add_option('-s', '--slice',
-                      help='specify python slice, e.g. 0:75, 0:-1',
-                      default=None)
-    return parser
+class Qseq2Fastq(object):
+    """
+    Convert qseq files to fastq (or fasta) files.
+    """
+    def __init__(self, sources, pass_destination, nopass_destination=None):
+        self.sources = sources
+        self.pass_destination = pass_destination
+        if nopass_destination is not None:
+            self.nopass_destination = nopass_destination
+        else:
+            self.nopass_destination = pass_destination
+        
+        self.fastq = True
+        self.flowcell_id = ''
+        self.trim = slice(None)
+        self.reportFilter = False
+
+    def _format_flowcell_id(self):
+        """
+        Return formatted flowcell ID
+        """
+        return self.flowcell_id+"_"
+    
+    def _convert_illumina_quality(self, illumina_quality):
+        """
+        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
+        """
+        # Illumina scores are Phred + 64
+        # Fastq scores are Phread + 33
+        # the following code grabs the string, converts to short ints and
+        # subtracts 31 (64-33) to convert between the two score formats.
+        # The numpy solution is twice as fast as some of my other
+        # ideas for the conversion.
+        # sorry about the uglyness in changing from character, to 8-bit int
+        # and back to a character array
+        quality = numpy.asarray(illumina_quality,'c')
+        quality.dtype = numpy.uint8
+        quality -= 31
+        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+        return quality
+        
+    def run(self):
+        if self.fastq:
+            header_template = '@' 
+        else:
+            # fasta case
+            header_template = '>'
+        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
+        
+        for qstream in self.sources:
+            for line in qstream:
+                # parse line
+                record = line.rstrip().split('\t')
+                machine_name = record[0]
+                run_number = record[1]
+                lane_number = record[2]
+                tile = record[3]
+                x = record[4]
+                y = record[5]
+                index = record[6]
+                read = record[7]
+                sequence = record[8].replace('.','N')
+                quality = self._convert_illumina_quality(record[9])
+
+                # add pass qc filter if we want it
+                pass_qc = int(record[10])
+                if self.reportFilter:
+                    pass_qc_msg = " pf=%s" % (pass_qc)
+                else:
+                    pass_qc_msg = ""
+
+                header = header_template  % ( \
+                    machine_name,
+                    run_number,
+                    lane_number,
+                    tile,
+                    x,
+                    y,
+                    read,
+                    pass_qc_msg,
+                    os.linesep)
+
+
+                # if we passed the filter write to the "good" file
+                if pass_qc:
+                    destination = self.pass_destination
+                else:
+                    destination = self.nopass_destination
+                
+                destination.write(header)
+                destination.write(sequence[self.trim])
+                destination.write(os.linesep)
+                if self.fastq:
+                    destination.write('+')
+                    destination.write(os.linesep)
+                    destination.write(quality[self.trim].tostring())
+                    destination.write(os.linesep)
+            
+def file_generator(pattern_list):
+    for pattern in pattern_list:
+        for filename in glob(pattern):
+            yield open(filename,'r')
+
+def tarfile_generator(tarfilename):
+    archive = tarfile.open(tarfilename,'r|*')
+    for tarinfo in archive:
+        yield archive.extractfile(tarinfo)
+    
  
  def parse_slice(slice_text):
      if slice_text is None or len(slice_text) == 0:
@@ -27,66 +130,56 @@ def parse_slice(slice_text):
  
      return slice(*slice_data)
          
-def qseq2fastq(destination, qseqs, trim=None):
-    for q in qseqs:
-        for line in open(q):
-            # parse line
-            record = line.strip().split('\t')
-            machine_name = record[0]
-            run_number = record[1]
-            lane_number = record[2]
-            tile = record[3]
-            x = record[4]
-            y = record[5]
-            index = record[6]
-            read = record[7]
-            sequence = record[8]
-            # Illumina scores are Phred + 64
-            # Fastq scores are Phread + 33
-            # the following code grabs the string, converts to short ints and
-            # subtracts 31 (64-33) to convert between the two score formats.
-            # The numpy solution is twice as fast as some of my other
-            # ideas for the conversion.
-            # sorry about the uglyness in changing from character, to 8-bit int
-            # and back to a character array
-            quality = numpy.asarray(record[9],'c')
-            quality.dtype = numpy.uint8
-            quality -= 31
-            quality.dtype = '|S1' # I'd like to know what the real numpy char type is
-            
-            pass_qc = record[10]
-
-            destination.write('@%s_%s:%s:%s:%s:%s/%s pf=%s%s' % ( \
-                machine_name,
-                run_number,
-                lane_number,
-                tile,
-                x,
-                y,
-                read,
-                pass_qc,
-                os.linesep))
-            destination.write(sequence[trim])
-            destination.write(os.linesep)
-            destination.write('+')
-            destination.write(os.linesep)
-            destination.write(quality[trim].tostring())
-            destination.write(os.linesep)
-        
+def make_parser():
+    usage = "%prog: [options] *_qseq.txt"
+    parser = OptionParser(usage)
+    parser.add_option('-a', '--fasta', default=False, action="store_true",
+                      help="produce fasta files instead of fastq files")
+    parser.add_option('-f', '--flowcell', default='', 
+                      help="Set flowcell ID for output file")
+    parser.add_option('-i', '--infile', default=None,
+      help='source tar file (if reading from an archive instead of a directory)')
+    parser.add_option('-o', '--output', default=None,
+                      help='output fastq file')
+    parser.add_option('-n', '--nopass-output', default=None,
+                      help='if provided send files that failed illumina filter '\
+                           'to a differentfile')
+    parser.add_option('-s', '--slice',
+                      help='specify python slice, e.g. 0:75, 0:-1',
+                      default=None)
+    parser.add_option('--pf', help="report pass filter flag", default=False,
+                      action="store_true")
+    return parser
+
  def main(cmdline=None):
      parser = make_parser()
      opts, args = parser.parse_args(cmdline)
  
+    if opts.infile is not None:
+        qseq_generator = tarfile_generator(opts.infile)
+    elif len(args) > 0:
+        qseq_generator = file_generator(args)
+    else:
+        qseq_generator = [sys.stdin]
+    
+        
      if opts.output is not None:
-        dest = open(opts.output, 'w')
+        output = open(opts.output, 'w')
+    else:
+        output = sys.stdout
+
+    if opts.nopass_output is not None:
+        nopass_output = open(opts.nopass_output, 'w')
      else:
-        dest = sys.stdout
+        nopass_output = None
  
-    subseq = parse_slice(opts.slice)
+    qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
+    qseq_parser.fastq = not opts.fasta
+    qseq_parser.flowcell_id = opts.flowcell
+    qseq_parser.trim = parse_slice(opts.slice)
+    qseq_parser.reportFilter = opts.pf
  
-    qseq2fastq(dest, args, subseq)
-    
-    return 0
+    qseq_parser.run()
  
  if __name__ == "__main__":
      main()