Add support for generating fasta files in addition to fastq files

[htsworkflow.git] / scripts / qseq2fastq
diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq

index 3911998bd104752038076d4613e9ab322ccbcf5c..6967b1e25b3b9bb17b9695848fd52ab9940b228c 100755 (executable)
--- a/scripts/qseq2fastq
+++ b/scripts/qseq2fastq
@@ -8,6 +8,9 @@ import sys
  import tarfile
  
  class Qseq2Fastq(object):
+    """
+    Convert qseq files to fastq (or fasta) files.
+    """
      def __init__(self, sources, pass_destination, nopass_destination=None):
          self.sources = sources
          self.pass_destination = pass_destination
@@ -16,10 +19,43 @@ class Qseq2Fastq(object):
          else:
              self.nopass_destination = pass_destination
          
+        self.fastq = True
+        self.flowcell_id = ''
          self.trim = slice(None)
          self.reportFilter = False
  
+    def _format_flowcell_id(self):
+        """
+        Return formatted flowcell ID
+        """
+        return self.flowcell_id+"_"
+    
+    def _convert_illumina_quality(self, illumina_quality):
+        """
+        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
+        """
+        # Illumina scores are Phred + 64
+        # Fastq scores are Phread + 33
+        # the following code grabs the string, converts to short ints and
+        # subtracts 31 (64-33) to convert between the two score formats.
+        # The numpy solution is twice as fast as some of my other
+        # ideas for the conversion.
+        # sorry about the uglyness in changing from character, to 8-bit int
+        # and back to a character array
+        quality = numpy.asarray(illumina_quality,'c')
+        quality.dtype = numpy.uint8
+        quality -= 31
+        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+        return quality
+        
      def run(self):
+        if self.fastq:
+            header_template = '@' 
+        else:
+            # fasta case
+            header_template = '>'
+        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
+        
          for qstream in self.sources:
              for line in qstream:
                  # parse line
@@ -33,18 +69,7 @@ class Qseq2Fastq(object):
                  index = record[6]
                  read = record[7]
                  sequence = record[8].replace('.','N')
-                # Illumina scores are Phred + 64
-                # Fastq scores are Phread + 33
-                # the following code grabs the string, converts to short ints and
-                # subtracts 31 (64-33) to convert between the two score formats.
-                # The numpy solution is twice as fast as some of my other
-                # ideas for the conversion.
-                # sorry about the uglyness in changing from character, to 8-bit int
-                # and back to a character array
-                quality = numpy.asarray(record[9],'c')
-                quality.dtype = numpy.uint8
-                quality -= 31
-                quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+                quality = self._convert_illumina_quality(record[9])
  
                  # add pass qc filter if we want it
                  pass_qc = int(record[10])
@@ -53,7 +78,7 @@ class Qseq2Fastq(object):
                  else:
                      pass_qc_msg = ""
  
-                header = '@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
+                header = header_template  % ( \
                      machine_name,
                      run_number,
                      lane_number,
@@ -74,10 +99,11 @@ class Qseq2Fastq(object):
                  destination.write(header)
                  destination.write(sequence[self.trim])
                  destination.write(os.linesep)
-                destination.write('+')
-                destination.write(os.linesep)
-                destination.write(quality[self.trim].tostring())
-                destination.write(os.linesep)
+                if self.fastq:
+                    destination.write('+')
+                    destination.write(os.linesep)
+                    destination.write(quality[self.trim].tostring())
+                    destination.write(os.linesep)
              
  def file_generator(pattern_list):
      for pattern in pattern_list:
@@ -107,6 +133,10 @@ def parse_slice(slice_text):
  def make_parser():
      usage = "%prog: [options] *_qseq.txt"
      parser = OptionParser(usage)
+    parser.add_option('-a', '--fasta', default=False, action="store_true",
+                      help="produce fasta files instead of fastq files")
+    parser.add_option('-f', '--flowcell', default='', 
+                      help="Set flowcell ID for output file")
      parser.add_option('-i', '--infile', default=None,
        help='source tar file (if reading from an archive instead of a directory)')
      parser.add_option('-o', '--output', default=None,
@@ -127,8 +157,11 @@ def main(cmdline=None):
  
      if opts.infile is not None:
          qseq_generator = tarfile_generator(opts.infile)
-    else:
+    elif len(args) > 0:
          qseq_generator = file_generator(args)
+    else:
+        qseq_generator = [sys.stdin]
+    
          
      if opts.output is not None:
          output = open(opts.output, 'w')
@@ -139,9 +172,10 @@ def main(cmdline=None):
          nopass_output = open(opts.nopass_output, 'w')
      else:
          nopass_output = None
-        
  
      qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
+    qseq_parser.fastq = not opts.fasta
+    qseq_parser.flowcell_id = opts.flowcell
      qseq_parser.trim = parse_slice(opts.slice)
      qseq_parser.reportFilter = opts.pf