scripts/qseq2fastq

   1 #!/usr/bin/env python
   2
   3 from glob import glob
   4 import os
   5 from optparse import OptionParser
   6 import numpy
   7 import sys
   8 import tarfile
   9
  10 class Qseq2Fastq(object):
  11     def __init__(self, sources, pass_destination, nopass_destination=None):
  12         self.sources = sources
  13         self.pass_destination = pass_destination
  14         if nopass_destination is not None:
  15             self.nopass_destination = nopass_destination
  16         else:
  17             self.nopass_destination = pass_destination
  18
  19         self.trim = slice(None)
  20         self.reportFilter = False
  21
  22     def run(self):
  23         for qstream in self.sources:
  24             for line in qstream:
  25                 # parse line
  26                 record = line.rstrip().split('\t')
  27                 machine_name = record[0]
  28                 run_number = record[1]
  29                 lane_number = record[2]
  30                 tile = record[3]
  31                 x = record[4]
  32                 y = record[5]
  33                 index = record[6]
  34                 read = record[7]
  35                 sequence = record[8].replace('.','N')
  36                 # Illumina scores are Phred + 64
  37                 # Fastq scores are Phread + 33
  38                 # the following code grabs the string, converts to short ints and
  39                 # subtracts 31 (64-33) to convert between the two score formats.
  40                 # The numpy solution is twice as fast as some of my other
  41                 # ideas for the conversion.
  42                 # sorry about the uglyness in changing from character, to 8-bit int
  43                 # and back to a character array
  44                 quality = numpy.asarray(record[9],'c')
  45                 quality.dtype = numpy.uint8
  46                 quality -= 31
  47                 quality.dtype = '|S1' # I'd like to know what the real numpy char type is
  48
  49                 # add pass qc filter if we want it
  50                 pass_qc = int(record[10])
  51                 if self.reportFilter:
  52                     pass_qc_msg = " pf=%s" % (pass_qc)
  53                 else:
  54                     pass_qc_msg = ""
  55
  56                 header = '@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
  57                     machine_name,
  58                     run_number,
  59                     lane_number,
  60                     tile,
  61                     x,
  62                     y,
  63                     read,
  64                     pass_qc_msg,
  65                     os.linesep)
  66
  67
  68                 # if we passed the filter write to the "good" file
  69                 if pass_qc:
  70                     destination = self.pass_destination
  71                 else:
  72                     destination = self.nopass_destination
  73
  74                 destination.write(header)
  75                 destination.write(sequence[self.trim])
  76                 destination.write(os.linesep)
  77                 destination.write('+')
  78                 destination.write(os.linesep)
  79                 destination.write(quality[self.trim].tostring())
  80                 destination.write(os.linesep)
  81
  82 def file_generator(pattern_list):
  83     for pattern in pattern_list:
  84         for filename in glob(pattern):
  85             yield open(filename,'r')
  86
  87 def tarfile_generator(tarfilename):
  88     archive = tarfile.open(tarfilename,'r|*')
  89     for tarinfo in archive:
  90         yield archive.extractfile(tarinfo)
  91
  92
  93 def parse_slice(slice_text):
  94     if slice_text is None or len(slice_text) == 0:
  95         return slice(None)
  96
  97     slice_data = []
  98     for element in slice_text.split(':'):
  99         if len(element) == 0:
 100             element = None
 101         else:
 102             element = int(element)
 103         slice_data.append(element)
 104
 105     return slice(*slice_data)
 106
 107 def make_parser():
 108     usage = "%prog: [options] *_qseq.txt"
 109     parser = OptionParser(usage)
 110     parser.add_option('-i', '--infile', default=None,
 111       help='source tar file (if reading from an archive instead of a directory)')
 112     parser.add_option('-o', '--output', default=None,
 113                       help='output fastq file')
 114     parser.add_option('-n', '--nopass-output', default=None,
 115                       help='if provided send files that failed illumina filter '\
 116                            'to a differentfile')
 117     parser.add_option('-s', '--slice',
 118                       help='specify python slice, e.g. 0:75, 0:-1',
 119                       default=None)
 120     parser.add_option('--pf', help="report pass filter flag", default=False,
 121                       action="store_true")
 122     return parser
 123
 124 def main(cmdline=None):
 125     parser = make_parser()
 126     opts, args = parser.parse_args(cmdline)
 127
 128     if opts.infile is not None:
 129         qseq_generator = tarfile_generator(opts.infile)
 130     else:
 131         qseq_generator = file_generator(args)
 132
 133     if opts.output is not None:
 134         output = open(opts.output, 'w')
 135     else:
 136         output = sys.stdout
 137
 138     if opts.nopass_output is not None:
 139         nopass_output = open(opts.nopass_output, 'w')
 140     else:
 141         nopass_output = None
 142
 143
 144     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
 145     qseq_parser.trim = parse_slice(opts.slice)
 146     qseq_parser.reportFilter = opts.pf
 147
 148     qseq_parser.run()
 149
 150 if __name__ == "__main__":
 151     main()