Extend qseq2fastq to write to two fastq files,
[htsworkflow.git] / scripts / qseq2fastq
1 #!/usr/bin/env python
2
3 from glob import glob
4 import os
5 from optparse import OptionParser
6 import numpy
7 import sys
8 import tarfile
9
10 class Qseq2Fastq(object):
11     def __init__(self, sources, pass_destination, nopass_destination=None):
12         self.sources = sources
13         self.pass_destination = pass_destination
14         if nopass_destination is not None:
15             self.nopass_destination = nopass_destination
16         else:
17             self.nopass_destination = pass_destination
18         
19         self.trim = slice(None)
20         self.reportFilter = False
21
22     def run(self):
23         for qstream in self.sources:
24             for line in qstream:
25                 # parse line
26                 record = line.rstrip().split('\t')
27                 machine_name = record[0]
28                 run_number = record[1]
29                 lane_number = record[2]
30                 tile = record[3]
31                 x = record[4]
32                 y = record[5]
33                 index = record[6]
34                 read = record[7]
35                 sequence = record[8].replace('.','N')
36                 # Illumina scores are Phred + 64
37                 # Fastq scores are Phread + 33
38                 # the following code grabs the string, converts to short ints and
39                 # subtracts 31 (64-33) to convert between the two score formats.
40                 # The numpy solution is twice as fast as some of my other
41                 # ideas for the conversion.
42                 # sorry about the uglyness in changing from character, to 8-bit int
43                 # and back to a character array
44                 quality = numpy.asarray(record[9],'c')
45                 quality.dtype = numpy.uint8
46                 quality -= 31
47                 quality.dtype = '|S1' # I'd like to know what the real numpy char type is
48
49                 # add pass qc filter if we want it
50                 pass_qc = int(record[10])
51                 if self.reportFilter:
52                     pass_qc_msg = " pf=%s" % (pass_qc)
53                 else:
54                     pass_qc_msg = ""
55
56                 header = '@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
57                     machine_name,
58                     run_number,
59                     lane_number,
60                     tile,
61                     x,
62                     y,
63                     read,
64                     pass_qc_msg,
65                     os.linesep)
66
67
68                 # if we passed the filter write to the "good" file
69                 if pass_qc:
70                     destination = self.pass_destination
71                 else:
72                     destination = self.nopass_destination
73                 
74                 destination.write(header)
75                 destination.write(sequence[self.trim])
76                 destination.write(os.linesep)
77                 destination.write('+')
78                 destination.write(os.linesep)
79                 destination.write(quality[self.trim].tostring())
80                 destination.write(os.linesep)
81             
82 def file_generator(pattern_list):
83     for pattern in pattern_list:
84         for filename in glob(pattern):
85             yield open(filename,'r')
86
87 def tarfile_generator(tarfilename):
88     archive = tarfile.open(tarfilename,'r|*')
89     for tarinfo in archive:
90         yield archive.extractfile(tarinfo)
91     
92
93 def parse_slice(slice_text):
94     if slice_text is None or len(slice_text) == 0:
95         return slice(None)
96         
97     slice_data = []
98     for element in slice_text.split(':'):
99         if len(element) == 0:
100             element = None
101         else:
102             element = int(element)
103         slice_data.append(element)
104
105     return slice(*slice_data)
106         
107 def make_parser():
108     usage = "%prog: [options] *_qseq.txt"
109     parser = OptionParser(usage)
110     parser.add_option('-i', '--infile', default=None,
111       help='source tar file (if reading from an archive instead of a directory)')
112     parser.add_option('-o', '--output', default=None,
113                       help='output fastq file')
114     parser.add_option('-n', '--nopass-output', default=None,
115                       help='if provided send files that failed illumina filter '\
116                            'to a differentfile')
117     parser.add_option('-s', '--slice',
118                       help='specify python slice, e.g. 0:75, 0:-1',
119                       default=None)
120     parser.add_option('--pf', help="report pass filter flag", default=False,
121                       action="store_true")
122     return parser
123
124 def main(cmdline=None):
125     parser = make_parser()
126     opts, args = parser.parse_args(cmdline)
127
128     if opts.infile is not None:
129         qseq_generator = tarfile_generator(opts.infile)
130     else:
131         qseq_generator = file_generator(args)
132         
133     if opts.output is not None:
134         output = open(opts.output, 'w')
135     else:
136         output = sys.stdout
137
138     if opts.nopass_output is not None:
139         nopass_output = open(opts.nopass_output, 'w')
140     else:
141         nopass_output = None
142         
143
144     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
145     qseq_parser.trim = parse_slice(opts.slice)
146     qseq_parser.reportFilter = opts.pf
147
148     qseq_parser.run()
149
150 if __name__ == "__main__":
151     main()