Add support for generating fasta files in addition to fastq files
[htsworkflow.git] / scripts / qseq2fastq
1 #!/usr/bin/env python
2
3 from glob import glob
4 import os
5 from optparse import OptionParser
6 import numpy
7 import sys
8 import tarfile
9
10 class Qseq2Fastq(object):
11     """
12     Convert qseq files to fastq (or fasta) files.
13     """
14     def __init__(self, sources, pass_destination, nopass_destination=None):
15         self.sources = sources
16         self.pass_destination = pass_destination
17         if nopass_destination is not None:
18             self.nopass_destination = nopass_destination
19         else:
20             self.nopass_destination = pass_destination
21         
22         self.fastq = True
23         self.flowcell_id = ''
24         self.trim = slice(None)
25         self.reportFilter = False
26
27     def _format_flowcell_id(self):
28         """
29         Return formatted flowcell ID
30         """
31         return self.flowcell_id+"_"
32     
33     def _convert_illumina_quality(self, illumina_quality):
34         """
35         Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
36         """
37         # Illumina scores are Phred + 64
38         # Fastq scores are Phread + 33
39         # the following code grabs the string, converts to short ints and
40         # subtracts 31 (64-33) to convert between the two score formats.
41         # The numpy solution is twice as fast as some of my other
42         # ideas for the conversion.
43         # sorry about the uglyness in changing from character, to 8-bit int
44         # and back to a character array
45         quality = numpy.asarray(illumina_quality,'c')
46         quality.dtype = numpy.uint8
47         quality -= 31
48         quality.dtype = '|S1' # I'd like to know what the real numpy char type is
49         return quality
50         
51     def run(self):
52         if self.fastq:
53             header_template = '@' 
54         else:
55             # fasta case
56             header_template = '>'
57         header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
58         
59         for qstream in self.sources:
60             for line in qstream:
61                 # parse line
62                 record = line.rstrip().split('\t')
63                 machine_name = record[0]
64                 run_number = record[1]
65                 lane_number = record[2]
66                 tile = record[3]
67                 x = record[4]
68                 y = record[5]
69                 index = record[6]
70                 read = record[7]
71                 sequence = record[8].replace('.','N')
72                 quality = self._convert_illumina_quality(record[9])
73
74                 # add pass qc filter if we want it
75                 pass_qc = int(record[10])
76                 if self.reportFilter:
77                     pass_qc_msg = " pf=%s" % (pass_qc)
78                 else:
79                     pass_qc_msg = ""
80
81                 header = header_template  % ( \
82                     machine_name,
83                     run_number,
84                     lane_number,
85                     tile,
86                     x,
87                     y,
88                     read,
89                     pass_qc_msg,
90                     os.linesep)
91
92
93                 # if we passed the filter write to the "good" file
94                 if pass_qc:
95                     destination = self.pass_destination
96                 else:
97                     destination = self.nopass_destination
98                 
99                 destination.write(header)
100                 destination.write(sequence[self.trim])
101                 destination.write(os.linesep)
102                 if self.fastq:
103                     destination.write('+')
104                     destination.write(os.linesep)
105                     destination.write(quality[self.trim].tostring())
106                     destination.write(os.linesep)
107             
108 def file_generator(pattern_list):
109     for pattern in pattern_list:
110         for filename in glob(pattern):
111             yield open(filename,'r')
112
113 def tarfile_generator(tarfilename):
114     archive = tarfile.open(tarfilename,'r|*')
115     for tarinfo in archive:
116         yield archive.extractfile(tarinfo)
117     
118
119 def parse_slice(slice_text):
120     if slice_text is None or len(slice_text) == 0:
121         return slice(None)
122         
123     slice_data = []
124     for element in slice_text.split(':'):
125         if len(element) == 0:
126             element = None
127         else:
128             element = int(element)
129         slice_data.append(element)
130
131     return slice(*slice_data)
132         
133 def make_parser():
134     usage = "%prog: [options] *_qseq.txt"
135     parser = OptionParser(usage)
136     parser.add_option('-a', '--fasta', default=False, action="store_true",
137                       help="produce fasta files instead of fastq files")
138     parser.add_option('-f', '--flowcell', default='', 
139                       help="Set flowcell ID for output file")
140     parser.add_option('-i', '--infile', default=None,
141       help='source tar file (if reading from an archive instead of a directory)')
142     parser.add_option('-o', '--output', default=None,
143                       help='output fastq file')
144     parser.add_option('-n', '--nopass-output', default=None,
145                       help='if provided send files that failed illumina filter '\
146                            'to a differentfile')
147     parser.add_option('-s', '--slice',
148                       help='specify python slice, e.g. 0:75, 0:-1',
149                       default=None)
150     parser.add_option('--pf', help="report pass filter flag", default=False,
151                       action="store_true")
152     return parser
153
154 def main(cmdline=None):
155     parser = make_parser()
156     opts, args = parser.parse_args(cmdline)
157
158     if opts.infile is not None:
159         qseq_generator = tarfile_generator(opts.infile)
160     elif len(args) > 0:
161         qseq_generator = file_generator(args)
162     else:
163         qseq_generator = [sys.stdin]
164     
165         
166     if opts.output is not None:
167         output = open(opts.output, 'w')
168     else:
169         output = sys.stdout
170
171     if opts.nopass_output is not None:
172         nopass_output = open(opts.nopass_output, 'w')
173     else:
174         nopass_output = None
175
176     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
177     qseq_parser.fastq = not opts.fasta
178     qseq_parser.flowcell_id = opts.flowcell
179     qseq_parser.trim = parse_slice(opts.slice)
180     qseq_parser.reportFilter = opts.pf
181
182     qseq_parser.run()
183
184 if __name__ == "__main__":
185     main()