Move the knowledge of the urls for the REST API to one new file
[htsworkflow.git] / scripts / qseq2fastq
1 #!/usr/bin/env python
2
3 from glob import glob
4 import os
5 from optparse import OptionParser
6 import numpy
7 import sys
8 import tarfile
9
10 class Qseq2Fastq(object):
11     """
12     Convert qseq files to fastq (or fasta) files.
13     """
14     def __init__(self, sources, pass_destination, nopass_destination=None):
15         self.sources = sources
16         self.pass_destination = pass_destination
17         if nopass_destination is not None:
18             self.nopass_destination = nopass_destination
19         else:
20             self.nopass_destination = pass_destination
21         
22         self.fastq = True
23         self.flowcell_id = None
24         self.trim = slice(None)
25         self.reportFilter = False
26
27     def _format_flowcell_id(self):
28         """
29         Return formatted flowcell ID
30         """
31         if self.flowcell_id is not None:
32             return self.flowcell_id+"_"
33         else:
34             return ""
35     
36     def _convert_illumina_quality(self, illumina_quality):
37         """
38         Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
39         """
40         # Illumina scores are Phred + 64
41         # Fastq scores are Phread + 33
42         # the following code grabs the string, converts to short ints and
43         # subtracts 31 (64-33) to convert between the two score formats.
44         # The numpy solution is twice as fast as some of my other
45         # ideas for the conversion.
46         # sorry about the uglyness in changing from character, to 8-bit int
47         # and back to a character array
48         quality = numpy.asarray(illumina_quality,'c')
49         quality.dtype = numpy.uint8
50         quality -= 31
51         quality.dtype = '|S1' # I'd like to know what the real numpy char type is
52         return quality
53         
54     def run(self):
55         if self.fastq:
56             header_template = '@' 
57         else:
58             # fasta case
59             header_template = '>'
60         header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
61         
62         for qstream in self.sources:
63             for line in qstream:
64                 # parse line
65                 record = line.rstrip().split('\t')
66                 machine_name = record[0]
67                 run_number = record[1]
68                 lane_number = record[2]
69                 tile = record[3]
70                 x = record[4]
71                 y = record[5]
72                 index = record[6]
73                 read = record[7]
74                 sequence = record[8].replace('.','N')
75                 quality = self._convert_illumina_quality(record[9])
76
77                 # add pass qc filter if we want it
78                 pass_qc = int(record[10])
79                 if self.reportFilter:
80                     pass_qc_msg = " pf=%s" % (pass_qc)
81                 else:
82                     pass_qc_msg = ""
83
84                 header = header_template  % ( \
85                     machine_name,
86                     run_number,
87                     lane_number,
88                     tile,
89                     x,
90                     y,
91                     read,
92                     pass_qc_msg,
93                     os.linesep)
94
95
96                 # if we passed the filter write to the "good" file
97                 if pass_qc:
98                     destination = self.pass_destination
99                 else:
100                     destination = self.nopass_destination
101                 
102                 destination.write(header)
103                 destination.write(sequence[self.trim])
104                 destination.write(os.linesep)
105                 if self.fastq:
106                     destination.write('+')
107                     destination.write(os.linesep)
108                     destination.write(quality[self.trim].tostring())
109                     destination.write(os.linesep)
110             
111 def file_generator(pattern_list):
112     for pattern in pattern_list:
113         for filename in glob(pattern):
114             yield open(filename,'r')
115
116 def tarfile_generator(tarfilename):
117     archive = tarfile.open(tarfilename,'r|*')
118     for tarinfo in archive:
119         yield archive.extractfile(tarinfo)
120     
121
122 def parse_slice(slice_text):
123     if slice_text is None or len(slice_text) == 0:
124         return slice(None)
125         
126     slice_data = []
127     for element in slice_text.split(':'):
128         if len(element) == 0:
129             element = None
130         else:
131             element = int(element)
132         slice_data.append(element)
133
134     return slice(*slice_data)
135         
136 def make_parser():
137     usage = "%prog: [options] *_qseq.txt"
138     parser = OptionParser(usage)
139     parser.add_option('-a', '--fasta', default=False, action="store_true",
140                       help="produce fasta files instead of fastq files")
141     parser.add_option('-f', '--flowcell', default=None, 
142                       help="Set flowcell ID for output file")
143     parser.add_option('-i', '--infile', default=None,
144       help='source tar file (if reading from an archive instead of a directory)')
145     parser.add_option('-o', '--output', default=None,
146                       help='output fastq file')
147     parser.add_option('-n', '--nopass-output', default=None,
148                       help='if provided send files that failed illumina filter '\
149                            'to a differentfile')
150     parser.add_option('-s', '--slice',
151                       help='specify python slice, e.g. 0:75, 0:-1',
152                       default=None)
153     parser.add_option('--pf', help="report pass filter flag", default=False,
154                       action="store_true")
155     return parser
156
157 def main(cmdline=None):
158     parser = make_parser()
159     opts, args = parser.parse_args(cmdline)
160
161     if opts.infile is not None:
162         qseq_generator = tarfile_generator(opts.infile)
163     elif len(args) > 0:
164         qseq_generator = file_generator(args)
165     else:
166         qseq_generator = [sys.stdin]
167     
168         
169     if opts.output is not None:
170         output = open(opts.output, 'w')
171     else:
172         output = sys.stdout
173
174     if opts.nopass_output is not None:
175         nopass_output = open(opts.nopass_output, 'w')
176     else:
177         nopass_output = None
178
179     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
180     qseq_parser.fastq = not opts.fasta
181     qseq_parser.flowcell_id = opts.flowcell
182     qseq_parser.trim = parse_slice(opts.slice)
183     qseq_parser.reportFilter = opts.pf
184
185     qseq_parser.run()
186
187 if __name__ == "__main__":
188     main()