Add documentation about which Phred format we're generating
[htsworkflow.git] / htsworkflow / pipelines / qseq2fastq.py
1 #!/usr/bin/env python
2 from glob import glob
3 import os
4 from optparse import OptionParser
5 import numpy
6 import sys
7 import tarfile
8
9 from htsworkflow.version import version
10
11 def main(cmdline=None):
12     parser = make_parser()
13     opts, args = parser.parse_args(cmdline)
14
15     if opts.version:
16         print version()
17         return 0
18     
19     if opts.infile is not None:
20         qseq_generator = tarfile_generator(opts.infile)
21     elif len(args) > 0:
22         qseq_generator = file_generator(args)
23     else:
24         qseq_generator = [sys.stdin]
25             
26     if opts.output is not None:
27         output = open(opts.output, 'w')
28     else:
29         output = sys.stdout
30
31     if opts.nopass_output is not None:
32         nopass_output = open(opts.nopass_output, 'w')
33     else:
34         nopass_output = None
35
36     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
37     qseq_parser.fastq = not opts.fasta
38     qseq_parser.flowcell_id = opts.flowcell
39     qseq_parser.trim = parse_slice(opts.slice)
40     qseq_parser.reportFilter = opts.pf
41
42     qseq_parser.run()
43
44
45 def make_parser():
46     usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
47     parser = OptionParser(usage)
48     parser.add_option("-a", "--fasta", default=False, action="store_true",
49                       help="produce fasta files instead of fastq files")
50     parser.add_option("-f", "--flowcell", default=None, 
51                       help="Set flowcell ID for output file")
52     parser.add_option("-i", "--infile", default=None,
53       help='source tar file (if reading from an archive instead of a directory)')
54     parser.add_option("-o", "--output", default=None,
55                       help="output fastq file")
56     parser.add_option("-n", "--nopass-output", default=None,
57                       help="if provided send files that failed illumina filter "\
58                            "to a differentfile")
59     parser.add_option("-s", "--slice",
60                       help="specify python slice, e.g. 0:75, 0:-1",
61                       default=None)
62     parser.add_option("--pf", help="report pass filter flag", default=False,
63                       action="store_true")
64     parser.add_option("--version", default=False, action="store_true",
65                       help="report software version")
66     
67     return parser
68
69             
70 def file_generator(pattern_list):
71     for pattern in pattern_list:
72         for filename in glob(pattern):
73             yield open(filename,"r")
74
75
76 def tarfile_generator(tarfilename):
77     archive = tarfile.open(tarfilename,'r|*')
78     for tarinfo in archive:
79         yield archive.extractfile(tarinfo)
80
81
82 def parse_slice(slice_text):
83     if slice_text is None or len(slice_text) == 0:
84         return slice(None)
85         
86     slice_data = []
87     for element in slice_text.split(':'):
88         if len(element) == 0:
89             element = None
90         else:
91             element = int(element)
92         slice_data.append(element)
93
94     return slice(*slice_data)
95
96             
97 def file_generator(pattern_list):
98     for pattern in pattern_list:
99         for filename in glob(pattern):
100             yield open(filename,"r")
101
102
103 def tarfile_generator(tarfilename):
104     archive = tarfile.open(tarfilename,'r|*')
105     for tarinfo in archive:
106         yield archive.extractfile(tarinfo)
107
108
109 def parse_slice(slice_text):
110     if slice_text is None or len(slice_text) == 0:
111         return slice(None)
112         
113     slice_data = []
114     for element in slice_text.split(':'):
115         if len(element) == 0:
116             element = None
117         else:
118             element = int(element)
119         slice_data.append(element)
120
121     return slice(*slice_data)
122
123
124 class Qseq2Fastq(object):
125     """
126     Convert qseq files to fastq (or fasta) files.
127     """
128     def __init__(self, sources, pass_destination, nopass_destination=None):
129         self.sources = sources
130         self.pass_destination = pass_destination
131         if nopass_destination is not None:
132             self.nopass_destination = nopass_destination
133         else:
134             self.nopass_destination = pass_destination
135         
136         self.fastq = True
137         self.flowcell_id = None
138         self.trim = slice(None)
139         self.reportFilter = False
140
141     def _format_flowcell_id(self):
142         """
143         Return formatted flowcell ID
144         """
145         if self.flowcell_id is not None:
146             return self.flowcell_id+"_"
147         else:
148             return ""
149     
150     def _convert_illumina_quality(self, illumina_quality):
151         """
152         Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
153         """
154         # Illumina scores are Phred + 64
155         # Fastq scores are Phread + 33
156         # the following code grabs the string, converts to short ints and
157         # subtracts 31 (64-33) to convert between the two score formats.
158         # The numpy solution is twice as fast as some of my other
159         # ideas for the conversion.
160         # sorry about the uglyness in changing from character, to 8-bit int
161         # and back to a character array
162         quality = numpy.asarray(illumina_quality,'c')
163         quality.dtype = numpy.uint8
164         quality -= 31
165         quality.dtype = '|S1' # I'd like to know what the real numpy char type is
166         return quality
167         
168     def run(self):
169         if self.fastq:
170             header_template = '@' 
171         else:
172             # fasta case
173             header_template = '>'
174         header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
175         
176         for qstream in self.sources:
177             for line in qstream:
178                 # parse line
179                 record = line.rstrip().split('\t')
180                 machine_name = record[0]
181                 run_number = record[1]
182                 lane_number = record[2]
183                 tile = record[3]
184                 x = record[4]
185                 y = record[5]
186                 index = record[6]
187                 read = record[7]
188                 sequence = record[8].replace('.','N')
189                 quality = self._convert_illumina_quality(record[9])
190
191                 # add pass qc filter if we want it
192                 pass_qc = int(record[10])
193                 if self.reportFilter:
194                     pass_qc_msg = " pf=%s" % (pass_qc)
195                 else:
196                     pass_qc_msg = ""
197
198                 header = header_template  % ( \
199                     machine_name,
200                     run_number,
201                     lane_number,
202                     tile,
203                     x,
204                     y,
205                     read,
206                     pass_qc_msg,
207                     os.linesep)
208
209
210                 # if we passed the filter write to the "good" file
211                 if pass_qc:
212                     destination = self.pass_destination
213                 else:
214                     destination = self.nopass_destination
215                 
216                 destination.write(header)
217                 destination.write(sequence[self.trim])
218                 destination.write(os.linesep)
219                 if self.fastq:
220                     destination.write('+')
221                     destination.write(os.linesep)
222                     destination.write(quality[self.trim].tostring())
223                     destination.write(os.linesep)
224
225 if __name__ == "__main__":
226     main()