Unicode munging for python3
[htsworkflow.git] / htsworkflow / pipelines / qseq2fastq.py
1 #!/usr/bin/env python
2 """Convert a collection of qseq or a tar file of qseq files to a fastq file
3 """
4 from __future__ import print_function, unicode_literals
5 from glob import glob
6 import os
7 from optparse import OptionParser
8 import numpy
9 import sys
10 import tarfile
11
12 from htsworkflow.util.version import version
13 from htsworkflow.util.conversion import parse_slice
14 from htsworkflow.pipelines.desplit_fastq import open_output
15
16
17 def main(cmdline=None):
18     """Command line driver: [None, '-i', 'tarfile', '-o', 'target.fastq']
19     """
20     parser = make_parser()
21     opts, args = parser.parse_args(cmdline)
22
23     if opts.version:
24         print(version())
25         return 0
26
27     if opts.infile is not None:
28         qseq_generator = tarfile_generator(opts.infile)
29     elif len(args) > 0:
30         qseq_generator = file_generator(args)
31     else:
32         qseq_generator = [sys.stdin]
33
34     if opts.output is not None:
35         output = open_output(opts.output, opts)
36     else:
37         output = sys.stdout
38
39     if opts.nopass_output is not None:
40         nopass_output = open_output(opts.nopass_output, opts)
41     else:
42         nopass_output = None
43
44     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
45     qseq_parser.fastq = not opts.fasta
46     qseq_parser.flowcell_id = opts.flowcell
47     qseq_parser.trim = parse_slice(opts.slice)
48     qseq_parser.reportFilter = opts.pf
49
50     qseq_parser.run()
51
52
53 def make_parser():
54     """Return option parser"""
55     usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
56     parser = OptionParser(usage)
57     parser.add_option("-a", "--fasta", default=False, action="store_true",
58                       help="produce fasta files instead of fastq files")
59     parser.add_option("-f", "--flowcell", default=None,
60                       help="Set flowcell ID for output file")
61     parser.add_option("-i", "--infile", default=None,
62                       help='source tar file (if reading from an archive '\
63                            'instead of a directory)')
64     parser.add_option("-o", "--output", default=None,
65                       help="output fastq file")
66     parser.add_option("-n", "--nopass-output", default=None,
67                       help="if provided send files that failed "\
68                            "illumina filter to a differentfile")
69     parser.add_option("-s", "--slice",
70                       help="specify python slice, e.g. 0:75, 0:-1",
71                       default=None)
72     parser.add_option("--pf", help="report pass filter flag", default=False,
73                       action="store_true")
74     parser.add_option('--gzip', default=False, action='store_true',
75                       help='gzip output')
76     parser.add_option('--bzip', default=False, action='store_true',
77                       help='bzip output')
78     parser.add_option("--version", default=False, action="store_true",
79                       help="report software version")
80
81     return parser
82
83
84 def file_generator(pattern_list):
85     """Given a list of glob patterns yield open streams for matching files"""
86     for pattern in pattern_list:
87         for filename in glob(pattern):
88             # this needs to return bytes, because tarfile generate does
89             yield open(filename, "rb")
90
91
92 def tarfile_generator(tarfilename):
93     """Yield open streams for files inside a tarfile"""
94     archive = tarfile.open(tarfilename, 'r|*')
95     for tarinfo in archive:
96         yield archive.extractfile(tarinfo)
97
98
99 class Qseq2Fastq(object):
100     """
101     Convert qseq files to fastq (or fasta) files.
102     """
103     def __init__(self, sources, pass_destination, nopass_destination=None):
104         self.sources = sources
105         self.pass_destination = pass_destination
106         if nopass_destination is not None:
107             self.nopass_destination = nopass_destination
108         else:
109             self.nopass_destination = pass_destination
110
111         self.fastq = True
112         self.flowcell_id = None
113         self.trim = slice(None)
114         self.report_filter = False
115
116     def _format_flowcell_id(self):
117         """
118         Return formatted flowcell ID
119         """
120         if self.flowcell_id is not None:
121             return self.flowcell_id + "_"
122         else:
123             return ""
124
125     def run(self):
126         """Run conversion
127         (Used to match threading/multiprocessing API)
128         """
129         if self.fastq:
130             header_template = '@'
131         else:
132             # fasta case
133             header_template = '>'
134         header_template += self._format_flowcell_id() + \
135                            '%s_%s:%s:%s:%s:%s/%s%s%s'
136
137         for qstream in self.sources:
138             for line in qstream:
139                 # parse line
140                 record = line.decode('ascii').rstrip().split('\t')
141                 machine_name = record[0]
142                 run_number = record[1]
143                 lane_number = record[2]
144                 tile = record[3]
145                 x = record[4]
146                 y = record[5]
147                 #index = record[6]
148                 read = record[7]
149                 sequence = record[8].replace('.', 'N')
150                 quality = convert_illumina_quality(record[9])
151
152                 # add pass qc filter if we want it
153                 pass_qc = int(record[10])
154                 if self.report_filter:
155                     pass_qc_msg = " pf=%s" % (pass_qc)
156                 else:
157                     pass_qc_msg = ""
158
159                 header = header_template % ( \
160                     machine_name,
161                     run_number,
162                     lane_number,
163                     tile,
164                     x,
165                     y,
166                     read,
167                     pass_qc_msg,
168                     os.linesep)
169
170                 # if we passed the filter write to the "good" file
171                 if pass_qc:
172                     destination = self.pass_destination
173                 else:
174                     destination = self.nopass_destination
175
176                 destination.write(header)
177                 destination.write(sequence[self.trim])
178                 destination.write(os.linesep)
179                 if self.fastq:
180                     destination.write('+')
181                     destination.write(os.linesep)
182                     destination.write(quality[self.trim].tobytes().decode('ascii'))
183                     destination.write(os.linesep)
184
185 def convert_illumina_quality(illumina_quality):
186     """Convert an Illumina quality score to a Phred ASCII quality score.
187     """
188     # Illumina scores are Phred + 64
189     # Fastq scores are Phread + 33
190     # the following code grabs the string, converts to short ints and
191     # subtracts 31 (64-33) to convert between the two score formats.
192     # The numpy solution is twice as fast as some of my other
193     # ideas for the conversion.
194     # sorry about the uglyness in changing from character, to 8-bit int
195     # and back to a character array
196     quality = numpy.asarray(illumina_quality, 'c')
197     quality.dtype = numpy.uint8
198     quality -= 31
199      # I'd like to know what the real numpy char type is
200     quality.dtype = '|S1'
201     return quality
202
203
204 if __name__ == "__main__":
205     main()