Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / qseq2fastq.py
1 #!/usr/bin/env python
2 """Convert a collection of qseq or a tar file of qseq files to a fastq file
3 """
4 from glob import glob
5 import os
6 from optparse import OptionParser
7 import numpy
8 import sys
9 import tarfile
10
11 from htsworkflow.util.version import version
12 from htsworkflow.util.conversion import parse_slice
13
14
15 def main(cmdline=None):
16     """Command line driver: [None, '-i', 'tarfile', '-o', 'target.fastq']
17     """
18     parser = make_parser()
19     opts, args = parser.parse_args(cmdline)
20
21     if opts.version:
22         print(version())
23         return 0
24
25     if opts.infile is not None:
26         qseq_generator = tarfile_generator(opts.infile)
27     elif len(args) > 0:
28         qseq_generator = file_generator(args)
29     else:
30         qseq_generator = [sys.stdin]
31
32     if opts.output is not None:
33         output = open(opts.output, 'w')
34     else:
35         output = sys.stdout
36
37     if opts.nopass_output is not None:
38         nopass_output = open(opts.nopass_output, 'w')
39     else:
40         nopass_output = None
41
42     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
43     qseq_parser.fastq = not opts.fasta
44     qseq_parser.flowcell_id = opts.flowcell
45     qseq_parser.trim = parse_slice(opts.slice)
46     qseq_parser.reportFilter = opts.pf
47
48     qseq_parser.run()
49
50
51 def make_parser():
52     """Return option parser"""
53     usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
54     parser = OptionParser(usage)
55     parser.add_option("-a", "--fasta", default=False, action="store_true",
56                       help="produce fasta files instead of fastq files")
57     parser.add_option("-f", "--flowcell", default=None,
58                       help="Set flowcell ID for output file")
59     parser.add_option("-i", "--infile", default=None,
60                       help='source tar file (if reading from an archive '\
61                            'instead of a directory)')
62     parser.add_option("-o", "--output", default=None,
63                       help="output fastq file")
64     parser.add_option("-n", "--nopass-output", default=None,
65                       help="if provided send files that failed "\
66                            "illumina filter to a differentfile")
67     parser.add_option("-s", "--slice",
68                       help="specify python slice, e.g. 0:75, 0:-1",
69                       default=None)
70     parser.add_option("--pf", help="report pass filter flag", default=False,
71                       action="store_true")
72     parser.add_option("--version", default=False, action="store_true",
73                       help="report software version")
74
75     return parser
76
77
78 def file_generator(pattern_list):
79     """Given a list of glob patterns yield open streams for matching files"""
80     for pattern in pattern_list:
81         for filename in glob(pattern):
82             yield open(filename, "r")
83
84
85 def tarfile_generator(tarfilename):
86     """Yield open streams for files inside a tarfile"""
87     archive = tarfile.open(tarfilename, 'r|*')
88     for tarinfo in archive:
89         yield archive.extractfile(tarinfo)
90
91
92 class Qseq2Fastq(object):
93     """
94     Convert qseq files to fastq (or fasta) files.
95     """
96     def __init__(self, sources, pass_destination, nopass_destination=None):
97         self.sources = sources
98         self.pass_destination = pass_destination
99         if nopass_destination is not None:
100             self.nopass_destination = nopass_destination
101         else:
102             self.nopass_destination = pass_destination
103
104         self.fastq = True
105         self.flowcell_id = None
106         self.trim = slice(None)
107         self.report_filter = False
108
109     def _format_flowcell_id(self):
110         """
111         Return formatted flowcell ID
112         """
113         if self.flowcell_id is not None:
114             return self.flowcell_id + "_"
115         else:
116             return ""
117
118     def run(self):
119         """Run conversion
120         (Used to match threading/multiprocessing API)
121         """
122         if self.fastq:
123             header_template = '@'
124         else:
125             # fasta case
126             header_template = '>'
127         header_template += self._format_flowcell_id() + \
128                            '%s_%s:%s:%s:%s:%s/%s%s%s'
129
130         for qstream in self.sources:
131             for line in qstream:
132                 # parse line
133                 record = line.rstrip().split('\t')
134                 machine_name = record[0]
135                 run_number = record[1]
136                 lane_number = record[2]
137                 tile = record[3]
138                 x = record[4]
139                 y = record[5]
140                 #index = record[6]
141                 read = record[7]
142                 sequence = record[8].replace('.', 'N')
143                 quality = convert_illumina_quality(record[9])
144
145                 # add pass qc filter if we want it
146                 pass_qc = int(record[10])
147                 if self.report_filter:
148                     pass_qc_msg = " pf=%s" % (pass_qc)
149                 else:
150                     pass_qc_msg = ""
151
152                 header = header_template % ( \
153                     machine_name,
154                     run_number,
155                     lane_number,
156                     tile,
157                     x,
158                     y,
159                     read,
160                     pass_qc_msg,
161                     os.linesep)
162
163                 # if we passed the filter write to the "good" file
164                 if pass_qc:
165                     destination = self.pass_destination
166                 else:
167                     destination = self.nopass_destination
168
169                 destination.write(header)
170                 destination.write(sequence[self.trim])
171                 destination.write(os.linesep)
172                 if self.fastq:
173                     destination.write('+')
174                     destination.write(os.linesep)
175                     destination.write(quality[self.trim].tostring())
176                     destination.write(os.linesep)
177
178 def convert_illumina_quality(illumina_quality):
179     """Convert an Illumina quality score to a Phred ASCII quality score.
180     """
181     # Illumina scores are Phred + 64
182     # Fastq scores are Phread + 33
183     # the following code grabs the string, converts to short ints and
184     # subtracts 31 (64-33) to convert between the two score formats.
185     # The numpy solution is twice as fast as some of my other
186     # ideas for the conversion.
187     # sorry about the uglyness in changing from character, to 8-bit int
188     # and back to a character array
189     quality = numpy.asarray(illumina_quality, 'c')
190     quality.dtype = numpy.uint8
191     quality -= 31
192      # I'd like to know what the real numpy char type is
193     quality.dtype = '|S1'
194     return quality
195
196
197 if __name__ == "__main__":
198     main()