add more from __future__ import print_function, unicode_literals
[htsworkflow.git] / htsworkflow / pipelines / qseq2fastq.py
1 #!/usr/bin/env python
2 """Convert a collection of qseq or a tar file of qseq files to a fastq file
3 """
4 from __future__ import print_function, unicode_literals
5 from glob import glob
6 import os
7 from optparse import OptionParser
8 import numpy
9 import sys
10 import tarfile
11
12 from htsworkflow.util.version import version
13 from htsworkflow.util.conversion import parse_slice
14
15
16 def main(cmdline=None):
17     """Command line driver: [None, '-i', 'tarfile', '-o', 'target.fastq']
18     """
19     parser = make_parser()
20     opts, args = parser.parse_args(cmdline)
21
22     if opts.version:
23         print(version())
24         return 0
25
26     if opts.infile is not None:
27         qseq_generator = tarfile_generator(opts.infile)
28     elif len(args) > 0:
29         qseq_generator = file_generator(args)
30     else:
31         qseq_generator = [sys.stdin]
32
33     if opts.output is not None:
34         output = open(opts.output, 'w')
35     else:
36         output = sys.stdout
37
38     if opts.nopass_output is not None:
39         nopass_output = open(opts.nopass_output, 'w')
40     else:
41         nopass_output = None
42
43     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
44     qseq_parser.fastq = not opts.fasta
45     qseq_parser.flowcell_id = opts.flowcell
46     qseq_parser.trim = parse_slice(opts.slice)
47     qseq_parser.reportFilter = opts.pf
48
49     qseq_parser.run()
50
51
52 def make_parser():
53     """Return option parser"""
54     usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
55     parser = OptionParser(usage)
56     parser.add_option("-a", "--fasta", default=False, action="store_true",
57                       help="produce fasta files instead of fastq files")
58     parser.add_option("-f", "--flowcell", default=None,
59                       help="Set flowcell ID for output file")
60     parser.add_option("-i", "--infile", default=None,
61                       help='source tar file (if reading from an archive '\
62                            'instead of a directory)')
63     parser.add_option("-o", "--output", default=None,
64                       help="output fastq file")
65     parser.add_option("-n", "--nopass-output", default=None,
66                       help="if provided send files that failed "\
67                            "illumina filter to a differentfile")
68     parser.add_option("-s", "--slice",
69                       help="specify python slice, e.g. 0:75, 0:-1",
70                       default=None)
71     parser.add_option("--pf", help="report pass filter flag", default=False,
72                       action="store_true")
73     parser.add_option("--version", default=False, action="store_true",
74                       help="report software version")
75
76     return parser
77
78
79 def file_generator(pattern_list):
80     """Given a list of glob patterns yield open streams for matching files"""
81     for pattern in pattern_list:
82         for filename in glob(pattern):
83             yield open(filename, "r")
84
85
86 def tarfile_generator(tarfilename):
87     """Yield open streams for files inside a tarfile"""
88     archive = tarfile.open(tarfilename, 'r|*')
89     for tarinfo in archive:
90         yield archive.extractfile(tarinfo)
91
92
93 class Qseq2Fastq(object):
94     """
95     Convert qseq files to fastq (or fasta) files.
96     """
97     def __init__(self, sources, pass_destination, nopass_destination=None):
98         self.sources = sources
99         self.pass_destination = pass_destination
100         if nopass_destination is not None:
101             self.nopass_destination = nopass_destination
102         else:
103             self.nopass_destination = pass_destination
104
105         self.fastq = True
106         self.flowcell_id = None
107         self.trim = slice(None)
108         self.report_filter = False
109
110     def _format_flowcell_id(self):
111         """
112         Return formatted flowcell ID
113         """
114         if self.flowcell_id is not None:
115             return self.flowcell_id + "_"
116         else:
117             return ""
118
119     def run(self):
120         """Run conversion
121         (Used to match threading/multiprocessing API)
122         """
123         if self.fastq:
124             header_template = '@'
125         else:
126             # fasta case
127             header_template = '>'
128         header_template += self._format_flowcell_id() + \
129                            '%s_%s:%s:%s:%s:%s/%s%s%s'
130
131         for qstream in self.sources:
132             for line in qstream:
133                 # parse line
134                 record = line.rstrip().split('\t')
135                 machine_name = record[0]
136                 run_number = record[1]
137                 lane_number = record[2]
138                 tile = record[3]
139                 x = record[4]
140                 y = record[5]
141                 #index = record[6]
142                 read = record[7]
143                 sequence = record[8].replace('.', 'N')
144                 quality = convert_illumina_quality(record[9])
145
146                 # add pass qc filter if we want it
147                 pass_qc = int(record[10])
148                 if self.report_filter:
149                     pass_qc_msg = " pf=%s" % (pass_qc)
150                 else:
151                     pass_qc_msg = ""
152
153                 header = header_template % ( \
154                     machine_name,
155                     run_number,
156                     lane_number,
157                     tile,
158                     x,
159                     y,
160                     read,
161                     pass_qc_msg,
162                     os.linesep)
163
164                 # if we passed the filter write to the "good" file
165                 if pass_qc:
166                     destination = self.pass_destination
167                 else:
168                     destination = self.nopass_destination
169
170                 destination.write(header)
171                 destination.write(sequence[self.trim])
172                 destination.write(os.linesep)
173                 if self.fastq:
174                     destination.write('+')
175                     destination.write(os.linesep)
176                     destination.write(quality[self.trim].tostring())
177                     destination.write(os.linesep)
178
179 def convert_illumina_quality(illumina_quality):
180     """Convert an Illumina quality score to a Phred ASCII quality score.
181     """
182     # Illumina scores are Phred + 64
183     # Fastq scores are Phread + 33
184     # the following code grabs the string, converts to short ints and
185     # subtracts 31 (64-33) to convert between the two score formats.
186     # The numpy solution is twice as fast as some of my other
187     # ideas for the conversion.
188     # sorry about the uglyness in changing from character, to 8-bit int
189     # and back to a character array
190     quality = numpy.asarray(illumina_quality, 'c')
191     quality.dtype = numpy.uint8
192     quality -= 31
193      # I'd like to know what the real numpy char type is
194     quality.dtype = '|S1'
195     return quality
196
197
198 if __name__ == "__main__":
199     main()