2 """Convert a collection of qseq or a tar file of qseq files to a fastq file
4 from __future__ import print_function, unicode_literals
7 from optparse import OptionParser
12 from htsworkflow.util.version import version
13 from htsworkflow.util.conversion import parse_slice
14 from htsworkflow.pipelines.desplit_fastq import open_output
17 def main(cmdline=None):
18 """Command line driver: [None, '-i', 'tarfile', '-o', 'target.fastq']
20 parser = make_parser()
21 opts, args = parser.parse_args(cmdline)
27 if opts.infile is not None:
28 qseq_generator = tarfile_generator(opts.infile)
30 qseq_generator = file_generator(args)
32 qseq_generator = [sys.stdin]
34 if opts.output is not None:
35 output = open_output(opts.output, opts)
39 if opts.nopass_output is not None:
40 nopass_output = open_output(opts.nopass_output, opts)
44 qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
45 qseq_parser.fastq = not opts.fasta
46 qseq_parser.flowcell_id = opts.flowcell
47 qseq_parser.trim = parse_slice(opts.slice)
48 qseq_parser.reportFilter = opts.pf
54 """Return option parser"""
55 usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
56 parser = OptionParser(usage)
57 parser.add_option("-a", "--fasta", default=False, action="store_true",
58 help="produce fasta files instead of fastq files")
59 parser.add_option("-f", "--flowcell", default=None,
60 help="Set flowcell ID for output file")
61 parser.add_option("-i", "--infile", default=None,
62 help='source tar file (if reading from an archive '\
63 'instead of a directory)')
64 parser.add_option("-o", "--output", default=None,
65 help="output fastq file")
66 parser.add_option("-n", "--nopass-output", default=None,
67 help="if provided send files that failed "\
68 "illumina filter to a differentfile")
69 parser.add_option("-s", "--slice",
70 help="specify python slice, e.g. 0:75, 0:-1",
72 parser.add_option("--pf", help="report pass filter flag", default=False,
74 parser.add_option('--gzip', default=False, action='store_true',
76 parser.add_option('--bzip', default=False, action='store_true',
78 parser.add_option("--version", default=False, action="store_true",
79 help="report software version")
84 def file_generator(pattern_list):
85 """Given a list of glob patterns yield open streams for matching files"""
86 for pattern in pattern_list:
87 for filename in glob(pattern):
88 # this needs to return bytes, because tarfile generate does
89 yield open(filename, "rb")
92 def tarfile_generator(tarfilename):
93 """Yield open streams for files inside a tarfile"""
94 archive = tarfile.open(tarfilename, 'r|*')
95 for tarinfo in archive:
96 yield archive.extractfile(tarinfo)
99 class Qseq2Fastq(object):
101 Convert qseq files to fastq (or fasta) files.
103 def __init__(self, sources, pass_destination, nopass_destination=None):
104 self.sources = sources
105 self.pass_destination = pass_destination
106 if nopass_destination is not None:
107 self.nopass_destination = nopass_destination
109 self.nopass_destination = pass_destination
112 self.flowcell_id = None
113 self.trim = slice(None)
114 self.report_filter = False
116 def _format_flowcell_id(self):
118 Return formatted flowcell ID
120 if self.flowcell_id is not None:
121 return self.flowcell_id + "_"
127 (Used to match threading/multiprocessing API)
130 header_template = '@'
133 header_template = '>'
134 header_template += self._format_flowcell_id() + \
135 '%s_%s:%s:%s:%s:%s/%s%s%s'
137 for qstream in self.sources:
140 record = line.decode('ascii').rstrip().split('\t')
141 machine_name = record[0]
142 run_number = record[1]
143 lane_number = record[2]
149 sequence = record[8].replace('.', 'N')
150 quality = convert_illumina_quality(record[9])
152 # add pass qc filter if we want it
153 pass_qc = int(record[10])
154 if self.report_filter:
155 pass_qc_msg = " pf=%s" % (pass_qc)
159 header = header_template % ( \
170 # if we passed the filter write to the "good" file
172 destination = self.pass_destination
174 destination = self.nopass_destination
176 destination.write(header)
177 destination.write(sequence[self.trim])
178 destination.write(os.linesep)
180 destination.write('+')
181 destination.write(os.linesep)
182 destination.write(quality[self.trim].tobytes().decode('ascii'))
183 destination.write(os.linesep)
185 def convert_illumina_quality(illumina_quality):
186 """Convert an Illumina quality score to a Phred ASCII quality score.
188 # Illumina scores are Phred + 64
189 # Fastq scores are Phread + 33
190 # the following code grabs the string, converts to short ints and
191 # subtracts 31 (64-33) to convert between the two score formats.
192 # The numpy solution is twice as fast as some of my other
193 # ideas for the conversion.
194 # sorry about the uglyness in changing from character, to 8-bit int
195 # and back to a character array
196 quality = numpy.asarray(illumina_quality, 'c')
197 quality.dtype = numpy.uint8
199 # I'd like to know what the real numpy char type is
200 quality.dtype = '|S1'
204 if __name__ == "__main__":