Also include the read ID in the fastq header
[htsworkflow.git] / scripts / qseq2fastq
1 #!/usr/bin/env python
2
3 import os
4 from optparse import OptionParser
5 import numpy
6 import sys
7
8 def make_parser():
9     parser = OptionParser()
10     parser.add_option('-o', '--output', help='output fastq file', default=None)
11     parser.add_option('-s', '--slice',
12                       help='specify python slice, e.g. 0:75, 0:-1',
13                       default=None)
14     return parser
15
16 def parse_slice(slice_text):
17     if slice_text is None or len(slice_text) == 0:
18         return slice(None)
19         
20     slice_data = []
21     for element in slice_text.split(':'):
22         if len(element) == 0:
23             element = None
24         else:
25             element = int(element)
26         slice_data.append(element)
27
28     return slice(*slice_data)
29         
30 def qseq2fastq(destination, qseqs, trim=None):
31     for q in qseqs:
32         for line in open(q):
33             # parse line
34             record = line.strip().split('\t')
35             machine_name = record[0]
36             run_number = record[1]
37             lane_number = record[2]
38             tile = record[3]
39             x = record[4]
40             y = record[5]
41             index = record[6]
42             read = record[7]
43             sequence = record[8]
44             # Illumina scores are Phred + 64
45             # Fastq scores are Phread + 33
46             # the following code grabs the string, converts to short ints and
47             # subtracts 31 (64-33) to convert between the two score formats.
48             # The numpy solution is twice as fast as some of my other
49             # ideas for the conversion.
50             # sorry about the uglyness in changing from character, to 8-bit int
51             # and back to a character array
52             quality = numpy.asarray(record[9],'c')
53             quality.dtype = numpy.uint8
54             quality -= 31
55             quality.dtype = '|S1' # I'd like to know what the real numpy char type is
56             
57             pass_qc = record[10]
58
59             destination.write('@%s_%s:%s:%s:%s:%s/%s pf=%s%s' % ( \
60                 machine_name,
61                 run_number,
62                 lane_number,
63                 tile,
64                 x,
65                 y,
66                 read,
67                 pass_qc,
68                 os.linesep))
69             destination.write(sequence[trim])
70             destination.write(os.linesep)
71             destination.write('+')
72             destination.write(os.linesep)
73             destination.write(quality[trim].tostring())
74             destination.write(os.linesep)
75         
76 def main(cmdline=None):
77     parser = make_parser()
78     opts, args = parser.parse_args(cmdline)
79
80     if opts.output is not None:
81         dest = open(opts.output, 'w')
82     else:
83         dest = sys.stdout
84
85     subseq = parse_slice(opts.slice)
86
87     qseq2fastq(dest, args, subseq)
88     
89     return 0
90
91 if __name__ == "__main__":
92     main()