Modify qseq2fastq to also read from compressed tar files containing qseq files
[htsworkflow.git] / scripts / qseq2fastq
1 #!/usr/bin/env python
2
3 from glob import glob
4 import os
5 from optparse import OptionParser
6 import numpy
7 import sys
8 import tarfile
9
10 def qseq2fastq(destination, qseqs, trim=None, pf=False):
11     for qstream in qseqs:
12         for line in qstream:
13             # parse line
14             record = line.strip().split('\t')
15             machine_name = record[0]
16             run_number = record[1]
17             lane_number = record[2]
18             tile = record[3]
19             x = record[4]
20             y = record[5]
21             index = record[6]
22             read = record[7]
23             sequence = record[8].replace('.','N')
24             # Illumina scores are Phred + 64
25             # Fastq scores are Phread + 33
26             # the following code grabs the string, converts to short ints and
27             # subtracts 31 (64-33) to convert between the two score formats.
28             # The numpy solution is twice as fast as some of my other
29             # ideas for the conversion.
30             # sorry about the uglyness in changing from character, to 8-bit int
31             # and back to a character array
32             quality = numpy.asarray(record[9],'c')
33             quality.dtype = numpy.uint8
34             quality -= 31
35             quality.dtype = '|S1' # I'd like to know what the real numpy char type is
36
37             if pf:
38                 pass_qc = record[10]
39                 pass_qc_msg = " pf=%s" % (pass_qc)
40             else:
41                 pass_qc_msg = ""
42                 
43             destination.write('@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
44                 machine_name,
45                 run_number,
46                 lane_number,
47                 tile,
48                 x,
49                 y,
50                 read,
51                 pass_qc_msg,
52                 os.linesep))
53             destination.write(sequence[trim])
54             destination.write(os.linesep)
55             destination.write('+')
56             destination.write(os.linesep)
57             destination.write(quality[trim].tostring())
58             destination.write(os.linesep)
59
60 def file_generator(pattern_list):
61     for pattern in pattern_list:
62         for filename in glob(pattern):
63             yield open(filename,'r')
64
65 def tarfile_generator(tarfilename):
66     archive = tarfile.open(tarfilename,'r|*')
67     for tarinfo in archive:
68         yield archive.extractfile(tarinfo)
69     
70 def make_parser():
71     usage = "%prog: [options] *_qseq.txt"
72     parser = OptionParser(usage)
73     parser.add_option('-i', '--infile', default=None,
74       help='source tar file (if reading from an archive instead of a directory)')
75     parser.add_option('-o', '--output', help='output fastq file', default=None)
76     parser.add_option('-s', '--slice',
77                       help='specify python slice, e.g. 0:75, 0:-1',
78                       default=None)
79     parser.add_option('--pf', help="report pass filter flag", default=False,
80                       action="store_true")
81     return parser
82
83 def parse_slice(slice_text):
84     if slice_text is None or len(slice_text) == 0:
85         return slice(None)
86         
87     slice_data = []
88     for element in slice_text.split(':'):
89         if len(element) == 0:
90             element = None
91         else:
92             element = int(element)
93         slice_data.append(element)
94
95     return slice(*slice_data)
96         
97
98 def main(cmdline=None):
99     parser = make_parser()
100     opts, args = parser.parse_args(cmdline)
101
102     if opts.infile is not None:
103         qseq_generator = tarfile_generator(opts.infile)
104     else:
105         qseq_generator = file_generator(args)
106         
107     if opts.output is not None:
108         dest = open(opts.output, 'w')
109     else:
110         dest = sys.stdout
111
112     subseq = parse_slice(opts.slice)
113
114     qseq2fastq(dest, qseq_generator, subseq, opts.pf)
115     
116     return 0
117
118 if __name__ == "__main__":
119     main()