5 from optparse import OptionParser
# Converts tab-separated qseq records read from `sources` into four-line
# FASTQ records written to `pass_destination` / `nopass_destination`.
# NOTE(review): this excerpt has lost its indentation and several lines (the
# embedded original line numbers skip), so the comments below describe only
# what is visible and flag the gaps.
10 class Qseq2Fastq(object):
# sources: iterable of qseq streams (looped over as `qstream` below).
# pass_destination / nopass_destination: objects with a write() method.
11 def __init__(self, sources, pass_destination, nopass_destination=None):
12 self.sources = sources
13 self.pass_destination = pass_destination
14 if nopass_destination is not None:
15 self.nopass_destination = nopass_destination
# Presumably the fallback branch (its `else:` line is not visible here):
# without a separate destination, failed reads go to pass_destination too.
17 self.nopass_destination = pass_destination
# Slice applied to both the sequence and the quality string before writing;
# slice(None) keeps them whole.
19 self.trim = slice(None)
# Presumably controls whether " pf=<flag>" is added to the header; the
# condition that reads it is not visible in this excerpt.
20 self.reportFilter = False
# NOTE(review): the method header and the loop that binds `line` are missing
# from this excerpt; presumably each qstream is iterated line by line.
23 for qstream in self.sources:
# A record is one line split on tabs; fields 0-2, 8, 9 and 10 are read below.
26 record = line.rstrip().split('\t')
27 machine_name = record[0]
28 run_number = record[1]
29 lane_number = record[2]
# '.' characters in the sequence field are rewritten as 'N'.
35 sequence = record[8].replace('.','N')
36 # Illumina scores are Phred + 64
37 # Fastq scores are Phred + 33
38 # the following code grabs the string, converts to short ints and
39 # subtracts 31 (64-33) to convert between the two score formats.
40 # The numpy solution is twice as fast as some of my other
41 # ideas for the conversion.
42 # sorry about the ugliness in changing from character, to 8-bit int
43 # and back to a character array
44 quality = numpy.asarray(record[9],'c')
45 quality.dtype = numpy.uint8
# NOTE(review): the subtraction described above is not visible in this
# excerpt (a line is missing between these two dtype assignments).
47 quality.dtype = '|S1' # I'd like to know what the real numpy char type is
49 # add pass qc filter if we want it
50 pass_qc = int(record[10])
# NOTE(review): the condition guarding this assignment (and any alternative
# value of pass_qc_msg) is not visible in this excerpt.
52 pass_qc_msg = " pf=%s" % (pass_qc)
# FASTQ '@' header line; the values substituted into the format are on
# lines not visible in this excerpt.
56 header = '@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
68 # if we passed the filter write to the "good" file
# NOTE(review): the if/else headers choosing between these two assignments
# are missing here; presumably the choice is made on pass_qc.
70 destination = self.pass_destination
72 destination = self.nopass_destination
# Emit the record: header, trimmed sequence, '+' separator, trimmed quality.
# No separator is written after the header here, so the header string
# presumably already ends with one.
74 destination.write(header)
75 destination.write(sequence[self.trim])
76 destination.write(os.linesep)
77 destination.write('+')
78 destination.write(os.linesep)
# NOTE(review): ndarray.tostring() is deprecated in recent numpy releases
# (tobytes() is the replacement) -- confirm the supported numpy version.
79 destination.write(quality[self.trim].tostring())
80 destination.write(os.linesep)
def file_generator(pattern_list):
    """Yield an open text stream for every file matching the glob patterns.

    pattern_list is an iterable of glob patterns; patterns are expanded in
    the order given and every match is opened in 'r' mode.

    Each stream is closed automatically as soon as the consumer asks for the
    next one (or the generator is closed), so a yielded stream must be read
    before the generator is advanced.
    """
    for pattern in pattern_list:
        for filename in glob(pattern):
            # The with-block releases the handle when the generator resumes,
            # instead of leaking one open file per input.
            with open(filename, 'r') as stream:
                yield stream
def tarfile_generator(tarfilename):
    """Yield a readable file object for each data-bearing member of a tar file.

    The archive is opened as a forward-only stream ('r|*', compression
    auto-detected), so each yielded object must be read before the generator
    is advanced to the next member.

    Members for which extractfile() returns None (for example directories)
    are skipped rather than handed to the consumer.  The archive is closed
    once the generator is exhausted or closed.
    """
    archive = tarfile.open(tarfilename, 'r|*')
    try:
        for tarinfo in archive:
            member_stream = archive.extractfile(tarinfo)
            # extractfile() returns None for members that carry no file data
            # (e.g. directory entries); iterating over None would crash.
            if member_stream is not None:
                yield member_stream
    finally:
        archive.close()
# Parse a colon-separated slice specification (e.g. "0:75") into a slice
# object.
# NOTE(review): lines are missing from this excerpt -- the result for empty
# input, the initialisation of slice_data and whatever precedes the int()
# conversion are not visible; confirm against the full file.
93 def parse_slice(slice_text):
# None and the empty string are special-cased (the result is not visible here).
94 if slice_text is None or len(slice_text) == 0:
# Each colon-separated field becomes one slice argument.
98 for element in slice_text.split(':'):
102 element = int(element)
103 slice_data.append(element)
# The collected values are passed positionally to slice().
105 return slice(*slice_data)
# Builds the optparse command-line parser.
# NOTE(review): the enclosing `def` line is not visible in this excerpt --
# presumably this is the body of make_parser(), which main() calls; the
# closing arguments of the last two add_option calls are also missing.
108 usage = "%prog: [options] *_qseq.txt"
109 parser = OptionParser(usage)
# -i/--infile: read qseq data from a tar archive instead of loose files.
110 parser.add_option('-i', '--infile', default=None,
111 help='source tar file (if reading from an archive instead of a directory)')
# -o/--output: FASTQ output file name.
112 parser.add_option('-o', '--output', default=None,
113 help='output fastq file')
# -n/--nopass-output: separate output for reads that failed the filter.
# NOTE(review): the help text reads "differentfile" (missing space).
114 parser.add_option('-n', '--nopass-output', default=None,
115 help='if provided send files that failed illumina filter '\
116 'to a differentfile')
# -s/--slice: slice specification, later handed to parse_slice() by main().
117 parser.add_option('-s', '--slice',
118 help='specify python slice, e.g. 0:75, 0:-1',
# --pf: ask for the pass-filter flag to be reported; defaults to False.
120 parser.add_option('--pf', help="report pass filter flag", default=False,
# Command-line entry point: parse options, pick the input source and the
# output files, then configure a Qseq2Fastq converter.
# cmdline is handed to parse_args(); with None, optparse falls back to
# sys.argv[1:].
# NOTE(review): the `else:` branches, the default output streams and the call
# that actually runs the conversion are not visible in this excerpt.
124 def main(cmdline=None):
125 parser = make_parser()
126 opts, args = parser.parse_args(cmdline)
# Input: a tar archive when --infile is given ...
128 if opts.infile is not None:
129 qseq_generator = tarfile_generator(opts.infile)
# ... presumably otherwise (else line missing) the positional arguments are
# treated as glob patterns.
131 qseq_generator = file_generator(args)
# Output files are opened for writing when named on the command line; what
# `output` / `nopass_output` default to is not visible here.
133 if opts.output is not None:
134 output = open(opts.output, 'w')
138 if opts.nopass_output is not None:
139 nopass_output = open(opts.nopass_output, 'w')
# Configure the converter from the parsed options.
144 qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
145 qseq_parser.trim = parse_slice(opts.slice)
146 qseq_parser.reportFilter = opts.pf
# Script entry-point guard.
# NOTE(review): the guarded statement is not visible in this excerpt --
# presumably it calls main(); confirm against the full file.
150 if __name__ == "__main__":