5 from optparse import OptionParser
10 class Qseq2Fastq(object):
12 Convert qseq files to fastq (or fasta) files.
14 def __init__(self, sources, pass_destination, nopass_destination=None):
15 self.sources = sources
16 self.pass_destination = pass_destination
17 if nopass_destination is not None:
18 self.nopass_destination = nopass_destination
20 self.nopass_destination = pass_destination
23 self.flowcell_id = None
24 self.trim = slice(None)
25 self.reportFilter = False
27 def _format_flowcell_id(self):
29 Return formatted flowcell ID
31 if self.flowcell_id is not None:
32 return self.flowcell_id+"_"
36 def _convert_illumina_quality(self, illumina_quality):
38 Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
40 # Illumina scores are Phred + 64
41 # Fastq scores are Phread + 33
42 # the following code grabs the string, converts to short ints and
43 # subtracts 31 (64-33) to convert between the two score formats.
44 # The numpy solution is twice as fast as some of my other
45 # ideas for the conversion.
46 # sorry about the uglyness in changing from character, to 8-bit int
47 # and back to a character array
48 quality = numpy.asarray(illumina_quality,'c')
49 quality.dtype = numpy.uint8
51 quality.dtype = '|S1' # I'd like to know what the real numpy char type is
60 header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
62 for qstream in self.sources:
65 record = line.rstrip().split('\t')
66 machine_name = record[0]
67 run_number = record[1]
68 lane_number = record[2]
74 sequence = record[8].replace('.','N')
75 quality = self._convert_illumina_quality(record[9])
77 # add pass qc filter if we want it
78 pass_qc = int(record[10])
80 pass_qc_msg = " pf=%s" % (pass_qc)
84 header = header_template % ( \
96 # if we passed the filter write to the "good" file
98 destination = self.pass_destination
100 destination = self.nopass_destination
102 destination.write(header)
103 destination.write(sequence[self.trim])
104 destination.write(os.linesep)
106 destination.write('+')
107 destination.write(os.linesep)
108 destination.write(quality[self.trim].tostring())
109 destination.write(os.linesep)
111 def file_generator(pattern_list):
112 for pattern in pattern_list:
113 for filename in glob(pattern):
114 yield open(filename,'r')
116 def tarfile_generator(tarfilename):
117 archive = tarfile.open(tarfilename,'r|*')
118 for tarinfo in archive:
119 yield archive.extractfile(tarinfo)
122 def parse_slice(slice_text):
123 if slice_text is None or len(slice_text) == 0:
127 for element in slice_text.split(':'):
128 if len(element) == 0:
131 element = int(element)
132 slice_data.append(element)
134 return slice(*slice_data)
137 usage = "%prog: [options] *_qseq.txt"
138 parser = OptionParser(usage)
139 parser.add_option('-a', '--fasta', default=False, action="store_true",
140 help="produce fasta files instead of fastq files")
141 parser.add_option('-f', '--flowcell', default=None,
142 help="Set flowcell ID for output file")
143 parser.add_option('-i', '--infile', default=None,
144 help='source tar file (if reading from an archive instead of a directory)')
145 parser.add_option('-o', '--output', default=None,
146 help='output fastq file')
147 parser.add_option('-n', '--nopass-output', default=None,
148 help='if provided send files that failed illumina filter '\
149 'to a differentfile')
150 parser.add_option('-s', '--slice',
151 help='specify python slice, e.g. 0:75, 0:-1',
153 parser.add_option('--pf', help="report pass filter flag", default=False,
157 def main(cmdline=None):
158 parser = make_parser()
159 opts, args = parser.parse_args(cmdline)
161 if opts.infile is not None:
162 qseq_generator = tarfile_generator(opts.infile)
164 qseq_generator = file_generator(args)
166 qseq_generator = [sys.stdin]
169 if opts.output is not None:
170 output = open(opts.output, 'w')
174 if opts.nopass_output is not None:
175 nopass_output = open(opts.nopass_output, 'w')
179 qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
180 qseq_parser.fastq = not opts.fasta
181 qseq_parser.flowcell_id = opts.flowcell
182 qseq_parser.trim = parse_slice(opts.slice)
183 qseq_parser.reportFilter = opts.pf
187 if __name__ == "__main__":