From 1ad7bb0976cad40d628cb59f8ed24b7b5947c725 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Tue, 1 Dec 2009 01:41:40 +0000 Subject: [PATCH] Add a simple utility to convert qseq to fastq files. It'll probably morph into a more complex utility in the near future. --- scripts/qseq2fastq | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 scripts/qseq2fastq diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq new file mode 100644 index 0000000..56589ee --- /dev/null +++ b/scripts/qseq2fastq @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +import os +from optparse import OptionParser +import numpy +import sys + +def make_parser(): + parser = OptionParser() + parser.add_option('-o', '--output', help='output fastq file', default=None) + parser.add_option('-s', '--slice', + help='specify python slice, e.g. 0:75, 0:-1', + default=None) + return parser + +def parse_slice(slice_text): + if slice_text is None or len(slice_text) == 0: + return slice(None) + + slice_data = [] + for element in slice_text.split(':'): + if len(element) == 0: + element = None + else: + element = int(element) + slice_data.append(element) + + return slice(*slice_data) + +def qseq2fastq(destination, qseqs, trim=None): + for q in qseqs: + for line in open(q): + # parse line + record = line.strip().split('\t') + machine_name = record[0] + run_number = record[1] + lane_number = record[2] + tile = record[3] + x = record[4] + y = record[5] + index = record[6] + read = record[7] + sequence = record[8] + # Illumina scores are Phred + 64 + # Fastq scores are Phread + 33 + # the following code grabs the string, converts to short ints and + # subtracts 31 (64-33) to convert between the two score formats. + # The numpy solution is twice as fast as some of my other + # ideas for the conversion. + # sorry about the uglyness in changing from character, to 8-bit int + # and back to a character array + quality = numpy.asarray(record[9],'c') + quality.dtype = numpy.uint8 + quality -= 31 + quality.dtype = '|S1' # I'd like to know what the real numpy char type is + + pass_qc = record[10] + + destination.write('@%s_%s:%s:%s:%s:%s pf=%s%s' % ( \ + machine_name, + run_number, + lane_number, + tile, + x, + y, + pass_qc, + os.linesep)) + destination.write(sequence[trim]) + destination.write(os.linesep) + destination.write('+') + destination.write(os.linesep) + destination.write(quality[trim].tostring()) + destination.write(os.linesep) + +def main(cmdline=None): + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + if opts.output is not None: + dest = open(opts.output, 'w') + else: + dest = sys.stdout + + subseq = parse_slice(opts.slice) + + qseq2fastq(dest, args, subseq) + + return 0 + +if __name__ == "__main__": + main() -- 2.30.2