#!/usr/bin/env python
+from glob import glob
import os
from optparse import OptionParser
import numpy
import sys
+import tarfile
-def make_parser():
- parser = OptionParser()
- parser.add_option('-o', '--output', help='output fastq file', default=None)
- parser.add_option('-s', '--slice',
- help='specify python slice, e.g. 0:75, 0:-1',
- default=None)
- return parser
-
-def parse_slice(slice_text):
- if slice_text is None or len(slice_text) == 0:
- return slice(None)
-
- slice_data = []
- for element in slice_text.split(':'):
- if len(element) == 0:
- element = None
- else:
- element = int(element)
- slice_data.append(element)
-
- return slice(*slice_data)
-
-def qseq2fastq(destination, qseqs, trim=None):
- for q in qseqs:
- for line in open(q):
+def qseq2fastq(destination, qseqs, trim=None, pf=False):
+ for qstream in qseqs:
+ for line in qstream:
# parse line
record = line.strip().split('\t')
machine_name = record[0]
y = record[5]
index = record[6]
read = record[7]
- sequence = record[8]
+ sequence = record[8].replace('.','N')
# Illumina scores are Phred + 64
# Fastq scores are Phread + 33
# the following code grabs the string, converts to short ints and
quality.dtype = numpy.uint8
quality -= 31
quality.dtype = '|S1' # I'd like to know what the real numpy char type is
-
- pass_qc = record[10]
- destination.write('@%s_%s:%s:%s:%s:%s pf=%s%s' % ( \
+ if pf:
+ pass_qc = record[10]
+ pass_qc_msg = " pf=%s" % (pass_qc)
+ else:
+ pass_qc_msg = ""
+
+ destination.write('@%s_%s:%s:%s:%s:%s/%s%s%s' % ( \
machine_name,
run_number,
lane_number,
tile,
x,
y,
- pass_qc,
+ read,
+ pass_qc_msg,
os.linesep))
destination.write(sequence[trim])
destination.write(os.linesep)
destination.write(os.linesep)
destination.write(quality[trim].tostring())
destination.write(os.linesep)
+
+def file_generator(pattern_list):
+ for pattern in pattern_list:
+ for filename in glob(pattern):
+ yield open(filename,'r')
+
+def tarfile_generator(tarfilename):
+ archive = tarfile.open(tarfilename,'r|*')
+ for tarinfo in archive:
+ yield archive.extractfile(tarinfo)
+
+def make_parser():
+ usage = "%prog: [options] *_qseq.txt"
+ parser = OptionParser(usage)
+ parser.add_option('-i', '--infile', default=None,
+ help='source tar file (if reading from an archive instead of a directory)')
+ parser.add_option('-o', '--output', help='output fastq file', default=None)
+ parser.add_option('-s', '--slice',
+ help='specify python slice, e.g. 0:75, 0:-1',
+ default=None)
+ parser.add_option('--pf', help="report pass filter flag", default=False,
+ action="store_true")
+ return parser
+
+def parse_slice(slice_text):
+ if slice_text is None or len(slice_text) == 0:
+ return slice(None)
+ slice_data = []
+ for element in slice_text.split(':'):
+ if len(element) == 0:
+ element = None
+ else:
+ element = int(element)
+ slice_data.append(element)
+
+ return slice(*slice_data)
+
+
def main(cmdline=None):
parser = make_parser()
opts, args = parser.parse_args(cmdline)
+ if opts.infile is not None:
+ qseq_generator = tarfile_generator(opts.infile)
+ else:
+ qseq_generator = file_generator(args)
+
if opts.output is not None:
dest = open(opts.output, 'w')
else:
subseq = parse_slice(opts.slice)
- qseq2fastq(dest, args, subseq)
+ qseq2fastq(dest, qseq_generator, subseq, opts.pf)
return 0