From c4a11293939ee85eb38f77108195a337de38f494 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Tue, 1 Apr 2014 14:44:46 -0700 Subject: [PATCH] Start implementing tool to compute library fastq quality --- htsworkflow/pipelines/fastq.py | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 htsworkflow/pipelines/fastq.py diff --git a/htsworkflow/pipelines/fastq.py b/htsworkflow/pipelines/fastq.py new file mode 100644 index 0000000..cf77005 --- /dev/null +++ b/htsworkflow/pipelines/fastq.py @@ -0,0 +1,44 @@ +'''summarize fastq file +''' +import os +import numpy + +def summarize_hiseq_fastq(stream): + reads = 0 + pass_qc = 0 + bad_read = False + mean = None + eol_length = len(os.linesep) + + for i, line in enumerate(stream): + if i % 4 == 0: + # header looks like this + # @HWI-ST0787:114:D0PMDACXX:8:1101:1605:2154 1:N:0:TAGCTT + # we want the :N (passed filter) or :Y (failed filter) + reads += 1 + # if flag is 'N' we are not a bad read + bad_read = False if line[line.rfind(' ') + 3] == 'N' else True + if not bad_read: + pass_qc += 1 + + elif i % 4 == 3: + # score + if not bad_read: + # don't include bad reads in score + # score = numpy.asarray(list(line.rstrip()), dtype='c') # 3.5 min + #score = numpy.asarray(line[:-eol_length], dtype='c') # 2 min + score = numpy.asarray(line[:-eol_length], dtype='c') # 1.4 min + score.dtype = numpy.int8 + + if mean is None: + mean = numpy.zeros(len(score), dtype=numpy.float) + + delta = score - mean + mean = mean + delta / pass_qc + + return (reads, pass_qc, mean) + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'r') as instream: + print summarize_hiseq_fastq(instream) -- 2.30.2