X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=blobdiff_plain;f=htsworkflow%2Fpipelines%2Ffastq.py;fp=htsworkflow%2Fpipelines%2Ffastq.py;h=cf77005f3c391fdd47a8f967520425ed4c88ab63;hp=0000000000000000000000000000000000000000;hb=c4a11293939ee85eb38f77108195a337de38f494;hpb=656d4c2cd964fa689dfa8b642818bc58a05b2615 diff --git a/htsworkflow/pipelines/fastq.py b/htsworkflow/pipelines/fastq.py new file mode 100644 index 0000000..cf77005 --- /dev/null +++ b/htsworkflow/pipelines/fastq.py @@ -0,0 +1,44 @@ +'''summarize fastq file +''' +import os +import numpy + +def summarize_hiseq_fastq(stream): + reads = 0 + pass_qc = 0 + bad_read = False + mean = None + eol_length = len(os.linesep) + + for i, line in enumerate(stream): + if i % 4 == 0: + # header looks like this + # @HWI-ST0787:114:D0PMDACXX:8:1101:1605:2154 1:N:0:TAGCTT + # we want the :N (passed filter) or :Y (failed filter) + reads += 1 + # if flag is 'N' we are not a bad read + bad_read = False if line[line.rfind(' ') + 3] == 'N' else True + if not bad_read: + pass_qc += 1 + + elif i % 4 == 3: + # score + if not bad_read: + # don't include bad reads in score + # score = numpy.asarray(list(line.rstrip()), dtype='c') # 3.5 min + #score = numpy.asarray(line[:-eol_length], dtype='c') # 2 min + score = numpy.asarray(line[:-eol_length], dtype='c') # 1.4 min + score.dtype = numpy.int8 + + if mean is None: + mean = numpy.zeros(len(score), dtype=numpy.float) + + delta = score - mean + mean = mean + delta / pass_qc + + return (reads, pass_qc, mean) + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'r') as instream: + print summarize_hiseq_fastq(instream)