X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=blobdiff_plain;f=htsworkflow%2Fpipelines%2Ffastq.py;fp=htsworkflow%2Fpipelines%2Ffastq.py;h=cf77005f3c391fdd47a8f967520425ed4c88ab63;hp=0000000000000000000000000000000000000000;hb=c4a11293939ee85eb38f77108195a337de38f494;hpb=656d4c2cd964fa689dfa8b642818bc58a05b2615

diff --git a/htsworkflow/pipelines/fastq.py b/htsworkflow/pipelines/fastq.py
new file mode 100644
index 0000000..cf77005
--- /dev/null
+++ b/htsworkflow/pipelines/fastq.py
@@ -0,0 +1,44 @@
+'''summarize fastq file
+'''
+import os
+import numpy
+
+def summarize_hiseq_fastq(stream):
+    reads = 0
+    pass_qc = 0
+    bad_read = False
+    mean = None
+    eol_length = len(os.linesep)
+
+    for i, line in enumerate(stream):
+        if i % 4 == 0:
+            # header  looks like this
+            # @HWI-ST0787:114:D0PMDACXX:8:1101:1605:2154 1:N:0:TAGCTT
+            # we want the :N (passed filter) or :Y (failed filter)
+            reads += 1
+            # if flag is 'N' we are not a bad read
+            bad_read = False if line[line.rfind(' ') + 3] == 'N' else True
+            if not bad_read:
+                pass_qc += 1
+
+        elif i % 4 == 3:
+            # score
+            if not bad_read:
+                # don't include bad reads in score
+                # score = numpy.asarray(list(line.rstrip()), dtype='c') # 3.5 min
+                #score = numpy.asarray(line[:-eol_length], dtype='c') # 2 min
+                score = numpy.asarray(line[:-eol_length], dtype='c') # 1.4 min
+                score.dtype = numpy.int8
+
+                if mean is None:
+                    mean = numpy.zeros(len(score), dtype=numpy.float)
+
+                delta = score - mean
+                mean = mean + delta / pass_qc
+
+    return (reads, pass_qc, mean)
+
+if __name__ == '__main__':
+    import sys
+    with open(sys.argv[1], 'r') as instream:
+        print summarize_hiseq_fastq(instream)