cf77005f3c391fdd47a8f967520425ed4c88ab63
[htsworkflow.git] / htsworkflow / pipelines / fastq.py
1 '''summarize fastq file
2 '''
3 import os
4 import numpy
5
6 def summarize_hiseq_fastq(stream):
7     reads = 0
8     pass_qc = 0
9     bad_read = False
10     mean = None
11     eol_length = len(os.linesep)
12
13     for i, line in enumerate(stream):
14         if i % 4 == 0:
15             # header  looks like this
16             # @HWI-ST0787:114:D0PMDACXX:8:1101:1605:2154 1:N:0:TAGCTT
17             # we want the :N (passed filter) or :Y (failed filter)
18             reads += 1
19             # if flag is 'N' we are not a bad read
20             bad_read = False if line[line.rfind(' ') + 3] == 'N' else True
21             if not bad_read:
22                 pass_qc += 1
23
24         elif i % 4 == 3:
25             # score
26             if not bad_read:
27                 # don't include bad reads in score
28                 # score = numpy.asarray(list(line.rstrip()), dtype='c') # 3.5 min
29                 #score = numpy.asarray(line[:-eol_length], dtype='c') # 2 min
30                 score = numpy.asarray(line[:-eol_length], dtype='c') # 1.4 min
31                 score.dtype = numpy.int8
32
33                 if mean is None:
34                     mean = numpy.zeros(len(score), dtype=numpy.float)
35
36                 delta = score - mean
37                 mean = mean + delta / pass_qc
38
39     return (reads, pass_qc, mean)
40
41 if __name__ == '__main__':
42     import sys
43     with open(sys.argv[1], 'r') as instream:
44         print summarize_hiseq_fastq(instream)