convert some print statements to logger messages
[htsworkflow.git] / htsworkflow / util / validate.py
1 #!/usr/bin/env python
2 from optparse import OptionParser
3 import os
4 import re
5 import sys
6 import logging
7
8 LOGGER = logging.getLogger(__name__)
9
10 def main(cmdline=None):
11     parser = make_parser()
12     opts, args = parser.parse_args(cmdline)
13
14     error_happened = False
15     for filename in args[1:]:
16         stream = open(filename, 'r')
17         
18         if opts.fastq:
19             errors = validate_fastq(stream,
20                                     opts.format,
21                                     opts.uniform_lengths,
22                                     opts.max_errors)
23             if errors > 0:
24                 LOGGER.error("%s failed validation", filename)
25                 error_happened = True
26
27         stream.close()
28
29     if error_happened:
30         return 1
31     
32     return 0
33
34 def make_parser():
35     parser = OptionParser()
36     parser.add_option("--fastq", action="store_true", default=False,
37                       help="verify arguments are valid fastq file")
38     parser.add_option("--uniform-lengths", action="store_true", default=False,
39                       help="require all reads to be of the same length")
40     parser.add_option("--max-errors", type="int", default=None)
41     encodings=['phred33', 'phred64']
42     parser.add_option("--format", type="choice",
43                       choices=encodings,
44                       default='phred64',
45                       help="choose quality encoding one of: %s" % (", ".join(encodings)))
46                       
47     return parser
48
49
50 def validate_fastq(stream, format='phred33', uniform_length=False, max_errors=None):
51     """Validate that a fastq file isn't corrupted
52
53     uniform_length - requires that all sequence & qualities must be
54                      the same lengths.
55
56     returns number of errors found
57     """
58     FQ_NONE = 0
59     FQ_H1 = 1
60     FQ_SEQ = 2
61     FQ_H2 = 3
62     FQ_QUAL = 4
63     h1_re = re.compile("^@[\s\w:-]*$")
64     seq_re = re.compile("^[AGCT.N]+$", re.IGNORECASE)
65     h2_re = re.compile("^\+[\s\w:-]*$")
66     phred33 = re.compile("^[!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ]+$")
67     phred64 = re.compile("^[@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh]+$")
68
69     if format == 'phred33':
70         quality_re = phred33
71     elif format == 'phred64':
72         quality_re = phred64
73     else:
74         raise ValueError("Unrecognized quality format name")
75
76     state = FQ_H1
77     length = None
78     line_number = 1
79     errors = 0
80     for line in stream:
81         line = line.rstrip()
82         len_errors = 0
83         if state == FQ_H1:
84             # reset length at start of new record for non-uniform check
85             if not uniform_length:
86                 length = None
87             # start of record checks
88             errors += validate_re(h1_re, line, line_number, "FAIL H1")
89             state = FQ_SEQ
90         elif state == FQ_SEQ:
91             errors += validate_re(seq_re, line, line_number, "FAIL SEQ")
92             length, len_errors = validate_length(line, length, line_number,
93                                                  "FAIL SEQ LEN")
94             errors += len_errors
95             state = FQ_H2
96         elif state == FQ_H2:
97             errors += validate_re(h2_re, line, line_number, "FAIL H2")
98             state = FQ_QUAL
99         elif state == FQ_QUAL:
100             errors += validate_re(quality_re, line, line_number, "FAIL QUAL")
101             length, len_errors = validate_length(line, length, line_number,
102                                                  "FAIL QUAL LEN")
103             errors += len_errors
104             state = FQ_H1
105         else:
106             raise RuntimeError("Invalid state: %d" % (state,))
107         line_number += 1
108         if max_errors is not None and errors > max_errors:
109             break
110         
111     return errors
112
113 def validate_re(pattern, line, line_number, errmsg):
114     if pattern.match(line) is None:
115         LOGGER.error("%s [%d]: %s", errmsg, line_number, line)
116         return 1
117     else:
118         return 0
119
120 def validate_length(line, line_length, line_number, errmsg):
121     """
122     if line_length is None, sets it
123     """
124     error_count = 0
125     if line_length is None:
126         line_length = len(line)
127     elif len(line) != line_length:
128         LOGGER.error("%s %d: %s", errmsg, line_number, line)
129         error_count = 1
130     return line_length, error_count
131