Add validation script 0.5.0
authorDiane Trout <diane@caltech.edu>
Sat, 8 Jan 2011 01:22:36 +0000 (17:22 -0800)
committerDiane Trout <diane@caltech.edu>
Sat, 8 Jan 2011 01:22:36 +0000 (17:22 -0800)
Current version just validates illumina phred64 fastq files.
But I can add more later.

htsworkflow/util/test/test_validate.py [new file with mode: 0644]
htsworkflow/util/validate.py [new file with mode: 0644]
scripts/htsw-validate [new file with mode: 0755]
setup.py

diff --git a/htsworkflow/util/test/test_validate.py b/htsworkflow/util/test/test_validate.py
new file mode 100644 (file)
index 0000000..e1906eb
--- /dev/null
@@ -0,0 +1,44 @@
+import os
+from StringIO import StringIO
+import unittest
+
+from htsworkflow.util import validate
+
+class TestValidate(unittest.TestCase):
+    def test_fastq_works(self):
+        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(0, errors)
+
+    def test_fastq_diff_length_uniform(self):
+        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n> abcd\nAGCTT\n@\nJJJJJ\n")
+        errors = validate.validate_fastq(q, True)
+        self.failUnlessEqual(2, errors)
+
+    def test_fastq_diff_length_variable(self):
+        q = StringIO(u"> abc\nAGCT\n@\n@@@@\n> abcd\nAGCTT\n@\nJJJJJ\n")
+        errors = validate.validate_fastq(q, False)
+        self.failUnlessEqual(0, errors)
+
+    def test_fastq_qual_short(self):
+        q = StringIO(u"> abc\nAGCT\n@\nSS\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+    def test_fastq_seq_invalid_char(self):
+        q = StringIO(u"> abc\nAGC\u1310\n@\nPQRS\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+    def test_fastq_qual_invalid_char(self):
+        q = StringIO(u"> abc\nAGC.\n@\n!@#J\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+def suite():
+    return unittest.makeSuite(testValidate, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
+
diff --git a/htsworkflow/util/validate.py b/htsworkflow/util/validate.py
new file mode 100644 (file)
index 0000000..f7b8212
--- /dev/null
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser
+import os
+import re
+import sys
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    for filename in args[1:]:
+        stream = open(filename, 'r')
+        if opts.fastq:
+            validate_fastq(f, opts.uniform_lengths)
+        stream.close()
+    return 0
+
+def make_parser():
+    parser = OptionParser()
+    parser.add_option("--fastq", action="store_true", default=False,
+                      help="verify arguments are valid fastq file")
+    parser.add_option("--uniform-lengths", action="store_true", default=False,
+                      help="require all reads to be of the same length")
+                      
+    return parser
+
+
+def validate_fastq(stream, uniform_length=False):
+    """Validate that a fastq file isn't corrupted
+
+    uniform_length - requires that all sequence & qualities must be
+                     the same lengths.
+
+    returns number of errors found
+    """
+    FQ_NONE = 0
+    FQ_H1 = 1
+    FQ_SEQ = 2
+    FQ_H2 = 3
+    FQ_QUAL = 4
+    h1_re = re.compile("^>[ \t\w]*$")
+    seq_re = re.compile("^[AGCT.N]+$", re.IGNORECASE)
+    h2_re = re.compile("^@[ \t\w]*$")
+    phred33 = re.compile("^[!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ]+$")
+    phred64 = re.compile("^[@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh]+$")
+
+    state = FQ_H1
+    length = None
+    line_number = 1
+    errors = 0
+    for line in stream:
+        line = line.rstrip()
+        if state == FQ_H1:
+            # reset length at start of new record for non-uniform check
+            if not uniform_length:
+                length = None
+            # start of record checks
+            errors = validate_re(h1_re, line, line_number, errors,
+                                 "FAIL H1")
+            state = FQ_SEQ
+        elif state == FQ_SEQ:
+            errors = validate_re(seq_re, line, line_number, errors,
+                                 "FAIL SEQ")
+            length, errors = validate_length(line, length, line_number,
+                                             errors,
+                                             "FAIL SEQ LEN")
+            state = FQ_H2
+        elif state == FQ_H2:
+            errors = validate_re(h2_re, line, line_number, errors, "FAIL H2")
+            state = FQ_QUAL
+        elif state == FQ_QUAL:
+            errors = validate_re(phred64, line, line_number, errors,
+                                 "FAIL QUAL")
+            length, errors = validate_length(line, length, line_number, errors,
+                                            "FAIL QUAL LEN")
+            state = FQ_H1
+        else:
+            raise RuntimeError("Invalid state: %d" % (state,))
+        line_number += 1
+    return errors
+
+def validate_re(pattern, line, line_number, error_count, errmsg):
+    if pattern.match(line) is None:
+        print errmsg, "[%d]: %s" % (line_number, line)
+        error_count += 1
+    return error_count
+
+def validate_length(line, line_length, line_number, error_count, errmsg):
+    """
+    if line_length is None, sets it
+    """
+    if line_length is None:
+        line_length = len(line)
+    elif len(line) != line_length:
+        print errmsg, "%d: %s" %(line_number, line)
+        error_count += 1
+    return line_length, error_count
+    
diff --git a/scripts/htsw-validate b/scripts/htsw-validate
new file mode 100755 (executable)
index 0000000..52b1b74
--- /dev/null
@@ -0,0 +1,6 @@
+import sys
+
+from htsworkflow.util import validate
+
+if __name__ == "__main__":
+    sys.exit(validate.main(sys.argv))
index f074100418f83647cc60481d8f1af4493763b608..0c1a203d01999d2c8af669aae16642d8012ca768 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -34,5 +34,6 @@ setup(
         "scripts/htsw-srf",
         "scripts/htsw-srf2fastq",
         "scripts/htsw-update-archive",
+        "scripts/htsw-validate",
         ],
     )