From 0d119d6437fe30721d752d75a380c02e60433d1a Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 7 Jan 2011 17:22:36 -0800 Subject: [PATCH] Add validation script Current version just validates illumina phred64 fastq files. But I can add more later. --- htsworkflow/util/test/test_validate.py | 44 ++++++++++++ htsworkflow/util/validate.py | 99 ++++++++++++++++++++++++++ scripts/htsw-validate | 6 ++ setup.py | 1 + 4 files changed, 150 insertions(+) create mode 100644 htsworkflow/util/test/test_validate.py create mode 100644 htsworkflow/util/validate.py create mode 100755 scripts/htsw-validate diff --git a/htsworkflow/util/test/test_validate.py b/htsworkflow/util/test/test_validate.py new file mode 100644 index 0000000..e1906eb --- /dev/null +++ b/htsworkflow/util/test/test_validate.py @@ -0,0 +1,44 @@ +import os +from StringIO import StringIO +import unittest + +from htsworkflow.util import validate + +class TestValidate(unittest.TestCase): + def test_fastq_works(self): + q = StringIO(u"> abc\nAGCT\n@\nBBBB\n") + errors = validate.validate_fastq(q) + self.failUnlessEqual(0, errors) + + def test_fastq_diff_length_uniform(self): + q = StringIO(u"> abc\nAGCT\n@\nBBBB\n> abcd\nAGCTT\n@\nJJJJJ\n") + errors = validate.validate_fastq(q, True) + self.failUnlessEqual(2, errors) + + def test_fastq_diff_length_variable(self): + q = StringIO(u"> abc\nAGCT\n@\n@@@@\n> abcd\nAGCTT\n@\nJJJJJ\n") + errors = validate.validate_fastq(q, False) + self.failUnlessEqual(0, errors) + + def test_fastq_qual_short(self): + q = StringIO(u"> abc\nAGCT\n@\nSS\n") + errors = validate.validate_fastq(q) + self.failUnlessEqual(1, errors) + + def test_fastq_seq_invalid_char(self): + q = StringIO(u"> abc\nAGC\u1310\n@\nPQRS\n") + errors = validate.validate_fastq(q) + self.failUnlessEqual(1, errors) + + def test_fastq_qual_invalid_char(self): + q = StringIO(u"> abc\nAGC.\n@\n!@#J\n") + errors = validate.validate_fastq(q) + self.failUnlessEqual(1, errors) + +def suite(): + return unittest.makeSuite(testValidate, 'test') + +if __name__ == "__main__": + unittest.main(defaultTest='suite') + + diff --git a/htsworkflow/util/validate.py b/htsworkflow/util/validate.py new file mode 100644 index 0000000..f7b8212 --- /dev/null +++ b/htsworkflow/util/validate.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +from optparse import OptionParser +import os +import re +import sys + +def main(cmdline=None): + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + for filename in args[1:]: + stream = open(filename, 'r') + if opts.fastq: + validate_fastq(f, opts.uniform_lengths) + stream.close() + return 0 + +def make_parser(): + parser = OptionParser() + parser.add_option("--fastq", action="store_true", default=False, + help="verify arguments are valid fastq file") + parser.add_option("--uniform-lengths", action="store_true", default=False, + help="require all reads to be of the same length") + + return parser + + +def validate_fastq(stream, uniform_length=False): + """Validate that a fastq file isn't corrupted + + uniform_length - requires that all sequence & qualities must be + the same lengths. + + returns number of errors found + """ + FQ_NONE = 0 + FQ_H1 = 1 + FQ_SEQ = 2 + FQ_H2 = 3 + FQ_QUAL = 4 + h1_re = re.compile("^>[ \t\w]*$") + seq_re = re.compile("^[AGCT.N]+$", re.IGNORECASE) + h2_re = re.compile("^@[ \t\w]*$") + phred33 = re.compile("^[!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ]+$") + phred64 = re.compile("^[@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh]+$") + + state = FQ_H1 + length = None + line_number = 1 + errors = 0 + for line in stream: + line = line.rstrip() + if state == FQ_H1: + # reset length at start of new record for non-uniform check + if not uniform_length: + length = None + # start of record checks + errors = validate_re(h1_re, line, line_number, errors, + "FAIL H1") + state = FQ_SEQ + elif state == FQ_SEQ: + errors = validate_re(seq_re, line, line_number, errors, + "FAIL SEQ") + length, errors = validate_length(line, length, line_number, + errors, + "FAIL SEQ LEN") + state = FQ_H2 + elif state == FQ_H2: + errors = validate_re(h2_re, line, line_number, errors, "FAIL H2") + state = FQ_QUAL + elif state == FQ_QUAL: + errors = validate_re(phred64, line, line_number, errors, + "FAIL QUAL") + length, errors = validate_length(line, length, line_number, errors, + "FAIL QUAL LEN") + state = FQ_H1 + else: + raise RuntimeError("Invalid state: %d" % (state,)) + line_number += 1 + return errors + +def validate_re(pattern, line, line_number, error_count, errmsg): + if pattern.match(line) is None: + print errmsg, "[%d]: %s" % (line_number, line) + error_count += 1 + return error_count + +def validate_length(line, line_length, line_number, error_count, errmsg): + """ + if line_length is None, sets it + """ + if line_length is None: + line_length = len(line) + elif len(line) != line_length: + print errmsg, "%d: %s" %(line_number, line) + error_count += 1 + return line_length, error_count + diff --git a/scripts/htsw-validate b/scripts/htsw-validate new file mode 100755 index 0000000..52b1b74 --- /dev/null +++ b/scripts/htsw-validate @@ -0,0 +1,6 @@ +import sys + +from htsworkflow.util import validate + +if __name__ == "__main__": + sys.exit(validate.main(sys.argv)) diff --git a/setup.py b/setup.py index f074100..0c1a203 100644 --- a/setup.py +++ b/setup.py @@ -34,5 +34,6 @@ setup( "scripts/htsw-srf", "scripts/htsw-srf2fastq", "scripts/htsw-update-archive", + "scripts/htsw-validate", ], ) -- 2.30.2