From 5eaabe17d8c4c780cf989da0e837b800d46d1674 Mon Sep 17 00:00:00 2001
From: Diane Trout <diane@caltech.edu>
Date: Mon, 9 May 2011 14:52:04 -0700
Subject: [PATCH] Validate fastq files in both phred33 & phred64 versions Do
 not validate some imaginary fasta/fastq hybrid that I imagined late some
 night.

I needed to add a parameter to pick which fastq format is in use.
---
 settings.py => htsworkflow/settings.py |  0
 htsworkflow/util/test/test_validate.py | 28 ++++++----
 htsworkflow/util/validate.py           | 72 ++++++++++++++++++--------
 3 files changed, 70 insertions(+), 30 deletions(-)
 rename settings.py => htsworkflow/settings.py (100%)

diff --git a/settings.py b/htsworkflow/settings.py
similarity index 100%
rename from settings.py
rename to htsworkflow/settings.py
diff --git a/htsworkflow/util/test/test_validate.py b/htsworkflow/util/test/test_validate.py
index e1906eb..513a3a3 100644
--- a/htsworkflow/util/test/test_validate.py
+++ b/htsworkflow/util/test/test_validate.py
@@ -5,33 +5,43 @@ import unittest
 from htsworkflow.util import validate
 
 class TestValidate(unittest.TestCase):
-    def test_fastq_works(self):
-        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n")
+    def test_phred33_works(self):
+        q = StringIO(u"@ abc\nAGCT\n+\nBBBB\n")
         errors = validate.validate_fastq(q)
         self.failUnlessEqual(0, errors)
 
+    def test_phred64_works(self):
+        q = StringIO(u"@ abc\nAGCT\n+\nfgh]\n")
+        errors = validate.validate_fastq(q, 'phred64')
+        self.failUnlessEqual(0, errors)
+
+    def test_fasta_fails(self):
+        q = StringIO(u">abc\nAGCT\n>foo\nCGAT\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(3, errors)
+
     def test_fastq_diff_length_uniform(self):
-        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n> abcd\nAGCTT\n@\nJJJJJ\n")
-        errors = validate.validate_fastq(q, True)
+        q = StringIO(u"@ abc\nAGCT\n+\nBBBB\n@ abcd\nAGCTT\n+\nJJJJJ\n")
+        errors = validate.validate_fastq(q, 'phred33', True)
         self.failUnlessEqual(2, errors)
 
     def test_fastq_diff_length_variable(self):
-        q = StringIO(u"> abc\nAGCT\n@\n@@@@\n> abcd\nAGCTT\n@\nJJJJJ\n")
-        errors = validate.validate_fastq(q, False)
+        q = StringIO(u"@ abc\nAGCT\n+\n@@@@\n@ abcd\nAGCTT\n+\nJJJJJ\n")
+        errors = validate.validate_fastq(q, 'phred33', False)
         self.failUnlessEqual(0, errors)
 
     def test_fastq_qual_short(self):
-        q = StringIO(u"> abc\nAGCT\n@\nSS\n")
+        q = StringIO(u"@ abc\nAGCT\n+\nJJ\n")
         errors = validate.validate_fastq(q)
         self.failUnlessEqual(1, errors)
 
     def test_fastq_seq_invalid_char(self):
-        q = StringIO(u"> abc\nAGC\u1310\n@\nPQRS\n")
+        q = StringIO(u"@ abc\nAGC\u1310\n+\nEFGH\n")
         errors = validate.validate_fastq(q)
         self.failUnlessEqual(1, errors)
 
     def test_fastq_qual_invalid_char(self):
-        q = StringIO(u"> abc\nAGC.\n@\n!@#J\n")
+        q = StringIO(u"+ abc\nAGC.\n+\n!@#J\n")
         errors = validate.validate_fastq(q)
         self.failUnlessEqual(1, errors)
 
diff --git a/htsworkflow/util/validate.py b/htsworkflow/util/validate.py
index f7b8212..959acc2 100644
--- a/htsworkflow/util/validate.py
+++ b/htsworkflow/util/validate.py
@@ -9,11 +9,24 @@ def main(cmdline=None):
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
+    error_happened = False
     for filename in args[1:]:
         stream = open(filename, 'r')
+        
         if opts.fastq:
-            validate_fastq(f, opts.uniform_lengths)
+            errors = validate_fastq(stream,
+                                    opts.format,
+                                    opts.uniform_lengths,
+                                    opts.max_errors)
+            if errors > 0:
+                print "%s failed validation" % (filename,)
+                error_happened = True
+
         stream.close()
+
+    if error_happened:
+        return 1
+    
     return 0
 
 def make_parser():
@@ -22,11 +35,17 @@ def make_parser():
                       help="verify arguments are valid fastq file")
     parser.add_option("--uniform-lengths", action="store_true", default=False,
                       help="require all reads to be of the same length")
+    parser.add_option("--max-errors", type="int", default=None)
+    encodings=['phred33', 'phred64']
+    parser.add_option("--format", type="choice",
+                      choices=encodings,
+                      default='phred64',
+                      help="choose quality encoding one of: %s" % (", ".join(encodings)))
                       
     return parser
 
 
-def validate_fastq(stream, uniform_length=False):
+def validate_fastq(stream, format='phred33', uniform_length=False, max_errors=None):
     """Validate that a fastq file isn't corrupted
 
     uniform_length - requires that all sequence & qualities must be
@@ -39,61 +58,72 @@ def validate_fastq(stream, uniform_length=False):
     FQ_SEQ = 2
     FQ_H2 = 3
     FQ_QUAL = 4
-    h1_re = re.compile("^>[ \t\w]*$")
+    h1_re = re.compile("^@[\s\w:-]*$")
     seq_re = re.compile("^[AGCT.N]+$", re.IGNORECASE)
-    h2_re = re.compile("^@[ \t\w]*$")
+    h2_re = re.compile("^\+[\s\w:-]*$")
     phred33 = re.compile("^[!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ]+$")
     phred64 = re.compile("^[@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh]+$")
 
+    if format == 'phred33':
+        quality_re = phred33
+    elif format == 'phred64':
+        quality_re = phred64
+    else:
+        raise ValueError("Unrecognized quality format name")
+
     state = FQ_H1
     length = None
     line_number = 1
     errors = 0
     for line in stream:
         line = line.rstrip()
+        len_errors = 0
         if state == FQ_H1:
             # reset length at start of new record for non-uniform check
             if not uniform_length:
                 length = None
             # start of record checks
-            errors = validate_re(h1_re, line, line_number, errors,
-                                 "FAIL H1")
+            errors += validate_re(h1_re, line, line_number, "FAIL H1")
             state = FQ_SEQ
         elif state == FQ_SEQ:
-            errors = validate_re(seq_re, line, line_number, errors,
-                                 "FAIL SEQ")
-            length, errors = validate_length(line, length, line_number,
-                                             errors,
-                                             "FAIL SEQ LEN")
+            errors += validate_re(seq_re, line, line_number, "FAIL SEQ")
+            length, len_errors = validate_length(line, length, line_number,
+                                                 "FAIL SEQ LEN")
+            errors += len_errors
             state = FQ_H2
         elif state == FQ_H2:
-            errors = validate_re(h2_re, line, line_number, errors, "FAIL H2")
+            errors += validate_re(h2_re, line, line_number, "FAIL H2")
             state = FQ_QUAL
         elif state == FQ_QUAL:
-            errors = validate_re(phred64, line, line_number, errors,
-                                 "FAIL QUAL")
-            length, errors = validate_length(line, length, line_number, errors,
-                                            "FAIL QUAL LEN")
+            errors += validate_re(quality_re, line, line_number, "FAIL QUAL")
+            length, len_errors = validate_length(line, length, line_number,
+                                                 "FAIL QUAL LEN")
+            errors += len_errors
             state = FQ_H1
         else:
             raise RuntimeError("Invalid state: %d" % (state,))
         line_number += 1
+        if max_errors is not None and errors > max_errors:
+            break
+        
     return errors
 
-def validate_re(pattern, line, line_number, error_count, errmsg):
+def validate_re(pattern, line, line_number, errmsg):
     if pattern.match(line) is None:
         print errmsg, "[%d]: %s" % (line_number, line)
-        error_count += 1
-    return error_count
+        return 1
+    else:
+        return 0
 
-def validate_length(line, line_length, line_number, error_count, errmsg):
+def validate_length(line, line_length, line_number, errmsg):
     """
     if line_length is None, sets it
     """
+    error_count = 0
     if line_length is None:
         line_length = len(line)
     elif len(line) != line_length:
         print errmsg, "%d: %s" %(line_number, line)
-        error_count += 1
+        error_count = 1
     return line_length, error_count
     
-- 
2.30.2