Merge branch 'master' into debianized debianized
author Diane Trout <diane@caltech.edu>
Mon, 7 Feb 2011 23:53:55 +0000 (15:53 -0800)
committer Diane Trout <diane@caltech.edu>
Mon, 7 Feb 2011 23:53:55 +0000 (15:53 -0800)
12 files changed:
MANIFEST.in
extra/ucsc_encode_submission/ucsc_gather.py
htsworkflow/frontend/samples/models.py
htsworkflow/frontend/templates/samples/library_detail.html
htsworkflow/frontend/urls.py
htsworkflow/pipelines/qseq2fastq.py
htsworkflow/pipelines/test/test_eland.py
htsworkflow/pipelines/test/test_qseq2fastq.py [new file with mode: 0644]
htsworkflow/util/test/test_validate.py [new file with mode: 0644]
htsworkflow/util/validate.py [new file with mode: 0644]
scripts/htsw-validate [new file with mode: 0755]
setup.py

index c4e95cde4d873e21c79e3870647bbebb5c8ace78..c96b250a1ed7f752d2e79ca91e85019ad77deffd 100644 (file)
@@ -1,2 +1,2 @@
 include RELEASE-VERSION
-version.py
+include version.py
index d160b219b7014fc01d02d1068276168642cc3c85..dc5fe184cb14204499671990f26ba56b414c1f1c 100755 (executable)
@@ -46,6 +46,9 @@ def main(cmdline=None):
     for a in args:
         library_result_map.extend(read_library_result_map(a))
 
+    if opts.make_tree_from is not None:
+        make_tree_from(opts.make_tree_from, library_result_map)
+            
     if opts.daf is not None:
         link_daf(opts.daf, library_result_map)
 
@@ -83,6 +86,9 @@ def make_parser():
     parser = OptionParser()
 
     # commands
+    parser.add_option('--make-tree-from',
+                      help="create directories & link data files",
+                      default=None)
     parser.add_option('--fastq', help="generate scripts for making fastq files",
                       default=False, action="store_true")
 
@@ -113,6 +119,26 @@ def make_parser():
     return parser
 
 
+def make_tree_from(source_path, library_result_map):
+    """For each (lib_id, lib_path) pair: create lib_path if it is missing,
+    then symlink every entry of source_path/lib_path into it.
+    """
+    for lib_id, lib_path in library_result_map:
+        if not os.path.exists(lib_path):
+            logging.info("Making dir {0}".format(lib_path))
+            os.mkdir(lib_path)
+        # absolute, so each link still resolves from inside lib_path
+        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
+        for filename in os.listdir(source_lib_dir):  # OSError if absent
+            source_pathname = os.path.join(source_lib_dir, filename)
+            target_pathname = os.path.join(lib_path, filename)
+            if not os.path.exists(source_pathname):  # e.g. a dangling link
+                raise IOError("{0} does not exist".format(source_pathname))
+            if not os.path.exists(target_pathname):
+                os.symlink(source_pathname, target_pathname)
+                logging.info(
+                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
+    
 def build_fastqs(host, apidata, sequences_path, library_result_map, 
                  force=False ):
     """
@@ -423,12 +449,14 @@ def make_condor_archive_script(ininame, files):
     script = """Universe = vanilla
 
 Executable = /bin/tar
-arguments = czvf ../%(archivename)s %(filelist)s
+arguments = czvhf ../%(archivename)s %(filelist)s
 
 Error = compress.err.$(Process).log
 Output = compress.out.$(Process).log
 Log = /tmp/submission-compress-%(user)s.log
 initialdir = %(initialdir)s
+environment="GZIP=-3"
+request_memory = 20
 
 queue 
 """
@@ -596,10 +624,11 @@ class NameToViewMap(object):
 
         self.patterns = [
             ('*.bai',                   None),
-            ('*.bam',                   self._guess_bam_view),
             ('*.splices.bam',           'Splices'),
+            ('*.bam',                   self._guess_bam_view),
             ('junctions.bed',           'Junctions'),
             ('*.jnct',                  'Junctions'),
+            ('*.unique.bigwig',         None),
             ('*.plus.bigwig',           'PlusSignal'),
             ('*.minus.bigwig',          'MinusSignal'),
             ('*.bigwig',                'Signal'),
@@ -607,21 +636,25 @@ class NameToViewMap(object):
             ('*.condor',                None),
             ('*.daf',                   None),
             ('*.ddf',                   None),
-            ('cufflinks-0.9.0-genes.expr',       'GeneDeNovo'),
-            ('cufflinks-0.9.0-transcripts.expr', 'TranscriptDeNovo'),
-            ('cufflinks-0.9.0-transcripts.gtf',  'GeneModel'),
-            ('GENCODE-v3c-genes.expr',       'GeneGencV3c'),
-            ('GENCODE-v3c-transcripts.expr', 'TranscriptGencV3c'),
-            ('GENCODE-v4-genes.expr',       'GeneGencV4'),
-            ('GENCODE-v4-transcripts.expr', 'TranscriptGencV4'),
-            ('GENCODE-v4-transcript.expr', 'TranscriptGencV4'),
+            ('*.?ufflinks-0.9.0?genes.expr',       'GeneDeNovo'),
+            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
+            ('*.?ufflinks-0.9.0?transcripts.gtf',  'GeneModel'),
+            ('*.GENCODE-v3c?genes.expr',       'GeneGCV3c'),
+            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
+            ('*.GENCODE-v3c?transcript*.gtf',  'TranscriptGencV3c'),
+            ('*.GENCODE-v4?genes.expr',        None), #'GeneGCV4'),
+            ('*.GENCODE-v4?transcript*.expr',  None), #'TranscriptGCV4'),
+            ('*.GENCODE-v4?transcript*.gtf',   None), #'TranscriptGencV4'),
+            ('*_1.75mers.fastq',              'FastqRd1'),
+            ('*_2.75mers.fastq',              'FastqRd2'),
             ('*_r1.fastq',              'FastqRd1'),
             ('*_r2.fastq',              'FastqRd2'),
             ('*.fastq',                 'Fastq'),
             ('*.gtf',                   'GeneModel'),
             ('*.ini',                   None),
             ('*.log',                   None),
-            ('*.stats.txt',             'InsLength'),
+            ('paired-end-distribution*', 'InsLength'),
+            ('*.stats.txt',              'InsLength'),
             ('*.srf',                   None),
             ('*.wig',                   None),
             ('*.zip',                   None),
@@ -630,6 +663,7 @@ class NameToViewMap(object):
         self.views = {
             None: {"MapAlgorithm": "NA"},
             "Paired": {"MapAlgorithm": ma},
+            "Aligns": {"MapAlgorithm": ma},
             "Single": {"MapAlgorithm": ma},
             "Splices": {"MapAlgorithm": ma},
             "Junctions": {"MapAlgorithm": ma},
@@ -639,14 +673,14 @@ class NameToViewMap(object):
             "GeneModel": {"MapAlgorithm": ma},
             "GeneDeNovo": {"MapAlgorithm": ma},
             "TranscriptDeNovo": {"MapAlgorithm": ma},
-            "GeneGencV3c": {"MapAlgorithm": ma},
+            "GeneGCV3c": {"MapAlgorithm": ma},
+            "TranscriptGCV3c": {"MapAlgorithm": ma},
             "TranscriptGencV3c": {"MapAlgorithm": ma},
-            "GeneGencV4": {"MapAlgorithm": ma},
-            "TranscriptGencV4": {"MapAlgorithm": ma},
+            "GeneGCV4": {"MapAlgorithm": ma},
+            "TranscriptGCV4": {"MapAlgorithm": ma},
             "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
             "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
             "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
-            "GeneModel": {"MapAlgorithm": ma},
             "InsLength": {"MapAlgorithm": ma},
             }
         # view name is one of the attributes
@@ -695,7 +729,7 @@ class NameToViewMap(object):
         if is_paired:
             return "Paired"
         else:
-            return "Align"
+            return "Aligns"
 
 
     def _is_paired(self, lib_id, lib_info):
index 0a65651abb02a54c4bb397ec93a12a058d73d20f..59b4c244d8b77805207a3562de316172463113fc 100644 (file)
@@ -161,7 +161,8 @@ class Library(models.Model):
       ('1A', 'Ligation, then gel'),
       ('PCR', 'Ligation, then PCR'),
       ('1Ab', 'Ligation, PCR, then gel'),
-      ('1Aa', 'Ligation, gel, then PCR'),
+      ('1Ac', 'Ligation, gel, then 12x PCR'),
+      ('1Aa', 'Ligation, gel, then 18x PCR'),
       ('2A', 'Ligation, PCR, gel, PCR'),
       ('Done', 'Completed'),
     )
index e790fc491e679c6c022e38708bbb3517d60b167d..6c6fa8d93a79015a4e3daf22941ea3cdc1a8d3da 100644 (file)
@@ -66,6 +66,7 @@
   <b>Concentration</b>: {{ lib.undiluted_concentration }} ng/µl<br/>
   <b>Gel Cut Size</b>: {{ lib.gel_cut_size }}<br/>
   <b>Insert Size</b>: {{ lib.insert_size }}<br/>
+  <b>Replicate</b>: {{ lib.replicate }}<br/>
   <b>Made By</b>: {{ lib.made_by }} <br/>
   <b>Affiliations</b>:
   <ul>
index 57d00e1daf4d39664ca96b66daa08015a1686dd8..7856f0628c62f2f57bfc928f4390e3abbdd49dc9 100644 (file)
@@ -43,9 +43,9 @@ urlpatterns = patterns('',
     # sample / library information
     (r'^samples/', include('htsworkflow.frontend.samples.urls')),                   
     # Raw result files
-    (r'^results/(?P<flowcell_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/summary/',
+    (r'^results/(?P<flowcell_id>\w+)/(?P<cnm>C[0-9]+-[0-9]+)/summary/',
       'htsworkflow.frontend.samples.views.summaryhtm_fc_cnm'),
-    (r'^results/(?P<flowcell_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/eland_result/(?P<lane>[1-8])',
+    (r'^results/(?P<flowcell_id>\w+)/(?P<cnm>C[0-9]+-[0-9]+)/eland_result/(?P<lane>[1-8])',
       'htsworkflow.frontend.samples.views.result_fc_cnm_eland_lane'),
     (r'^results/(?P<fc_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/bedfile/(?P<lane>[1-8])/ucsc',
       'htsworkflow.frontend.samples.views.bedfile_fc_cnm_eland_lane_ucsc'),
index 286663b24e43ad9c11177fbe1d47fb1ea5fbc946..ac44c1edde131f198e5d02f9d64a644bcdb00f68 100755 (executable)
@@ -43,7 +43,7 @@ def main(cmdline=None):
 
 
 def make_parser():
-    usage = "%prog: [options] *_qseq.txt"
+    usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
     parser = OptionParser(usage)
     parser.add_option("-a", "--fasta", default=False, action="store_true",
                       help="produce fasta files instead of fastq files")
index bd215c1b13a4a7d6dd402b4778e46e99e903e503..4ffb3e7db1120e893690e8684c6a89f87dd7111c 100644 (file)
@@ -164,7 +164,7 @@ class ElandTests(unittest.TestCase):
 
 
 def suite():
-    return unittest.makeSquite(ElandTests, 'test')
+    return unittest.makeSuite(ElandTests, 'test')
 
 if __name__ == "__main__":
     unittest.main(defaultTest="suite")
diff --git a/htsworkflow/pipelines/test/test_qseq2fastq.py b/htsworkflow/pipelines/test/test_qseq2fastq.py
new file mode 100644 (file)
index 0000000..1c32924
--- /dev/null
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+import unittest
+
+import htsworkflow.pipelines.qseq2fastq as qseq2fastq
+
+class TestQseq2Fastq(unittest.TestCase):
+    """Exercise qseq2fastq.parse_slice on 'start:stop' strings."""
+    def test_parse_slice(self):
+        # (text, expected start, expected stop); an omitted stop is None
+        expectations = (("1:", 1, None), ("0:2", 0, 2))
+        for text, start, stop in expectations:
+            parsed = qseq2fastq.parse_slice(text)
+            self.failUnlessEqual(parsed.start, start)
+            self.failUnlessEqual(parsed.stop, stop)
+
+def suite():
+    return unittest.makeSuite(TestQseq2Fastq, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
+    
diff --git a/htsworkflow/util/test/test_validate.py b/htsworkflow/util/test/test_validate.py
new file mode 100644 (file)
index 0000000..e1906eb
--- /dev/null
@@ -0,0 +1,44 @@
+import os
+from StringIO import StringIO
+import unittest
+
+from htsworkflow.util import validate
+
+class TestValidate(unittest.TestCase):  # validate_fastq error counts
+    def test_fastq_works(self):  # one well-formed record
+        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(0, errors)
+
+    def test_fastq_diff_length_uniform(self):  # 2nd record is longer
+        q = StringIO(u"> abc\nAGCT\n@\nBBBB\n> abcd\nAGCTT\n@\nJJJJJ\n")
+        errors = validate.validate_fastq(q, True)
+        self.failUnlessEqual(2, errors)
+
+    def test_fastq_diff_length_variable(self):  # lengths may differ
+        q = StringIO(u"> abc\nAGCT\n@\n@@@@\n> abcd\nAGCTT\n@\nJJJJJ\n")
+        errors = validate.validate_fastq(q, False)
+        self.failUnlessEqual(0, errors)
+
+    def test_fastq_qual_short(self):  # 2 qualities for 4 bases
+        q = StringIO(u"> abc\nAGCT\n@\nSS\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+    def test_fastq_seq_invalid_char(self):  # non-ASCII base
+        q = StringIO(u"> abc\nAGC\u1310\n@\nPQRS\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+    def test_fastq_qual_invalid_char(self):  # '!' '#' not in phred64 set
+        q = StringIO(u"> abc\nAGC.\n@\n!@#J\n")
+        errors = validate.validate_fastq(q)
+        self.failUnlessEqual(1, errors)
+
+def suite():  # every TestValidate method whose name starts with 'test'
+    return unittest.makeSuite(TestValidate, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
+
diff --git a/htsworkflow/util/validate.py b/htsworkflow/util/validate.py
new file mode 100644 (file)
index 0000000..f7b8212
--- /dev/null
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser
+import os
+import re
+import sys
+
+def main(cmdline=None):
+    """Validate the files in cmdline[1:] (argv-style, default sys.argv)."""
+    if cmdline is None:
+        cmdline = sys.argv  # else optparse uses sys.argv[1:], losing a file
+    opts, args = make_parser().parse_args(cmdline)
+    for filename in args[1:]:
+        with open(filename, 'r') as stream:
+            if opts.fastq:
+                validate_fastq(stream, opts.uniform_lengths)
+    return 0
+
+def make_parser():
+    """Return an OptionParser with boolean --fastq and --uniform-lengths."""
+    parser = OptionParser()
+    for flag, text in (
+        ("--fastq", "verify arguments are valid fastq file"),
+        ("--uniform-lengths", "require all reads to be of the same length")):
+        parser.add_option(flag, action="store_true", default=False, help=text)
+    return parser
+
+
+def validate_fastq(stream, uniform_length=False):
+    """Validate that a fastq file isn't corrupted
+
+    stream - iterable of text lines, four per record (qualities: phred64)
+    uniform_length - requires that all sequence & qualities must be
+                     the same lengths.
+
+    returns number of errors found
+    """
+    FQ_H1 = 1
+    FQ_SEQ = 2
+    FQ_H2 = 3
+    FQ_QUAL = 4
+    # accept standard '@'/'+' header markers as well as the older '>'/'@'
+    h1_re = re.compile(r"^[>@][ \t\w]*$")
+    seq_re = re.compile(r"^[AGCT.N]+$", re.IGNORECASE)
+    h2_re = re.compile(r"^[@+][ \t\w]*$")
+    phred64 = re.compile(r"^[@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh]+$")
+
+    state = FQ_H1
+    length = None
+    line_number = 1
+    errors = 0
+    for line in stream:
+        line = line.rstrip()
+        if state == FQ_H1:
+            # reset length at start of new record for non-uniform check
+            if not uniform_length:
+                length = None
+            # start of record checks
+            errors = validate_re(h1_re, line, line_number, errors, "FAIL H1")
+            state = FQ_SEQ
+        elif state == FQ_SEQ:
+            errors = validate_re(seq_re, line, line_number, errors, "FAIL SEQ")
+            length, errors = validate_length(line, length, line_number,
+                                             errors, "FAIL SEQ LEN")
+            state = FQ_H2
+        elif state == FQ_H2:
+            errors = validate_re(h2_re, line, line_number, errors, "FAIL H2")
+            state = FQ_QUAL
+        elif state == FQ_QUAL:
+            errors = validate_re(phred64, line, line_number, errors,
+                                 "FAIL QUAL")
+            length, errors = validate_length(line, length, line_number, errors,
+                                            "FAIL QUAL LEN")
+            state = FQ_H1
+        else:
+            raise RuntimeError("Invalid state: %d" % (state,))
+        line_number += 1
+    if state != FQ_H1:  # stream ended part way through a record
+        print "FAIL TRUNCATED [%d]: incomplete record" % (line_number - 1,)
+        errors += 1
+    return errors
+
+def validate_re(pattern, line, line_number, error_count, errmsg):
+    failed = pattern.match(line) is None  # True when the line is rejected
+    if failed:
+        print errmsg, "[%d]: %s" % (line_number, line)
+    return error_count + int(failed)
+
+def validate_length(line, line_length, line_number, error_count, errmsg):
+    """Compare len(line) to line_length, adopting it if line_length is None.
+    Returns (line_length, error_count); a mismatch is printed and counted."""
+    observed = len(line)
+    if line_length is None:
+        return observed, error_count
+    if observed != line_length:
+        print errmsg, "%d: %s" %(line_number, line)
+        error_count += 1
+    return line_length, error_count
+    
diff --git a/scripts/htsw-validate b/scripts/htsw-validate
new file mode 100755 (executable)
index 0000000..52b1b74
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+import sys
+from htsworkflow.util import validate
+
+if __name__ == "__main__":
+    sys.exit(validate.main(sys.argv))  # main() skips argv[0] itself
index f074100418f83647cc60481d8f1af4493763b608..0c1a203d01999d2c8af669aae16642d8012ca768 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -34,5 +34,6 @@ setup(
         "scripts/htsw-srf",
         "scripts/htsw-srf2fastq",
         "scripts/htsw-update-archive",
+        "scripts/htsw-validate",
         ],
     )