Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
author Diane Trout <diane@caltech.edu>
Mon, 6 Aug 2012 21:17:01 +0000 (14:17 -0700)
committer Diane Trout <diane@caltech.edu>
Mon, 6 Aug 2012 21:17:01 +0000 (14:17 -0700)
12 files changed:
encode_submission/ucsc_gather.py
htsworkflow/frontend/bcmagic/fixtures/initial_data.json
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/sequences.py
htsworkflow/pipelines/test/test_sequences.py
htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_fastqs.sparql [new file with mode: 0644]
htsworkflow/templates/geo_files.sparql
htsworkflow/templates/geo_run_details.sparql [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft
htsworkflow/templates/srf.condor

diff --git a/encode_submission/ucsc_gather.py b/encode_submission/ucsc_gather.py
index c45e3820bbc8b9f146684bef9ed8f5443fb03f8e..811ffdfc277373e86b6f0f3fe7f9782463c8bdb0 100644
@@ -21,6 +21,9 @@ from zipfile import ZipFile
 
 import RDF
 
+if not 'DJANGO_SETTINGS_MODULE' in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
      dafTermOntology, \
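
The ucsc_gather.py hunk above guards the Django-backed htsworkflow imports by defaulting DJANGO_SETTINGS_MODULE before they run. A minimal sketch of that bootstrap pattern, assuming only the htsworkflow.settings module named in the hunk:

    import os

    # Equivalent to the guarded assignment above: supply a default settings
    # module only when the caller has not already configured Django.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings')

    # Any Django-backed htsworkflow imports are safe once this is in place.
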
diff --git a/htsworkflow/frontend/bcmagic/fixtures/initial_data.json b/htsworkflow/frontend/bcmagic/fixtures/initial_data.json
index 1c82a26470cfc95d4e5f0a16baf80e34c390eea1..7b48d0b8736f7b3742ddad72dee474e5da00e6bf 100644
@@ -1,15 +1,15 @@
-[{"pk": 1, 
-  "model": "bcmagic.keywordmap", 
-  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)", 
-             "url_template": "/samples/freezer/{{ uuid }}/", 
+[{"pk": 1,
+  "model": "bcmagic.keywordmap",
+  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)",
+             "url_template": "/samples/freezer/{{ uuid }}/",
              "keyword": "frzr"}},
- {"pk": 2, 
-  "model": "bcmagic.keywordmap", 
-  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)", 
-             "url_template": "/samples/container/{{ uuid }}/", 
+ {"pk": 2,
+  "model": "bcmagic.keywordmap",
+  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)",
+             "url_template": "/samples/container/{{ uuid }}/",
              "keyword": "cntr"}},
- {"pk": 3, 
-  "model": "bcmagic.keywordmap", 
+ {"pk": 3,
+  "model": "bcmagic.keywordmap",
   "fields": {"regex": "(?P<sampleid>\\d+)\\|(?P<owner>[A-Za-z0-9_\\- ]+)",
              "url_template": "/samples/sample/{{ sampleid }}/",
              "keyword": "s"}},
   "model": "bcmagic.keywordmap",
   "fields": {"regex": "(?P<search>[\\S\\s]+)",
              "url_template": "http://www.google.com/search?q={{ search }}",
-             "keyword": "gg"}}, 
- {"pk": 5, 
-  "model": "bcmagic.keywordmap", 
-  "fields": {"regex": "(?P<search>[\\S\\s]+)", 
-             "url_template": "http://www.flickr.com/search/?q={{ search }}", 
-             "keyword": "flickr"}}, 
- {"pk": 6, 
-  "model": "bcmagic.keywordmap", 
-  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)", 
-             "url_template": "/inventory/{{ uuid }}/", 
-             "keyword": "invu"}}, 
- {"pk": 7, 
-  "model": "bcmagic.keywordmap", 
-  "fields": {"regex": "(?P<barcode_id>.+)", 
-             "url_template": "/inventory/{{barcode_id}}/", 
-             "keyword": "invb"}}, 
- {"pk": 1, 
-  "model": "bcmagic.printer", 
-  "fields": {"name": "ZM400 1.25x1", 
-             "label_height": 1.0, 
-             "notes": "Everyday use labels", 
-             "label_width": 1.25, 
-             "label_shape": "Square", 
-             "model": "Zebra ZM400", 
-             "ip_address": "131.215.54.194"}}, 
- {"pk": 2, 
-  "model": "bcmagic.printer", 
-  "fields": {"name": "ZM400 3x3", 
-             "label_height": 3.0, 
-             "notes": "Larger everyday use labels", 
-             "label_width": 3.0, 
-             "label_shape": "Square", 
-             "model": "Zebra ZM400", 
-             "ip_address": "131.215.34.199"}}]
+             "keyword": "gg"}},
+ {"pk": 5,
+  "model": "bcmagic.keywordmap",
+  "fields": {"regex": "(?P<search>[\\S\\s]+)",
+             "url_template": "http://www.flickr.com/search/?q={{ search }}",
+             "keyword": "flickr"}},
+ {"pk": 6,
+  "model": "bcmagic.keywordmap",
+  "fields": {"regex": "(?P<uuid>[A-Fa-f0-9]+)",
+             "url_template": "/inventory/{{ uuid }}/",
+             "keyword": "invu"}},
+ {"pk": 7,
+  "model": "bcmagic.keywordmap",
+  "fields": {"regex": "(?P<barcode_id>.+)",
+             "url_template": "/inventory/{{barcode_id}}/",
+             "keyword": "invb"}},
+ {"pk": 1,
+  "model": "bcmagic.printer",
+  "fields": {"name": "ZM400 1.25x1",
+             "label_height": 1.0,
+             "notes": "Everyday use labels",
+             "label_width": 1.25,
+             "label_shape": "Square",
+             "model": "Zebra ZM400",
+             "ip_address": "131.215.34.116"}},
+ {"pk": 2,
+  "model": "bcmagic.printer",
+  "fields": {"name": "ZM400 3x3",
+             "label_height": 3.0,
+             "notes": "Larger everyday use labels",
+             "label_width": 3.0,
+             "label_shape": "Square",
+             "model": "Zebra ZM400",
+             "ip_address": "131.215.34.117"}}]
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py
index 67dc505f09aa878383f5ac5d3f0a6aa9de4f3f95..18a38e0ab1848cb128d987178c7c5ff4fc70e5d3 100644
@@ -632,7 +632,7 @@ def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
     lanes = []
     for lane in r.gerald.lanes:
         lane_parameters = r.gerald.lanes.get(lane, None)
-        if lane_parameters is not None and lane_parameters.analysis != 'none':
+        if lane_parameters is not None:
             lanes.append(lane)
 
     run_name = srf.pathname_to_run_name(r.pathname)
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py
index 772af7b432754245b7a5eddb9ab3b1de45842eb7..0e5612a8e47b393684f9ee7f69cd779cca97c607 100644
@@ -76,7 +76,7 @@ class SequenceFile(object):
     def key(self):
         return (self.flowcell, self.lane, self.read, self.project, self.split)
 
-    def unicode(self):
+    def __unicode__(self):
         return unicode(self.path)
 
     def __eq__(self, other):
@@ -98,6 +98,9 @@ class SequenceFile(object):
 
         return True
 
+    def __ne__(self, other):
+        return not self == other
+
     def __repr__(self):
         return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
 
@@ -255,8 +258,8 @@ def parse_fastq_pf_flag(records):
         elif fastq_type.startswith('all'):
             pf = None
         else:
-            raise ValueError("Unrecognized fastq name %s at %s" % \
-                             (records[-1], os.path.join(path,filename)))
+            raise ValueError("Unrecognized fastq name: %s" % (
+                "_".join(records),))
 
     return pf
 
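
The sequences.py hunks promote unicode() to the __unicode__ special method and add a __ne__ that simply negates __eq__. A self-contained sketch of that Python 2 comparison pattern; the class below is an illustrative stand-in, not the real SequenceFile:

    class PathRecord(object):
        """Illustrative stand-in for a SequenceFile-like object."""
        def __init__(self, path):
            self.path = path

        def __unicode__(self):
            # unicode(obj) now finds this special method automatically.
            return unicode(self.path)

        def __eq__(self, other):
            return isinstance(other, PathRecord) and self.path == other.path

        def __ne__(self, other):
            # Python 2 does not derive != from ==, so delegate explicitly.
            return not self == other

        def __hash__(self):
            # Keep hashing consistent with equality.
            return hash(self.path)
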
diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py
index e00f5ec8efaf12ee2a94d70749407c44fe754742..9c85f390d47cbe901f54c055bdac7efdfc832c57 100644
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 import os
+import shutil
+import tempfile
 import unittest
 
 from htsworkflow.pipelines import sequences
@@ -23,7 +25,7 @@ class SequenceFileTests(unittest.TestCase):
 
         for t in tests:
             path = sequences.get_flowcell_cycle(t[0])
-            self.failUnlessEqual(path, t[1])
+            self.assertEqual(path, t[1])
 
     def test_flowcell_cycle(self):
         """
@@ -32,13 +34,13 @@ class SequenceFileTests(unittest.TestCase):
         path = '/root/42BW9AAXX/C1-152'
         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
 
-        self.failUnlessEqual(flowcell, '42BW9AAXX')
-        self.failUnlessEqual(start, 1)
-        self.failUnlessEqual(stop, 152)
-        self.failUnlessEqual(project, None)
+        self.assertEqual(flowcell, '42BW9AAXX')
+        self.assertEqual(start, 1)
+        self.assertEqual(stop, 152)
+        self.assertEqual(project, None)
 
         path = '/root/42BW9AAXX/other'
-        self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
+        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
 
     def test_flowcell_project_cycle(self):
         """
@@ -47,110 +49,203 @@ class SequenceFileTests(unittest.TestCase):
         path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
 
-        self.failUnlessEqual(flowcell, '42BW9AAXX')
-        self.failUnlessEqual(start, 1)
-        self.failUnlessEqual(stop, 152)
-        self.failUnlessEqual(project, 'Project_12345_Index1')
+        self.assertEqual(flowcell, '42BW9AAXX')
+        self.assertEqual(start, 1)
+        self.assertEqual(stop, 152)
+        self.assertEqual(project, 'Project_12345_Index1')
 
         path = '/root/42BW9AAXX/other'
-        self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
+        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
 
     def test_srf(self):
         path = '/root/42BW9AAXX/C1-38'
         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
+        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_5.srf'
         pathname = os.path.join(path,name)
-        f = sequences.parse_srf(path, name)
+        f0 = sequences.parse_srf(path, name)
+        f1 = sequences.parse_srf(path, name)
+        fother = sequences.parse_srf(path, other)
+
+        self.assertEqual(f0.filetype, 'srf')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<srf 42BW9AAXX 4 %s>" % (pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 4)
+        self.assertEqual(f0.read, None)
+        self.assertEqual(f0.pf, None)
+        self.assertEqual(f0.cycle, 38)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
-        self.failUnlessEqual(f.filetype, 'srf')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, None)
-        self.failUnlessEqual(f.pf, None)
-        self.failUnlessEqual(f.cycle, 38)
 
     def test_qseq(self):
         path = '/root/42BW9AAXX/C1-36'
         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
+        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1.tar.bz2'
         pathname = os.path.join(path,name)
-        f = sequences.parse_qseq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'qseq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, 1)
-        self.failUnlessEqual(f.pf, None)
-        self.failUnlessEqual(f.cycle, 36)
-
+        f0 = sequences.parse_qseq(path, name)
+        f1 = sequences.parse_qseq(path, name)
+        fother = sequences.parse_qseq(path, other)
+
+        self.assertEqual(f0.filetype, 'qseq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<qseq 42BW9AAXX 4 %s>" %(pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 4)
+        self.assertEqual(f0.read, 1)
+        self.assertEqual(f0.pf, None)
+        self.assertEqual(f0.cycle, 36)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
         path = '/root/ilmn200901/C1-202'
         name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
+        other = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r2.tar.bz2'
         pathname = os.path.join(path, name)
-        f = sequences.parse_qseq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'qseq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.lane, 1)
-        self.failUnlessEqual(f.read, 1)
-        self.failUnlessEqual(f.pf, None)
-        self.failUnlessEqual(f.cycle, 202)
+        f0 = sequences.parse_qseq(path, name)
+        f1 = sequences.parse_qseq(path, name)
+        fother = sequences.parse_qseq(path, other)
+
+        self.assertEqual(f0.filetype, 'qseq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<qseq ilmn200901 1 %s>" %(pathname,))
+        self.assertEqual(f0.lane, 1)
+        self.assertEqual(f0.read, 1)
+        self.assertEqual(f0.pf, None)
+        self.assertEqual(f0.cycle, 202)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
     def test_fastq(self):
         path = '/root/42BW9AAXX/C1-38'
         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
+        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1_pass.fastq.bz2'
         pathname = os.path.join(path,name)
-        f = sequences.parse_fastq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'fastq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, 1)
-        self.failUnlessEqual(f.pf, True)
-        self.failUnlessEqual(f.cycle, 38)
+        f0 = sequences.parse_fastq(path, name)
+        f1 = sequences.parse_fastq(path, name)
+        fother = sequences.parse_fastq(path, other)
+
+        self.assertEqual(f0.filetype, 'fastq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 4)
+        self.assertEqual(f0.read, 1)
+        self.assertEqual(f0.pf, True)
+        self.assertEqual(f0.cycle, 38)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
+        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
         pathname = os.path.join(path,name)
-        f = sequences.parse_fastq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'fastq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, 2)
-        self.failUnlessEqual(f.pf, False)
-        self.failUnlessEqual(f.cycle, 38)
+        f0 = sequences.parse_fastq(path, name)
+        f1 = sequences.parse_fastq(path, name)
+        fother = sequences.parse_fastq(path, other)
+
+        self.assertEqual(f0.filetype, 'fastq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" %(pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 4)
+        self.assertEqual(f0.read, 2)
+        self.assertEqual(f0.pf, False)
+        self.assertEqual(f0.cycle, 38)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
     def test_project_fastq(self):
         path = '/root/42BW9AAXX/C1-38/Project_12345'
         name = '11111_NoIndex_L001_R1_001.fastq.gz'
+        other = '22222_NoIndex_L001_R1_001.fastq.gz'
         pathname = os.path.join(path,name)
-        f = sequences.parse_fastq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'split_fastq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 1)
-        self.failUnlessEqual(f.read, 1)
-        self.failUnlessEqual(f.pf, True)
-        self.failUnlessEqual(f.project, '11111')
-        self.failUnlessEqual(f.index, 'NoIndex')
-        self.failUnlessEqual(f.cycle, 38)
+        f0 = sequences.parse_fastq(path, name)
+        f1 = sequences.parse_fastq(path, name)
+        fother = sequences.parse_fastq(path, other)
+
+        self.assertEqual(f0.filetype, 'split_fastq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" %(pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 1)
+        self.assertEqual(f0.read, 1)
+        self.assertEqual(f0.pf, True)
+        self.assertEqual(f0.project, '11111')
+        self.assertEqual(f0.index, 'NoIndex')
+        self.assertEqual(f0.cycle, 38)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
 
         name = '11112_AAATTT_L001_R2_003.fastq.gz'
+        other = '11112_AAATTT_L002_R2_003.fastq.gz'
         pathname = os.path.join(path,name)
-        f = sequences.parse_fastq(path, name)
-
-        self.failUnlessEqual(f.filetype, 'split_fastq')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 1)
-        self.failUnlessEqual(f.read, 2)
-        self.failUnlessEqual(f.pf, True)
-        self.failUnlessEqual(f.project, '11112')
-        self.failUnlessEqual(f.index, 'AAATTT')
-        self.failUnlessEqual(f.cycle, 38)
+        f0 = sequences.parse_fastq(path, name)
+        f1 = sequences.parse_fastq(path, name)
+        fother = sequences.parse_fastq(path, other)
+
+        self.assertEqual(f0.filetype, 'split_fastq')
+        self.assertEqual(f0.path, pathname)
+        self.assertEqual(unicode(f0), unicode(pathname))
+        self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
+        self.assertEqual(f0.flowcell, '42BW9AAXX')
+        self.assertEqual(f0.lane, 1)
+        self.assertEqual(f0.read, 2)
+        self.assertEqual(f0.pf, True)
+        self.assertEqual(f0.project, '11112')
+        self.assertEqual(f0.index, 'AAATTT')
+        self.assertEqual(f0.cycle, 38)
+        self.assertEqual(f0.make_target_name('/tmp'),
+                         os.path.join('/tmp', name))
+
+        self.assertEqual(f0, f1)
+        self.assertNotEqual(f0, fother)
+
+    def test_parse_fastq_pf_flag(self):
+        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
+        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
+                'l1', 'r2', 'nopass']
+        self.assertEqual(sequences.parse_fastq_pf_flag(data), False)
+
+        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
+                'l1', 'r2', 'pass']
+        self.assertEqual(sequences.parse_fastq_pf_flag(data), True)
+
+        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
+                'l1', 'r2', 'all']
+        self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
+
+        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
+                'l1', 'r2']
+        self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
+
+        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
+                'l1', 'r2', 'all', 'newthing']
+        self.assertRaises(ValueError, sequences.parse_fastq_pf_flag, data)
+
 
     def test_project_fastq_hashing(self):
         """Can we tell the difference between sequence files?
@@ -164,9 +259,9 @@ class SequenceFileTests(unittest.TestCase):
         for a_name, b_name in names:
             a = sequences.parse_fastq(path, a_name)
             b = sequences.parse_fastq(path, b_name)
-            self.failIfEqual(a, b)
-            self.failIfEqual(a.key(), b.key())
-            self.failIfEqual(hash(a), hash(b))
+            self.assertNotEqual(a, b)
+            self.assertNotEqual(a.key(), b.key())
+            self.assertNotEqual(hash(a), hash(b))
 
     def test_eland(self):
         path = '/root/42BW9AAXX/C1-38'
@@ -174,35 +269,30 @@ class SequenceFileTests(unittest.TestCase):
         pathname = os.path.join(path,name)
         f = sequences.parse_eland(path, name)
 
-        self.failUnlessEqual(f.filetype, 'eland')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, None)
-        self.failUnlessEqual(f.pf, None)
-        self.failUnlessEqual(f.cycle, 38)
+        self.assertEqual(f.filetype, 'eland')
+        self.assertEqual(f.path, pathname)
+        self.assertEqual(f.flowcell, '42BW9AAXX')
+        self.assertEqual(f.lane, 4)
+        self.assertEqual(f.read, None)
+        self.assertEqual(f.pf, None)
+        self.assertEqual(f.cycle, 38)
+        self.assertEqual(f.make_target_name('/tmp'),
+                         '/tmp/42BW9AAXX_38_s_4_eland_extended.txt.bz2')
 
         path = '/root/42BW9AAXX/C1-152'
         name = 's_4_1_eland_extended.txt.bz2'
         pathname = os.path.join(path,name)
         f = sequences.parse_eland(path, name)
 
-        self.failUnlessEqual(f.filetype, 'eland')
-        self.failUnlessEqual(f.path, pathname)
-        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
-        self.failUnlessEqual(f.lane, 4)
-        self.failUnlessEqual(f.read, 1)
-        self.failUnlessEqual(f.pf, None)
-        self.failUnlessEqual(f.cycle, 152)
-
-    def test_sequence_file_equality(self):
-        path = '/root/42BW9AAXX/C1-38'
-        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
-
-        f1_qseq = sequences.parse_qseq(path, name)
-        f2_qseq = sequences.parse_qseq(path, name)
-
-        self.failUnlessEqual(f1_qseq, f2_qseq)
+        self.assertEqual(f.filetype, 'eland')
+        self.assertEqual(f.path, pathname)
+        self.assertEqual(f.flowcell, '42BW9AAXX')
+        self.assertEqual(f.lane, 4)
+        self.assertEqual(f.read, 1)
+        self.assertEqual(f.pf, None)
+        self.assertEqual(f.cycle, 152)
+        self.assertEqual(f.make_target_name('/tmp'),
+                         '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')
 
     def test_sql(self):
         """
@@ -228,8 +318,65 @@ class SequenceFileTests(unittest.TestCase):
 
         count = c.execute("select count(*) from sequences")
         row = count.fetchone()
-        self.failUnlessEqual(row[0], 4)
-
+        self.assertEqual(row[0], 4)
+
+    def test_scan_for_sequences(self):
+        # simulate tree
+        seen = set()
+        should_see = set(['fastq', 'srf', 'eland', 'qseq'])
+        with SimulateTree() as tree:
+            seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
+            for s in seqs:
+                self.assertEquals(s.flowcell, '42BW9AAXX')
+                self.assertEquals(s.cycle, 33)
+                seen.add(s.filetype)
+
+            self.assertEquals(len(seqs), 8)
+
+        self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
+        self.assertEqual(len(should_see.difference(seen)), 0)
+
+class SimulateTree(object):
+    def __init__(self):
+        self.root = tempfile.mkdtemp(prefix='sequences_')
+
+        fc = self.mkflowcell(self.root, '42BW9AAXX', 'C1-33')
+        files = [
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2.md5',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2.md5',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_2.srf',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_pass.fastq.bz2',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_pass.fastq.bz2',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_nopass.fastq.bz2',
+            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_nopass.fastq.bz2',
+            's_1_eland_extended.txt.bz2',
+            's_1_eland_extended.txt.bz2.md5',
+            ]
+        for f in files:
+            self.mkfile(fc, f)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        shutil.rmtree(self.root)
+
+    def mkflowcell(self, *components):
+        head = self.root
+        for c in components:
+            head = os.path.join(head, c)
+            if not os.path.exists(head):
+                os.mkdir(head)
+        return head
+
+    def mkfile(self, flowcell, filename):
+        pathname = os.path.join(flowcell, filename)
+        stream = open(pathname,'w')
+        stream.write(pathname)
+        stream.write(os.linesep)
+        stream.close()
 
 def suite():
     return unittest.makeSuite(SequenceFileTests,'test')
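
The new SimulateTree helper above builds a disposable flowcell directory with tempfile.mkdtemp and removes it in __exit__. A stripped-down sketch of that temp-directory context manager, with illustrative names:

    import os
    import shutil
    import tempfile


    class TempTree(object):
        """Create a scratch directory and remove it on exit."""
        def __init__(self, prefix='sequences_'):
            self.root = tempfile.mkdtemp(prefix=prefix)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            shutil.rmtree(self.root)


    with TempTree() as tree:
        open(os.path.join(tree.root, 'example.txt'), 'w').close()
    # tree.root has been deleted by the time control reaches here.
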
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
index 8ff349fe9ae7444c5cc82f20c15e8458d6cc0599..a3ac2f17b904c832c9996336bb74d0186f0ddef9 100644
@@ -35,11 +35,9 @@ class GEOSubmission(Submission):
                 errmsg = 'Confused there are more than one samples for %s'
                 LOGGER.debug(errmsg % (str(an_analysis,)))
             metadata = metadata[0]
-            metadata['raw'] = self.get_sample_files(an_analysis,
-                                                    geoSoftNS['raw'])
-            metadata['supplimental'] = self.get_sample_files(
-                an_analysis,
-                geoSoftNS['supplemental'])
+            metadata['raw'] = self.get_raw_files(an_analysis)
+            metadata['supplimental'] = self.get_sample_files(an_analysis)
+            metadata['run'] = self.get_run_details(an_analysis)
             samples.append(metadata)
 
         soft_template = loader.get_template('geo_submission.soft')
@@ -98,18 +96,106 @@ class GEOSubmission(Submission):
 
         results = self.execute_query(query_template, context)
         for r in results:
-
             r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
+
         return results
 
-    def get_sample_files(self, analysis_node, file_class):
-        """Gather files
+    def get_sample_files(self, analysis_node):
+        """Gather derived files
         """
         query_template = loader.get_template('geo_files.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
-            'file_class': str(file_class)
+            'file_class': str(geoSoftNS['supplemental'])
+            })
+
+        return self.execute_query(query_template, context)
+
+    def get_raw_files(self, analysis_node):
+        """Gather raw data e.g. fastq files.
+        """
+        query_template = loader.get_template('geo_fastqs.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['raw']),
+            })
+
+        lanes = {}
+        for row in self.execute_query(query_template, context):
+            data = {}
+            for k, v in row.items():
+                data[k] = v
+            lane = str(data['lane'])
+            lanes.setdefault(lane, []).append(data)
+        result = []
+        for lane, files in lanes.items():
+            if len(files) > 2:
+                errmsg = "Don't know what to do with more than 2 raw files"
+                raise ValueError(errmsg)
+            elif len(files) == 2:
+                is_paired = True
+            elif len(files) == 1:
+                is_paired = False
+            elif len(files) == 0:
+                raise RuntimeError("Empty lane list discovered")
+            files = self._format_filename(files, is_paired)
+            files = self._format_flowcell_type(files, is_paired)
+            files = self._format_read_length(files, is_paired)
+            result.append(files[0])
+        return result
+
+    def _format_flowcell_type(self, files, is_paired):
+        """Used by get_raw_files to format value for single_or_paired-end
+        """
+        for f in files:
+            if 'flowcell_type' in f:
+                flowcell_type = fromTypedNode(f['flowcell_type'])
+                if flowcell_type is None:
+                    pass
+                elif flowcell_type.lower() == "paired":
+                    f['flowcell_type'] = 'paired-end'
+                else:
+                    f['flowcell_type'] = 'single'
+
+        return files
+
+    def _format_read_length(self, files, is_paired):
+        """Format
+        """
+        read_count = 2 if is_paired else 1
+        for f in files:
+            if 'read_length' in f:
+                read_length = str(fromTypedNode(f['read_length']))
+                f['read_length'] = ",".join([read_length] * read_count)
+        return files
+
+    def _format_filename(self, files, is_paired):
+        """Format file name for get_raw_files, also report if paired
+        """
+        if len(files) == 2:
+            # should be paired
+            f0 = files[0]
+            f1 = files[1]
+            f0['filename'] = "%s, %s" % (str(f0['filename']),
+                                         str(f1['filename']))
+            f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
+                                       str(f1['md5sum']))
+            del files[1]
+        else:
+            files[0]['filename'] = str(files[0]['filename'])
+            files[0]['md5sum'] = str(files[0]['md5sum'])
+        return files
+
+
+    def get_run_details(self, analysis_node):
+        """Get information about runs
+        """
+        query_template = loader.get_template('geo_run_details.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
             })
 
         return self.execute_query(query_template, context)
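
get_raw_files in the geo.py hunk groups the SPARQL rows by lane and then collapses paired-end mates into a single record. A small, library-free sketch of that grouping step; the row dictionaries are made up:

    rows = [
        {'lane': '1', 'filename': 'l1_r1.fastq.bz2'},
        {'lane': '1', 'filename': 'l1_r2.fastq.bz2'},
        {'lane': '2', 'filename': 'l2_r1.fastq.bz2'},
    ]

    lanes = {}
    for row in rows:
        # setdefault creates the per-lane list on first sight of the lane.
        lanes.setdefault(row['lane'], []).append(row)

    for lane, files in sorted(lanes.items()):
        is_paired = len(files) == 2
        names = ", ".join(f['filename'] for f in files)
        print "lane %s paired=%s files=%s" % (lane, is_paired, names)
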
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py
index e4ce90c73b073287913ec2f76f5f1d5f0d9bd887..6dd630aeda90fe5ad08a96da8be610f628642b04 100644
@@ -149,13 +149,44 @@ class Submission(object):
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
     def _add_library_details_to_model(self, libNode):
+        # attributes that can have multiple values
+        set_attributes = set((libraryOntology['has_lane'],
+                              libraryOntology['has_mappings'],
+                              dafTermOntology['has_file']))
         parser = RDF.Parser(name='rdfa')
         new_statements = parser.parse_as_stream(libNode.uri)
+        toadd = []
         for s in new_statements:
+            # always add "collections"
+            if s.predicate in set_attributes:
+                toadd.append(s)
+                continue
             # don't override things we already have in the model
             targets = list(self.model.get_targets(s.subject, s.predicate))
             if len(targets) == 0:
-                self.model.append(s)
+                toadd.append(s)
+
+        for s in toadd:
+            self.model.append(s)
+
+        self._add_lane_details(libNode)
+
+    def _add_lane_details(self, libNode):
+        """Import lane details
+        """
+        query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
+        lanes = []
+        for lane_stmt in self.model.find_statements(query):
+            lanes.append(lane_stmt.object)
+
+        parser = RDF.Parser(name='rdfa')
+        for lane in lanes:
+            LOGGER.debug("Importing %s" % (lane.uri,))
+            try:
+                parser.parse_into_model(self.model, lane.uri)
+            except RDF.RedlandError, e:
+                LOGGER.error("Error accessing %s" % (lane.uri,))
+                raise e
 
 
     def find_best_match(self, filename):
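
_add_library_details_to_model now always appends the multi-valued "collection" predicates while still refusing to overwrite single-valued facts already present in the model. A library-free sketch of that merge rule, using a plain dict keyed by (subject, predicate) instead of an RDF model; all names and data are illustrative:

    SET_ATTRIBUTES = set(['has_lane', 'has_mappings', 'has_file'])

    def merge_statements(model, new_statements):
        """model maps (subject, predicate) -> list of objects."""
        for subject, predicate, obj in new_statements:
            if predicate in SET_ATTRIBUTES:
                # Collection-style predicates always accumulate.
                model.setdefault((subject, predicate), []).append(obj)
            elif not model.get((subject, predicate)):
                # Single-valued predicates are only filled when missing.
                model[(subject, predicate)] = [obj]
        return model

    model = {('lib1', 'name'): ['existing name']}
    merge_statements(model, [('lib1', 'name', 'new name'),      # ignored
                             ('lib1', 'has_lane', 'lane1'),     # appended
                             ('lib1', 'has_lane', 'lane2')])    # appended
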
diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql
new file mode 100644
index 0000000..e7fcbc1
--- /dev/null
+++ b/htsworkflow/templates/geo_fastqs.sparql
@@ -0,0 +1,31 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?lane ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+WHERE {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+  ?library libraryOntology:has_lane ?lane .
+  ?lane libraryOntology:flowcell ?flowcell .
+  ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+            libraryOntology:read_length ?read_length ;
+            libraryOntology:flowcell_type ?flowcell_type ;
+  OPTIONAL { ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version . }
+  OPTIONAL {?flowcell  libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version . }
+  OPTIONAL {?flowcell  libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model . }
+}
+
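
geo.py fills this new template through Django's template loader before handing the rendered text to the query engine. A hedged sketch of that render step; it assumes DJANGO_SETTINGS_MODULE points at settings able to locate the htsworkflow templates, and the URIs are placeholders:

    import os
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings')

    from django.template import Context, loader

    template = loader.get_template('geo_fastqs.sparql')
    context = Context({
        'submission': 'http://example.org/submission/1',  # placeholder URI
        'file_class': 'http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#raw',
    })
    sparql_text = template.render(context)  # ready to run against the model
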
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
index 7b66f4f174f15d78b4b26bd2985522d37bc35827..e3fcb9d8d4028c57f752e254019227f6e61b85bf 100644
@@ -4,15 +4,27 @@ PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
 PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
 
-select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
 WHERE {
   <{{submission}}> ucscDaf:has_file ?file ;
                    a submissionOntology:submission .
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
+  OPTIONAL { ?lane libraryOntology:flowcell ?flowcell .
+             ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+                       libraryOntology:read_length ?read_length ;
+                       libraryOntology:flowcell_type ?flowcell_type ;
+                       libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
 }
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_run_details.sparql b/htsworkflow/templates/geo_run_details.sparql
new file mode 100644
index 0000000..dc97107
--- /dev/null
+++ b/htsworkflow/templates/geo_run_details.sparql
@@ -0,0 +1,27 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+# right now we're just grabbing the sequencer model
+# it might make sense to report each of the flowcell/image software.
+select ?flowcell ?read_length ?image_software ?image_version ?basecall_software ?basecall_version
+where {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?library libraryOntology:library_id ?library_id ;
+           libraryOntology:has_lane ?lane ;
+           a libraryOntology:library .
+  OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
+             ?flowcell libraryOntology:read_length ?read_length ;
+                       libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
+}
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
index 00c0b4dc77818789bfea8950507fbd1dba0c13fb..969ff53836bd37b3d4f82747cf3d94c806852ca9 100644
@@ -1,18 +1,11 @@
-{% for name, value in series %}{{name}} = {{value}}
-{% endfor %}!Series_platform_id = {{ platform_id }}
-{% for row in samples %}
-^SAMPLE={{row.name}}
+{% for name, value in series %}{{name}}={{value}}
+{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}
-!Sample_instrument_model = Illumina Genome Analyzer
-!Sample_instrument_model = Illumina Genome Analyzer II
-!Sample_instrument_model = Illumina Genome Analyzer IIx
-!Sample_instrument_model = Illumina HiSeq 2000
-!Sample_channel_count = 1
-!Sample_organism_ch1 = {{ row.species_name }}
-!Sample_taxid_ch1 = {{ row.taxon_id }}
-!Sample_platform_id = {{ platform_id }}
+!Sample_series_id={{ series_id }}
+!Sample_channel_count=1
+!Sample_organism_ch1={{ row.species_name }}
+!Sample_taxid_ch1={{ row.taxon_id }}
 !Sample_source_name_ch1={{row.cell}}
 !Sample_library_strategy={{ row.experiment_type }}
 !Sample_library_source={{row.library_source}}
 !Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
 !Sample_extract_protocol={{ row.extractProtocol|safe }}
 !Sample_data_processing={{ row.dataProtocol|safe }}
-!Sample_molecule_ch1 = {{ row.extractMolecule }}
-!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+!Sample_molecule_ch1={{ row.extractMolecule }}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}
 {% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.cell }}
+!Sample_characteristics_ch1=cell: {{ row.cell }}
 {% endspaceless %}{% endif %}
 {% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1 = readType: {{ row.readType }}
+!Sample_characteristics_ch1=readType: {{ row.readType }}
 {% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for raw in row.raw %}
-!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_characteristics_ch1=cell: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for run in row.run %}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.image_software }}-{{ run.image_version }}{% endfor %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
 !Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
-!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
+!Sample_raw_file_read_length_{{forloop.counter}}={{raw.read_length}}
+!Sample_raw_file_instrument_model_{{forloop.counter}}={{raw.sequencer_model}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
+!sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
 !Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
\ No newline at end of file
+{% endfor %}{% endfor %}
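
The reworked geo_submission.soft relies on Django's forloop.counter to number the per-file SOFT fields. A minimal rendering sketch of that numbering, using settings.configure() so the fragment runs standalone under the Django of this era; the template text and data below are illustrative, not the full template:

    from django.conf import settings
    settings.configure()  # enough for standalone Template rendering

    from django.template import Context, Template

    fragment = Template(
        "{% for raw in raws %}"
        "!Sample_raw_file_{{ forloop.counter }}={{ raw.filename }}\n"
        "!Sample_raw_file_checksum_{{ forloop.counter }}={{ raw.md5sum }}\n"
        "{% endfor %}")

    print fragment.render(Context({'raws': [
        {'filename': 'l1_r1.fastq.bz2', 'md5sum': 'aaa111'},
        {'filename': 'l1_r2.fastq.bz2', 'md5sum': 'bbb222'},
    ]}))
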
diff --git a/htsworkflow/templates/srf.condor b/htsworkflow/templates/srf.condor
index 21d523258e2028dbccad2785ad7a51e8b66dd5e9..1c8e8547e02d7137a2898045648d1dc354e55d26 100644
@@ -6,6 +6,6 @@ log={{ logdir }}/fastq.log
 {% if env %}environment={{ env }}{% endif %}
 
 {% for arg in args %}
-arguments="{{ arg.pyscript }}  {{ arg.sources.0 }} --verbose {% if arg.flowcell %}-f {{ arg.flowcell }}{% endif %} {% if arg.ispaired %}--left {{ arg.target }} --right {{ arg.target_right }}{% else %}--single {{ arg.target }}{% endif %}"
+arguments="{{ arg.pyscript }}  {{ arg.sources.0 }} --verbose {% if arg.flowcell %}--flowcell {{ arg.flowcell }}{% endif %} {% if arg.ispaired %}--left {{ arg.target }} --right {{ arg.target_right }}{% else %}--single {{ arg.target }}{% endif %}"
 queue
 {% endfor %}