Merge ssh://jumpgate.caltech.edu/var/htsworkflow/htsworkflow
authorDiane Trout <diane@caltech.edu>
Tue, 18 Sep 2012 23:34:27 +0000 (16:34 -0700)
committerDiane Trout <diane@caltech.edu>
Tue, 18 Sep 2012 23:34:27 +0000 (16:34 -0700)
28 files changed:
MANIFEST.in
encode_submission/encode_find.py
encode_submission/geo_gather.py
encode_submission/submission_report.py
encode_submission/ucsc_gather.py
htsworkflow/pipelines/sequences.py
htsworkflow/pipelines/test/test_sequences.py
htsworkflow/submission/condorfastq.py
htsworkflow/submission/daf.py
htsworkflow/submission/geo.py
htsworkflow/submission/results.py
htsworkflow/submission/submission.py
htsworkflow/submission/test/test_condorfastq.py
htsworkflow/submission/test/test_daf.py
htsworkflow/submission/test/test_results.py
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_samples.sparql
htsworkflow/templates/geo_submission.soft
htsworkflow/util/rdfhelp.py
htsworkflow/util/rdfinfer.py [new file with mode: 0644]
htsworkflow/util/rdfns.py [new file with mode: 0644]
htsworkflow/util/schemas/dc.turtle [new file with mode: 0644]
htsworkflow/util/schemas/owl.turtle [new file with mode: 0644]
htsworkflow/util/schemas/rdf.turtle [new file with mode: 0644]
htsworkflow/util/schemas/rdfs.turtle [new file with mode: 0644]
htsworkflow/util/test/test_rdfhelp.py
htsworkflow/util/test/test_rdfinfer.py [new file with mode: 0644]
setup.py

index c96b250a1ed7f752d2e79ca91e85019ad77deffd..83c439dcc4e7d28d2cd2b35d5491b88f306ed64b 100644 (file)
@@ -1,2 +1,3 @@
 include RELEASE-VERSION
 include version.py
+include htsworkflow/util/schemas/*.turtle
index 0495d142041e90dc2e9c9baaa91e2740007a69aa..7589f5487aef5ef85a6c4ec65072aa3d025ef5bc 100644 (file)
@@ -26,18 +26,14 @@ if not 'DJANGO_SETTINGS_MODULE' in os.environ:
 from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
+from htsworkflow.util.rdfns import *
 from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     dublinCoreNS, \
      get_model, \
      get_serializer, \
      sparql_query, \
      submissionOntology, \
      libraryOntology, \
-     load_into_model, \
-     rdfNS, \
-     rdfsNS, \
-     xsdNS
+     load_into_model
 TYPE_N = rdfNS['type']
 CREATION_DATE = libraryOntology['date']
 
index 9a1f51e7564431a28cf1f5b99b2b38f74574f739..c67edefc7d45d0e95e7a798dd015a31befbbbac2 100644 (file)
@@ -77,11 +77,15 @@ def main(cmdline=None):
         results.make_tree_from(opts.make_tree_from)
 
     if opts.fastq:
-        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+        flowcells = os.path.join(opts.sequence, 'flowcells')
+        extractor = CondorFastqExtract(opts.host, flowcells,
+                                       model=opts.model,
                                        force=opts.force)
         extractor.create_scripts(results)
 
     if opts.scan_submission:
+        if opts.name is None:
+            parser.error("Please define a submission name")
         mapper.scan_submission_dirs(results)
 
     if opts.make_soft:
index 3f5547933f462318d5e5619c7f3b85ae5ea32e4b..d8ace391888686c0d0650fb0b080b0e79399aa67 100644 (file)
@@ -4,17 +4,12 @@ import jinja2
 from pprint import pprint
 
 from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     dublinCoreNS, \
      get_model, \
      get_serializer, \
      sparql_query, \
-     submissionOntology, \
      libraryOntology, \
-     load_into_model, \
-     rdfNS, \
-     rdfsNS, \
-     xsdNS
+     load_into_model
+from htsworkflow.util.rdfns import *
 TYPE_N = rdfNS['type']
 CREATION_DATE = libraryOntology['date']
 
index 811ffdfc277373e86b6f0f3fe7f9782463c8bdb0..a9fa72a83ddc1c4263e509070f50053d238fa405 100644 (file)
@@ -96,7 +96,8 @@ def main(cmdline=None):
         mapper.link_daf(results)
 
     if opts.fastq:
-        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+        flowcells = os.path.join(opts.sequence, 'flowcells')
+        extractor = CondorFastqExtract(opts.host, flowcells,
                                        force=opts.force)
         extractor.create_scripts(results)
 
index 0e5612a8e47b393684f9ee7f69cd779cca97c607..23e7fe8cfa401a52a52e936e91e3182311ed851f 100644 (file)
@@ -6,6 +6,13 @@ import logging
 import os
 import types
 import re
+import sys
+from urlparse import urljoin, urlparse
+
+import RDF
+from htsworkflow.util.rdfhelp import libraryOntology as libNS
+from htsworkflow.util.rdfhelp import toTypedNode, fromTypedNode, rdfNS, \
+     stripNamespace, dump_model, simplify_uri
 
 LOGGER = logging.getLogger(__name__)
 
@@ -122,7 +129,7 @@ class SequenceFile(object):
         # information and thus are unique so we don't have to do anything
         return os.path.join(root, basename)
 
-    def save(self, cursor):
+    def save_to_sql(self, cursor):
         """
         Add this entry to a DB2.0 database.
         """
@@ -148,6 +155,92 @@ class SequenceFile(object):
 
         return cursor.execute(sql, sql_values)
 
+    def save_to_model(self, model, base_url=None):
+        def add_lit(model, s, p, o):
+            if o is not None:
+                model.add_statement(RDF.Statement(s, p, toTypedNode(o)))
+        def add(model, s, p, o):
+            model.add_statement(RDF.Statement(s,p,o))
+        # a bit unreliable... assumes filesystem is encoded in utf-8
+        path = os.path.abspath(self.path.encode('utf-8'))
+        fileNode = RDF.Node(RDF.Uri('file://' + path))
+        add(model, fileNode, rdfNS['type'], libNS['illumina_result'])
+        add_lit(model, fileNode, libNS['flowcell_id'], self.flowcell)
+        add_lit(model, fileNode, libNS['lane_number'], self.lane)
+        if self.read is not None:
+            add_lit(model, fileNode, libNS['read'], self.read)
+        else:
+            add_lit(model, fileNode, libNS['read'], '')
+
+        add_lit(model, fileNode, libNS['library_id'], self.project)
+        add_lit(model, fileNode, libNS['multiplex_index'], self.index)
+        add_lit(model, fileNode, libNS['split_id'], self.split)
+        add_lit(model, fileNode, libNS['cycle'], self.cycle)
+        add_lit(model, fileNode, libNS['passed_filter'], self.pf)
+        add(model, fileNode, libNS['file_type'], libNS[self.filetype])
+
+        if base_url is not None:
+            flowcell = RDF.Node(RDF.Uri("{base}/flowcell/{flowcell}/".format(
+                base=base_url,
+                flowcell=self.flowcell)))
+            add(model, fileNode, libNS['flowcell'], flowcell)
+            if self.project is not None:
+                library = RDF.Node(RDF.Uri("{base}/library/{library}".format(
+                    base=base_url,
+                    library=self.project)))
+                add(model, fileNode, libNS['library'], library)
+
+
+    @classmethod
+    def load_from_model(cls, model, seq_id):
+        def get(s, p):
+            values = []
+            stmts = model.find_statements(RDF.Statement(s, p, None))
+            for s in stmts:
+                obj = s.object
+                if not obj.is_resource():
+                    values.append(fromTypedNode(obj))
+                else:
+                    values.append(obj)
+            return values
+        def get_one(s, p):
+            values = get(s, p)
+            if len(values) > 1:
+                errmsg = u"Too many values for %s %s"
+                raise ValueError(errmsg % (unicode(s), unicode(p)))
+            elif len(values) == 1:
+                return values[0]
+            else:
+                return None
+
+        if not isinstance(seq_id, RDF.Node):
+            seq_id = RDF.Node(RDF.Uri(seq_id))
+        result_statement = RDF.Statement(seq_id,
+                                         rdfNS['type'],
+                                         libNS['illumina_result'])
+        if not model.contains_statement(result_statement):
+            raise KeyError(u"%s not found" % (unicode(seq_id),))
+
+        seq_type_node = model.get_target(seq_id, libNS['file_type'])
+        seq_type = stripNamespace(libNS, seq_type_node)
+
+        path = urlparse(str(seq_id.uri)).path
+        flowcellNode = get_one(seq_id, libNS['flowcell'])
+        flowcell = get_one(seq_id, libNS['flowcell_id'])
+        lane = get_one(seq_id, libNS['lane_number'])
+        read = get_one(seq_id, libNS['read'])
+
+        obj = cls(seq_type, path, flowcell, lane)
+        obj.read = read if read != '' else None
+        obj.project = get_one(seq_id, libNS['library_id'])
+        obj.index = get_one(seq_id, libNS['multiplex_index'])
+        obj.split = get_one(seq_id, libNS['split_id'])
+        obj.cycle = get_one(seq_id, libNS['cycle'] )
+        obj.pf = get_one(seq_id, libNS['passed_filter'])
+        obj.libraryNode = get_one(seq_id, libNS['library'])
+        return obj
+
+
 def get_flowcell_cycle(path):
     """
     Extract flowcell, cycle from pathname
@@ -316,3 +409,99 @@ def scan_for_sequences(dirs):
                     LOGGER.debug("Found sequence at %s" % (f,))
 
     return sequences
+
+
+def update_model_sequence_library(model, base_url):
+    """Find sequence objects and add library information if it is missing
+    """
+    file_body = """
+    prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+    select ?filenode ?flowcell_id ?lane_id ?library_id ?flowcell ?library
+    where {
+       ?filenode a libNS:illumina_result ;
+                 libNS:flowcell_id ?flowcell_id ;
+                 libNS:lane_number ?lane_id .
+       OPTIONAL { ?filenode libNS:flowcell ?flowcell . }
+       OPTIONAL { ?filenode libNS:library ?library .}
+       OPTIONAL { ?filenode libNS:library_id ?library_id .}
+    }
+    """
+    LOGGER.debug("update_model_sequence_library query %s", file_body)
+    file_query = RDF.SPARQLQuery(file_body)
+    files = file_query.execute(model)
+
+    libraryNS = RDF.NS(urljoin(base_url, 'library/'))
+    flowcellNS = RDF.NS(urljoin(base_url, 'flowcell/'))
+    for f in files:
+        filenode = f['filenode']
+        LOGGER.debug("Updating file node %s", str(filenode))
+        lane_id = fromTypedNode(f['lane_id'])
+        if f['flowcell'] is None:
+            flowcell = flowcellNS[str(f['flowcell_id'])+'/']
+            LOGGER.debug("Adding file (%s) to flowcell (%s) link",
+                         str(filenode),
+                         str(flowcell))
+            model.add_statement(
+                RDF.Statement(filenode, libNS['flowcell'], flowcell))
+        else:
+            flowcell = f['flowcell']
+
+        if f['library'] is None:
+            if f['library_id'] is not None:
+                library = libraryNS[str(f['library_id']) + '/']
+            else:
+                library = guess_library_from_model(model, base_url,
+                                                   flowcell,
+                                                   lane_id)
+                library_id = toTypedNode(simplify_uri(library))
+                LOGGER.debug("Adding file (%s) to library (%s) link",
+                             str(filenode),
+                             str(library))
+                model.add_statement(
+                    RDF.Statement(filenode, libNS['library_id'], library_id))
+            if library is not None:
+                model.add_statement(
+                    RDF.Statement(filenode, libNS['library'], library))
+
+
+def guess_library_from_model(model, base_url, flowcell, lane_id):
+    """Attempt to find library URI
+    """
+    flowcellNode = RDF.Node(flowcell)
+    flowcell = str(flowcell.uri)
+    lane_body = """
+    prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+    select ?library ?lane
+    where {{
+      <{flowcell}> libNS:has_lane ?lane ;
+                   a libNS:illumina_flowcell .
+      ?lane libNS:lane_number {lane_id} ;
+            libNS:library ?library .
+    }}
+    """
+    lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id)
+    lanes = []
+    tries = 3
+    while len(lanes) == 0 and tries > 0:
+        tries -= 1
+        lane_query = RDF.SPARQLQuery(lane_body)
+        lanes = [ l for l in lane_query.execute(model)]
+        if len(lanes) > 1:
+            # CONFUSED!
+            errmsg = "Too many libraries for flowcell {flowcell} "\
+                     "lane {lane} = {count}"
+            LOGGER.error(errmsg.format(flowcell=flowcell,
+                                       lane=lane_id,
+                                       count=len(lanes)))
+            return None
+        elif len(lanes) == 1:
+            # success
+            return lanes[0]['library']
+        else:
+            # try grabbing data
+            model.load(flowcellNode.uri, name="rdfa")
+
+
index 7bbcc2e762f6361b1fd9ba97165e6d22c241e1a5..34ddeab593da5636386a7f0a7254a5e2af1b53e4 100644 (file)
@@ -4,8 +4,11 @@ import shutil
 import tempfile
 import unittest
 
-from htsworkflow.pipelines import sequences
+import RDF
 
+from htsworkflow.pipelines import sequences
+from htsworkflow.util.rdfhelp import get_model, load_string_into_model, \
+     rdfNS, libraryOntology, dump_model, fromTypedNode
 
 class SequenceFileTests(unittest.TestCase):
     """
@@ -294,15 +297,8 @@ class SequenceFileTests(unittest.TestCase):
         self.assertEqual(f.make_target_name('/tmp'),
                          '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')
 
-    def test_sql(self):
-        """
-        Make sure that the quick and dirty sql interface in sequences works
-        """
-        import sqlite3
-        db = sqlite3.connect(":memory:")
-        c = db.cursor()
-        sequences.create_sequence_table(c)
-
+    def _generate_sequences(self):
+        seqs = []
         data = [('/root/42BW9AAXX/C1-152',
                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
                 ('/root/42BW9AAXX/C1-152',
@@ -313,12 +309,182 @@ class SequenceFileTests(unittest.TestCase):
                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),]
 
         for path, name in data:
-            seq = sequences.parse_qseq(path, name)
-            seq.save(c)
+            seqs.append(sequences.parse_qseq(path, name))
+
+        path = '/root/42BW9AAXX/C1-38/Project_12345'
+        name = '12345_AAATTT_L003_R1_001.fastq.gz'
+        pathname = os.path.join(path,name)
+        seqs.append(sequences.parse_fastq(path, name))
+        self.assertEqual(len(seqs), 5)
+        return seqs
+
+
+    def test_sql(self):
+        """
+        Make sure that the quick and dirty sql interface in sequences works
+        """
+        import sqlite3
+        db = sqlite3.connect(":memory:")
+        c = db.cursor()
+        sequences.create_sequence_table(c)
+
+        for seq in self._generate_sequences():
+            seq.save_to_sql(c)
 
         count = c.execute("select count(*) from sequences")
         row = count.fetchone()
-        self.assertEqual(row[0], 4)
+        self.assertEqual(row[0], 5)
+
+    def test_basic_rdf_scan(self):
+        """Make sure we can save to RDF model"""
+        import RDF
+        model = get_model()
+
+        for seq in self._generate_sequences():
+            seq.save_to_model(model)
+
+        files = list(model.find_statements(
+            RDF.Statement(None,
+                          rdfNS['type'],
+                          libraryOntology['illumina_result'])))
+        self.assertEqual(len(files), 5)
+        files = list(model.find_statements(
+            RDF.Statement(None,
+                          libraryOntology['file_type'],
+                          libraryOntology['qseq'])))
+        self.assertEqual(len(files), 4)
+        files = list(model.find_statements(
+            RDF.Statement(None,
+                          libraryOntology['file_type'],
+                          libraryOntology['split_fastq'])))
+        self.assertEqual(len(files), 1)
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['library_id'], None)))
+        self.assertEqual(len(files), 1)
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['flowcell_id'], None)))
+        self.assertEqual(len(files), 5)
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['flowcell'], None)))
+        self.assertEqual(len(files), 0)
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['library'], None)))
+        self.assertEqual(len(files), 0)
+
+    def test_rdf_scan_with_url(self):
+        """Make sure save_to_model adds flowcell and library links when base_url is given"""
+        import RDF
+        model = get_model()
+        base_url = 'http://localhost'
+        for seq in self._generate_sequences():
+            seq.save_to_model(model, base_url=base_url)
+        localFC = RDF.NS(base_url + '/flowcell/')
+        localLibrary = RDF.NS(base_url + '/library/')
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['flowcell'], None)))
+        self.assertEqual(len(files), 5)
+        for f in files:
+            self.assertEqual(f.object, localFC['42BW9AAXX/'])
+
+        files = list(model.find_statements(
+            RDF.Statement(None, libraryOntology['library'], None)))
+        self.assertEqual(len(files), 1)
+        self.assertEqual(files[0].object, localLibrary['12345'])
+
+    def test_rdf_fixup_library(self):
+        """Make sure update_model_sequence_library fills in missing library links"""
+        base_url = 'http://localhost'
+        localLibrary = RDF.NS(base_url + '/library/')
+
+        flowcellInfo = """@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+
+<{base}/flowcell/42BW9AAXX/>
+    libns:flowcell_id "42BW9AAXX"@en ;
+    libns:has_lane <{base}/lane/1169>, <{base}/lane/1170>,
+                   <{base}/lane/1171>, <{base}/lane/1172> ;
+    libns:read_length 75 ;
+    a libns:illumina_flowcell .
+
+<{base}/lane/1169>
+    libns:lane_number 1 ; libns:library <{base}/library/10923/> .
+<{base}/lane/1170>
+    libns:lane_number 2 ; libns:library <{base}/library/10924/> .
+<{base}/lane/1171>
+    libns:lane_number 3 ; libns:library <{base}/library/12345/> .
+<{base}/lane/1172>
+    libns:lane_number 4 ; libns:library <{base}/library/10930/> .
+""".format(base=base_url)
+        model = get_model()
+        load_string_into_model(model, 'turtle', flowcellInfo)
+        for seq in self._generate_sequences():
+            seq.save_to_model(model)
+        f = sequences.update_model_sequence_library(model, base_url=base_url)
+
+        libTerm = libraryOntology['library']
+        libIdTerm = libraryOntology['library_id']
+
+        url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'
+        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(nodes[0], localLibrary['10923/'])
+        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(fromTypedNode(nodes[0]), '10923')
+
+        url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'
+        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(nodes[0], localLibrary['10924/'])
+        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(fromTypedNode(nodes[0]), '10924')
+
+        url = 'file:///root/42BW9AAXX/C1-38/Project_12345/12345_AAATTT_L003_R1_001.fastq.gz'
+        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(nodes[0], localLibrary['12345/'])
+        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
+        self.assertEqual(len(nodes), 1)
+        self.assertEqual(fromTypedNode(nodes[0]), '12345')
+
+    def test_load_from_model(self):
+        """Can we round trip through an RDF model"""
+        model = get_model()
+        path = '/root/42BW9AAXX/C1-38/Project_12345/'
+        filename = '12345_AAATTT_L003_R1_001.fastq.gz'
+        seq = sequences.parse_fastq(path, filename)
+        seq.save_to_model(model)
+
+        seq_id = 'file://'+path+filename
+        seqNode = RDF.Node(RDF.Uri(seq_id))
+        libNode = RDF.Node(RDF.Uri('http://localhost/library/12345'))
+        model.add_statement(
+            RDF.Statement(seqNode, libraryOntology['library'], libNode))
+        seq2 = sequences.SequenceFile.load_from_model(model, seq_id)
+
+        self.assertEqual(seq.flowcell, seq2.flowcell)
+        self.assertEqual(seq.flowcell, '42BW9AAXX')
+        self.assertEqual(seq.filetype, seq2.filetype)
+        self.assertEqual(seq2.filetype, 'split_fastq')
+        self.assertEqual(seq.lane, seq2.lane)
+        self.assertEqual(seq2.lane, 3)
+        self.assertEqual(seq.read, seq2.read)
+        self.assertEqual(seq2.read, 1)
+        self.assertEqual(seq.project, seq2.project)
+        self.assertEqual(seq2.project, '12345')
+        self.assertEqual(seq.index, seq2.index)
+        self.assertEqual(seq2.index, 'AAATTT')
+        self.assertEqual(seq.split, seq2.split)
+        self.assertEqual(seq2.split, '001')
+        self.assertEqual(seq.cycle, seq2.cycle)
+        self.assertEqual(seq.pf, seq2.pf)
+        self.assertEqual(seq2.libraryNode, libNode)
+        self.assertEqual(seq.path, seq2.path)
 
     def test_scan_for_sequences(self):
         # simulate tree
index 64eb6a1892a93ab086adb377552db4a828a2a974..9aab790ef6fb6262e998b7ef1b4c2689144d5e88 100644 (file)
@@ -2,27 +2,35 @@
 """
 import logging
 import os
-from pprint import pformat
+from pprint import pformat,pprint
 import sys
 import types
+from urlparse import urljoin, urlparse
 
-from htsworkflow.pipelines.sequences import scan_for_sequences
+from htsworkflow.pipelines.sequences import scan_for_sequences, \
+     update_model_sequence_library
 from htsworkflow.pipelines.samplekey import SampleKey
 from htsworkflow.pipelines import qseq2fastq
 from htsworkflow.pipelines import srf2fastq
 from htsworkflow.pipelines import desplit_fastq
-from htsworkflow.util.api import HtswApi
+from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
+     fromTypedNode, \
+     stripNamespace
+from htsworkflow.util.rdfns import *
 from htsworkflow.util.conversion import parse_flowcell_id
 
 from django.conf import settings
 from django.template import Context, loader
 
+import RDF
+
 LOGGER = logging.getLogger(__name__)
 
 
 class CondorFastqExtract(object):
-    def __init__(self, host, apidata, sequences_path,
+    def __init__(self, host, sequences_path,
                  log_path='log',
+                 model=None,
                  force=False):
         """Extract fastqs from results archive
 
@@ -33,10 +41,14 @@ class CondorFastqExtract(object):
           log_path (str): where to put condor log files
           force (bool): do we force overwriting current files?
         """
-        self.api = HtswApi(host, apidata)
+        self.host = host
+        self.model = get_model(model)
         self.sequences_path = sequences_path
         self.log_path = log_path
         self.force = force
+        LOGGER.info("CondorFastq host={0}".format(self.host))
+        LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
+        LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
 
     def create_scripts(self, result_map ):
         """
@@ -48,7 +60,6 @@ class CondorFastqExtract(object):
         template_map = {'srf': 'srf.condor',
                         'qseq': 'qseq.condor',
                         'split_fastq': 'split_fastq.condor',
-                        'by_sample': 'lane_to_fastq.turtle',
                         }
 
         env = None
@@ -62,7 +73,7 @@ class CondorFastqExtract(object):
                          'logdir': self.log_path,
                          'env': env,
                          'args': condor_entries[script_type],
-                         'root_url': self.api.root_url,
+                         'root_url': self.host,
                          }
             context = Context(variables)
 
@@ -78,9 +89,8 @@ class CondorFastqExtract(object):
                             'qseq': self.condor_qseq_to_fastq,
                             'split_fastq': self.condor_desplit_fastq
                             }
-        by_sample = {}
-        lib_db = self.find_archive_sequence_files(result_map)
-        needed_targets = self.find_missing_targets(result_map, lib_db)
+        sequences = self.find_archive_sequence_files(result_map)
+        needed_targets = self.update_fastq_targets(result_map, sequences)
 
         for target_pathname, available_sources in needed_targets.items():
             LOGGER.debug(' target : %s' % (target_pathname,))
@@ -97,120 +107,170 @@ class CondorFastqExtract(object):
                 if sources is not None:
                     condor_entries.setdefault(condor_type, []).append(
                         conversion(sources, target_pathname))
-                    for s in sources:
-                        by_sample.setdefault(s.lane_id,[]).append(
-                            target_pathname)
             else:
                 print " need file", target_pathname
 
-        condor_entries['by_sample'] = by_sample
         return condor_entries
 
     def find_archive_sequence_files(self,  result_map):
         """
         Find archived sequence files associated with our results.
         """
-        LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,))
-
-        lib_db = {}
-        seq_dirs = set()
-        candidate_lanes = {}
-        for lib_id in result_map.keys():
-            lib_info = self.api.get_library(lib_id)
-            lib_info['lanes'] = {}
-            lib_db[lib_id] = lib_info
-
-            for lane in lib_info['lane_set']:
-                lane_key = (lane['flowcell'], lane['lane_number'])
-                candidate_lanes[lane_key] = (lib_id, lane['lane_id'])
-                seq_dirs.add(os.path.join(self.sequences_path,
-                                             'flowcells',
-                                             lane['flowcell']))
-        LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
-        candidate_seq_list = scan_for_sequences(seq_dirs)
-
-        # at this point we have too many sequences as scan_for_sequences
-        # returns all the sequences in a flowcell directory
-        # so lets filter out the extras
-
-        for seq in candidate_seq_list:
-            lane_key = (seq.flowcell, seq.lane)
-            candidate_key = candidate_lanes.get(lane_key, None)
-            if candidate_key is not None:
-                lib_id, lane_id = candidate_key
-                seq.lane_id = lane_id
-                lib_info = lib_db[lib_id]
-                lib_info['lanes'].setdefault(lane_key, set()).add(seq)
-
-        return lib_db
-
-    def find_missing_targets(self, result_map, lib_db):
+        self.import_libraries(result_map)
+        flowcell_ids = self.find_relavant_flowcell_ids()
+        self.import_sequences(flowcell_ids)
+
+        query_text = """
+        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+        select ?filenode ?filetype ?cycle ?lane_number ?read
+               ?library  ?library_id
+               ?flowcell ?flowcell_id ?read_length
+               ?flowcell_type ?flowcell_status
+        where {
+            ?filenode libns:cycle ?cycle ;
+                      libns:lane_number ?lane_number ;
+                      libns:read ?read ;
+                      libns:flowcell ?flowcell ;
+                      libns:flowcell_id ?flowcell_id ;
+                      libns:library ?library ;
+                      libns:library_id ?library_id ;
+                      libns:file_type ?filetype ;
+                      a libns:illumina_result .
+            ?flowcell libns:read_length ?read_length ;
+                      libns:flowcell_type ?flowcell_type .
+            OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
+            FILTER(?filetype != libns:sequencer_result)
+        }
         """
-        Check if the sequence file exists.
-        This requires computing what the sequence name is and checking
-        to see if it can be found in the sequence location.
+        LOGGER.debug("find_archive_sequence_files query: %s",
+                     query_text)
+        query = RDF.SPARQLQuery(query_text)
+        results = []
+        for r in query.execute(self.model):
+            library_id = fromTypedNode(r['library_id'])
+            if library_id in result_map:
+                seq = SequenceResult(r)
+                LOGGER.debug("Creating sequence result for library %s: %s",
+                             library_id,
+                             repr(seq))
+                results.append(seq)
+        return results
+
+    def import_libraries(self, result_map):
+        for lib_id in result_map.keys():
+            lib_id_encoded = lib_id.encode('utf-8')
+            liburl = urljoin(self.host, 'library/%s/' % (lib_id_encoded,))
+            library = RDF.Node(RDF.Uri(liburl))
+            self.import_library(library)
 
-        Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
+    def import_library(self, library):
+        """Import library data into our model if we don't have it already
+        """
+        q = RDF.Statement(library, rdfNS['type'], libraryOntology['library'])
+        present = False
+        if not self.model.contains_statement(q):
+            present = True
+            load_into_model(self.model, 'rdfa', library)
+        LOGGER.debug("Did we import %s: %s", library, present)
+
+    def find_relavant_flowcell_ids(self):
+        """Generate set of flowcell ids that had samples of interest on them
+        """
+        flowcell_query =RDF.SPARQLQuery("""
+prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+select distinct ?flowcell ?flowcell_id
+WHERE {
+  ?library a libns:library ;
+           libns:has_lane ?lane .
+  ?lane libns:flowcell ?flowcell .
+  ?flowcell libns:flowcell_id ?flowcell_id .
+}""")
+        flowcell_ids = set()
+        for r in flowcell_query.execute(self.model):
+            flowcell_ids.add( fromTypedNode(r['flowcell_id']) )
+            LOGGER.debug("Flowcells = %s" %(unicode(flowcell_ids)))
+            flowcell_test = RDF.Statement(r['flowcell'],
+                                          rdfNS['type'],
+                                          libraryOntology['illumina_flowcell'])
+            if not self.model.contains_statement(flowcell_test):
+                # we probably lack full information about the flowcell.
+                load_into_model(self.model, 'rdfa', r['flowcell'])
+        return flowcell_ids
+
+    def import_sequences(self, flowcell_ids):
+        seq_dirs = []
+        for f in flowcell_ids:
+            seq_dirs.append(os.path.join(self.sequences_path, str(f)))
+        sequences = scan_for_sequences(seq_dirs)
+        for seq in sequences:
+            seq.save_to_model(self.model, self.host)
+        update_model_sequence_library(self.model, self.host)
+
+    def update_fastq_targets(self, result_map, raw_files):
+        """Return list of fastq files that need to be built.
+
+        Also update model with link between illumina result files
+        and our target fastq file.
         """
         fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
         fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
         # find what targets we're missing
         needed_targets = {}
-        for lib_id in result_map.keys():
-            result_dir = result_map[lib_id]
-            lib = lib_db[lib_id]
-            lane_dict = make_lane_dict(lib_db, lib_id)
-
-            for lane_key, sequences in lib['lanes'].items():
-                for seq in sequences:
-                    seq.paired = lane_dict[seq.flowcell]['paired_end']
-                    lane_status = lane_dict[seq.flowcell]['status']
-
-                    if seq.paired and seq.read is None:
-                        seq.read = 1
-                    filename_attributes = {
-                        'flowcell': seq.flowcell,
-                        'lib_id': lib_id,
-                        'lane': seq.lane,
-                        'read': seq.read,
-                        'cycle': seq.cycle
-                        }
-                    # skip bad runs
-                    if lane_status == 'Failed':
-                        continue
-                    if seq.flowcell == '30DY0AAXX':
-                        # 30DY0 only ran for 151 bases instead of 152
-                        # it is actually 76 1st read, 75 2nd read
-                        seq.mid_point = 76
-
-                    # end filters
-                    if seq.paired:
-                        target_name = fastq_paired_template % \
-                                      filename_attributes
-                    else:
-                        target_name = fastq_single_template % \
-                                      filename_attributes
-
-                    target_pathname = os.path.join(result_dir, target_name)
-                    if self.force or not os.path.exists(target_pathname):
-                        t = needed_targets.setdefault(target_pathname, {})
-                        t.setdefault(seq.filetype, []).append(seq)
-
+        for seq in raw_files:
+            if not seq.isgood:
+                continue
+            filename_attributes = {
+                'flowcell': seq.flowcell_id,
+                'lib_id': seq.library_id,
+                'lane': seq.lane_number,
+                'read': seq.read,
+                'cycle': seq.cycle
+            }
+
+            if seq.ispaired:
+                target_name = fastq_paired_template % \
+                              filename_attributes
+            else:
+                target_name = fastq_single_template % \
+                              filename_attributes
+
+            result_dir = result_map[seq.library_id]
+            target_pathname = os.path.join(result_dir, target_name)
+            if self.force or not os.path.exists(target_pathname):
+                t = needed_targets.setdefault(target_pathname, {})
+                t.setdefault(seq.filetype, []).append(seq)
+            self.add_target_source_links(target_pathname, seq)
         return needed_targets
 
+    def add_target_source_links(self, target, seq):
+        """Add link between target pathname and the 'lane' that produced it
+        (note lane objects are now post demultiplexing.)
+        """
+        target_uri = 'file://' + target
+        target_node = RDF.Node(RDF.Uri(target_uri))
+        source_stmt = RDF.Statement(target_node, dcNS['source'], seq.filenode)
+        self.model.add_statement(source_stmt)
 
     def condor_srf_to_fastq(self, sources, target_pathname):
         if len(sources) > 1:
             raise ValueError("srf to fastq can only handle one file")
 
+        mid_point = None
+        if sources[0].flowcell_id == '30DY0AAXX':
+            mid_point = 76
+
         return {
-            'sources': [os.path.abspath(sources[0].path)],
+            'sources': [sources[0].path],
             'pyscript': srf2fastq.__file__,
-            'flowcell': sources[0].flowcell,
-            'ispaired': sources[0].paired,
+            'flowcell': sources[0].flowcell_id,
+            'ispaired': sources[0].ispaired,
             'target': target_pathname,
             'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
-            'mid': getattr(sources[0], 'mid_point', None),
+            'mid': mid_point,
             'force': self.force,
         }
 
@@ -221,10 +281,10 @@ class CondorFastqExtract(object):
         paths.sort()
         return {
             'pyscript': qseq2fastq.__file__,
-            'flowcell': sources[0].flowcell,
+            'flowcell': sources[0].flowcell_id,
             'target': target_pathname,
             'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
             'istar': len(sources) == 1,
         }
 
@@ -237,11 +297,9 @@ class CondorFastqExtract(object):
             'pyscript': desplit_fastq.__file__,
             'target': target_pathname,
             'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
         }
 
-    def lane_rdf(self, sources, target_pathname):
-        pass
 
 def make_lane_dict(lib_db, lib_id):
     """
@@ -255,3 +313,54 @@ def make_lane_dict(lib_db, lib_id):
         result.append((lane['flowcell'], lane))
     return dict(result)
 
+class SequenceResult(object):
+    """Convert the sparql query result from find_archive_sequence_files
+    """
+    def __init__(self, result):
+        self.filenode = result['filenode']
+        self._filetype = result['filetype']
+        self.cycle = fromTypedNode(result['cycle'])
+        self.lane_number = fromTypedNode(result['lane_number'])
+        self.read = fromTypedNode(result['read'])
+        if type(self.read) in types.StringTypes:
+            self.read = 1
+        self.library = result['library']
+        self.library_id = fromTypedNode(result['library_id'])
+        self.flowcell = result['flowcell']
+        self.flowcell_id = fromTypedNode(result['flowcell_id'])
+        self.flowcell_type = fromTypedNode(result['flowcell_type'])
+        self.flowcell_status = fromTypedNode(result['flowcell_status'])
+
+    def _is_good(self):
+        """is this sequence / flowcell 'good enough'"""
+        if self.flowcell_status is not None and \
+           self.flowcell_status.lower() == "failed":
+            return False
+        return True
+    isgood = property(_is_good)
+
+    def _get_ispaired(self):
+        if self.flowcell_type.lower() == "paired":
+            return True
+        else:
+            return False
+    ispaired = property(_get_ispaired)
+
+    def _get_filetype(self):
+        return stripNamespace(libraryOntology, self._filetype)
+    filetype = property(_get_filetype)
+
+    def _get_path(self):
+        url = urlparse(str(self.filenode.uri))
+        if url.scheme == 'file':
+            return url.path
+        else:
+            errmsg = u"Unsupported scheme {0} for {1}"
+            raise ValueError(errmsg.format(url.scheme, unicode(url)))
+    path = property(_get_path)
+
+    def __repr__(self):
+        return "SequenceResult({0},{1},{2})".format(
+            str(self.filenode),
+            str(self.library_id),
+            str(self.flowcell_id))
index a74d71a667d1cb788cbdaab24cdf8661738c9413..f04ac8fe5328e738a012738a5d256b9307e11504 100644 (file)
@@ -361,7 +361,7 @@ class UCSCSubmission(object):
                           rdfNS['type'],
                           submissionOntology['submission']))
         self.model.add_statement(RDF.Statement(submissionNode,
-                                               submissionOntology['library'],
+                                               libraryOntology['library'],
                                                libNode))
 
         LOGGER.debug("Adding statements to {0}".format(str(submissionView)))
@@ -385,8 +385,8 @@ class UCSCSubmission(object):
     def create_file_attributes(self, filename, submissionView, submission_uri, submission_dir):
         # add file specific information
         LOGGER.debug("Updating file md5sum")
-        fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
         submission_pathname = os.path.join(submission_dir, filename)
+        fileNode = RDF.Node(RDF.Uri("file://" + submission_pathname))
         self.model.add_statement(
             RDF.Statement(submissionView,
                           dafTermOntology['has_file'],
index a3ac2f17b904c832c9996336bb74d0186f0ddef9..413d2c3ae5e99daa7025a6be3101edb50d96da85 100644 (file)
@@ -8,7 +8,7 @@ from htsworkflow.submission.submission import Submission
 from htsworkflow.util.rdfhelp import \
      fromTypedNode, \
      geoSoftNS, \
-     simplifyUri, \
+     stripNamespace, \
      submissionOntology
 
 from django.conf import settings
@@ -31,9 +31,13 @@ class GEOSubmission(Submission):
         for lib_id, result_dir in result_map.items():
             an_analysis = self.get_submission_node(result_dir)
             metadata = self.get_sample_metadata(an_analysis)
-            if len(metadata) > 1:
+            if len(metadata) == 0:
+                errmsg = 'No metadata found for {0}'
+                LOGGER.error(errmsg.format(str(an_analysis),))
+                continue
+            elif len(metadata) > 1:
                 errmsg = 'Confused there are more than one samples for %s'
-                LOGGER.debug(errmsg % (str(an_analysis,)))
+                LOGGER.debug(errmsg % (str(an_analysis),))
             metadata = metadata[0]
             metadata['raw'] = self.get_raw_files(an_analysis)
             metadata['supplimental'] = self.get_sample_files(an_analysis)
@@ -203,7 +207,7 @@ class GEOSubmission(Submission):
     def query_to_soft_dictionary(self, results, heading):
         attributes = []
         for r in results:
-            name = simplifyUri(geoSoftNS, r['name'])
+            name = stripNamespace(geoSoftNS, r['name'])
             if name is not None:
                 if name.lower() == heading.lower():
                     name = '^' + name
index daeb7d1fed1d65b867b57b5b9a1e3dbaf60c3013..ca6f1e92438ff9a7dd64042196545e6bf06129b6 100644 (file)
@@ -1,5 +1,6 @@
 """Help collect and process results for submission
 """
+from collections import MutableMapping
 import os
 import logging
 
@@ -7,25 +8,34 @@ from collections import namedtuple
 
 LOGGER = logging.getLogger(__name__)
 
-class ResultMap(object):
+class ResultMap(MutableMapping):
     """Store list of results
     """
     def __init__(self):
         self.results_order = []
         self.results = {}
 
-    def keys(self):
-        return self.results_order
+    def __iter__(self):
+        for item in self.results_order:
+            yield item
 
-    def values(self):
-        return ( self.results[r] for r in self.results_order )
+    def __len__(self):
+        l = len(self.results)
+        assert l == len(self.results_order)
+        return l
 
-    def items(self):
-        return ( (r, self.results[r]) for r in self.results_order )
+    def __setitem__(self, key, value):
+        self.results_order.append(key)
+        self.results[key] = value
 
     def __getitem__(self, key):
         return self.results[key]
 
+    def __delitem__(self, key):
+        del self.results[key]
+        i = self.results_order.index(key)
+        del self.results_order[i]
+
     def add_results_from_file(self, filename):
         pathname = os.path.abspath(filename)
         basepath, name = os.path.split(pathname)
@@ -33,11 +43,7 @@ class ResultMap(object):
         for lib_id, lib_path in results:
             if not os.path.isabs(lib_path):
                 lib_path = os.path.join(basepath, lib_path)
-            self.add_result(lib_id, lib_path)
-
-    def add_result(self, lib_id, lib_path):
-        self.results_order.append(lib_id)
-        self.results[lib_id] = lib_path
+            self[lib_id] = lib_path
 
     def make_tree_from(self, source_path, destpath = None):
         """Create a tree using data files from source path.
index 6dd630aeda90fe5ad08a96da8be610f628642b04..18fa3b2bc9f487b63915ca02a3c060adbfeedf3b 100644 (file)
@@ -83,7 +83,7 @@ class Submission(object):
                                                     rdfNS['type'])
         if file_classification is None:
             errmsg = 'Could not find class for {0}'
-            logger.warning(errmsg.format(str(file_type)))
+            LOGGER.warning(errmsg.format(str(file_type)))
             return
 
         self.model.add_statement(
index bb2b3c9995ff9a2fcf8510e9b6cf2860c71d029e..94df7b60b2f1e7dbca6d5e2fc2ecbacded55f30a 100644 (file)
@@ -7,8 +7,9 @@ import shutil
 import tempfile
 import unittest
 
-from htsworkflow.submission import condorfastq
+from htsworkflow.submission.condorfastq import CondorFastqExtract
 from htsworkflow.submission.results import ResultMap
+from htsworkflow.util.rdfhelp import load_string_into_model, dump_model
 
 FCDIRS = [
     'C02F9ACXX',
@@ -16,6 +17,7 @@ FCDIRS = [
     'C02F9ACXX/C1-202/Project_11154',
     'C02F9ACXX/C1-202/Project_12342_Index1',
     'C02F9ACXX/C1-202/Project_12342_Index2',
+    'C02F9ACXX/C1-202/Project_12345',
     '42JUYAAXX',
     '42JUYAAXX/C1-76',
     '30221AAXX',
@@ -31,9 +33,18 @@ DATAFILES = [
     'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
     'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
     'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
     '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
     '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
     '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
@@ -77,77 +88,288 @@ DATAFILES = [
     '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
 ]
 
-LIBDATA = {
-    '11154':{u'antibody_id': None,
-             u'cell_line': u'Unknown',
-             u'cell_line_id': 1,
-             u'experiment_type': u'RNA-seq',
-             u'experiment_type_id': 4,
-             u'gel_cut_size': 300,
-             u'hidden': False,
-             u'id': u'11154',
-             u'insert_size': 200,
-             u'lane_set': [{u'flowcell': u'30221AAXX',
-                            u'lane_number': 4,
-                            u'lane_id': 3400,
-                            u'paired_end': False,
-                            u'read_length': 33,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'42JUYAAXX',
-                            u'lane_number': 5,
-                            u'lane_id': 4200,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'61MJTAAXX',
-                            u'lane_number': 6,
-                            u'lane_id': 6600,
-                            u'paired_end': False,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'30DY0AAXX',
-                            u'lane_number': 8,
-                            u'lane_id': 3800,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'C02F9ACXX',
-                            u'lane_number': 3,
-                            u'lane_id': 12300,
-                            u'paired_end': True,
-                            u'read_length': 101,
-                            u'status': u'Unknown',
-                            u'status_code': None}],
-             u'library_id': u'11154',
-             u'library_name': u'Paired ends ASDF ',
-             u'library_species': u'Mus musculus',
-             u'library_species_id': 9,
-             u'library_type': u'Paired End (non-multiplexed)',
-             u'library_type_id': 2,
-             u'made_by': u'Gary Gygax',
-             u'made_for': u'TSR',
-             u'notes': u'300 bp gel fragment',
-             u'replicate': 1,
-             u'stopping_point': u'1Aa',
-             u'successful_pM': None,
-             u'undiluted_concentration': u'29.7'}
-    }
-
-FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'}
-
-class FakeApi(object):
-    def __init__(self, *args, **kwargs):
-        self.root_url = 'http://localhost'
-
-    def get_library(self, libid):
-        lib_data = LIBDATA[libid]
-        return copy.deepcopy(lib_data)
-
-
+lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .
+
+<http://localhost/flowcell/30221AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 33 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3401> ;
+        libns:has_lane <http://localhost/lane/3402> ;
+        libns:has_lane <http://localhost/lane/3403> ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/3405> ;
+        libns:has_lane <http://localhost/lane/3406> ;
+        libns:has_lane <http://localhost/lane/3407> ;
+        libns:has_lane <http://localhost/lane/3408> ;
+        libns:flowcell_id "30221AAXX"@en .
+
+<http://localhost/lane/3401>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3402>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3403>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3404>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 4 .
+        # paired_end 1;
+        # read_length 33;
+        # status "Unknown"@en .
+<http://localhost/lane/3405>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3406>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3407>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3408>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/42JUYAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/4201> ;
+        libns:has_lane <http://localhost/lane/4202> ;
+        libns:has_lane <http://localhost/lane/4203> ;
+        libns:has_lane <http://localhost/lane/4204> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/4206> ;
+        libns:has_lane <http://localhost/lane/4207> ;
+        libns:has_lane <http://localhost/lane/4208> ;
+        libns:flowcell_id "42JUYAAXX"@en .
+
+<http://localhost/lane/4201>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/4202>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/4203>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/4204>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/4205>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 5 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/4206>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/4207>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/4208>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/61MJTAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/6601> ;
+        libns:has_lane <http://localhost/lane/6602> ;
+        libns:has_lane <http://localhost/lane/6603> ;
+        libns:has_lane <http://localhost/lane/6604> ;
+        libns:has_lane <http://localhost/lane/6605> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/6607> ;
+        libns:has_lane <http://localhost/lane/6608> ;
+        libns:flowcell_id "61MJTAAXX"@en .
+
+<http://localhost/lane/6601>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/6602>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/6603>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/6604>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/6605>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/6606>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 6 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/6607>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/6608>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/30DY0AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3801> ;
+        libns:has_lane <http://localhost/lane/3802> ;
+        libns:has_lane <http://localhost/lane/3803> ;
+        libns:has_lane <http://localhost/lane/3804> ;
+        libns:has_lane <http://localhost/lane/3805> ;
+        libns:has_lane <http://localhost/lane/3806> ;
+        libns:has_lane <http://localhost/lane/3807> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:flowcell_id "30DY0AAXX"@en .
+
+<http://localhost/lane/3801>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3802>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3803>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3804>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/3805>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3806>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3807>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3808>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 8 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+
+<http://localhost/flowcell/C02F9ACXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 101 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:flowcell_id "C02F9ACXX"@en .
+
+<http://localhost/lane/12300>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/12345/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/lane/12500>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/library/11154/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "29.7";
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:insert_size 2000 ;
+        libns:library_id "11154"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends ASDF"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+
+
+<http://localhost/library/12345/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "12.345";
+        libns:cell_line "Unknown"@en ;
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:insert_size 2000 ;
+        libns:library_id "12345"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends THING"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+"""
+HOST = "http://localhost"
 
 class TestCondorFastq(unittest.TestCase):
     def setUp(self):
@@ -168,141 +390,146 @@ class TestCondorFastq(unittest.TestCase):
             with open(filename, 'w') as stream:
                 stream.write('testfile')
 
-        self.subname = unicode('sub-11154')
-        self.subdir = os.path.join(self.tempdir, self.subname)
-        os.mkdir(self.subdir)
-
         self.result_map = ResultMap()
-        self.result_map.add_result('11154', self.subname)
+        for lib_id in [u'11154', u'12345']:
+            subname = 'sub-%s' % (lib_id,)
+            sub_dir = os.path.join(self.tempdir, subname)
+            os.mkdir(sub_dir)
+            self.result_map[lib_id] =  sub_dir
+
+        self.extract = CondorFastqExtract(HOST,
+                                          self.flowcelldir,
+                                          self.logdir)
+        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
 
     def tearDown(self):
         shutil.rmtree(self.tempdir)
         os.chdir(self.cwd)
 
+    def test_find_relavant_flowcell_ids(self):
+        expected = set(('30221AAXX',
+                        '42JUYAAXX',
+                        '61MJTAAXX',
+                        '30DY0AAXX',
+                        'C02F9ACXX'))
+        flowcell_ids = self.extract.find_relavant_flowcell_ids()
+        self.assertEqual(flowcell_ids, expected)
+
     def test_find_archive_sequence(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        self.failUnlessEqual(len(lib_db['11154']['lanes']), 5)
-        lanes = [
-            lib_db['11154']['lanes'][(u'30221AAXX', 4)],
-            lib_db['11154']['lanes'][(u'42JUYAAXX', 5)],
-            lib_db['11154']['lanes'][(u'61MJTAAXX', 6)],
-            lib_db['11154']['lanes'][(u'30DY0AAXX', 8)],
-            lib_db['11154']['lanes'][(u'C02F9ACXX', 3)],
-        ]
-        self.failUnlessEqual(len(lanes[0]), 1)
-        self.failUnlessEqual(len(lanes[1]), 2)
-        self.failUnlessEqual(len(lanes[2]), 1)
-        self.failUnlessEqual(len(lanes[3]), 1)
-        self.failUnlessEqual(len(lanes[4]), 4)
+        seqs = self.extract.find_archive_sequence_files(self.result_map)
+
+        expected = set([
+            (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
+            (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
+            (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
+            (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf')
+        ])
+        found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs])
+        self.assertEqual(expected, found)
 
     def test_find_needed_targets(self):
+        lib_db = self.extract.find_archive_sequence_files(self.result_map)
 
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        needed_targets = extract.find_missing_targets(self.result_map,
-                                                      lib_db)
-        self.failUnlessEqual(len(needed_targets), 7)
+        needed_targets = self.extract.update_fastq_targets(self.result_map,
+                                                           lib_db)
+        self.assertEqual(len(needed_targets), 9)
         srf_30221 = needed_targets[
-            self.subname + u'/11154_30221AAXX_c33_l4.fastq']
+            self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
         qseq_42JUY_r1 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
         qseq_42JUY_r2 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
         qseq_61MJT = needed_targets[
-            self.subname + u'/11154_61MJTAAXX_c76_l6.fastq']
+            self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
         split_C02F9_r1 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
         split_C02F9_r2 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
 
-        self.failUnlessEqual(len(srf_30221['srf']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1)
-        self.failUnlessEqual(len(qseq_61MJT['qseq']), 1)
-        self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2)
-        self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2)
-
-        #print '-------needed targets---------'
-        #pprint(needed_targets)
+        self.assertEqual(len(srf_30221['srf']), 1)
+        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
+        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
+        self.assertEqual(len(qseq_61MJT['qseq']), 1)
+        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
+        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)
 
     def test_generate_fastqs(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        commands = extract.build_condor_arguments(self.result_map)
+        commands = self.extract.build_condor_arguments(self.result_map)
 
         srf = commands['srf']
         qseq = commands['qseq']
         split = commands['split_fastq']
 
-        self.failUnlessEqual(len(srf), 2)
-        self.failUnlessEqual(len(qseq), 3)
-        self.failUnlessEqual(len(split), 2)
+        self.assertEqual(len(srf), 2)
+        self.assertEqual(len(qseq), 3)
+        self.assertEqual(len(split), 4)
 
         srf_data = {
-            os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30221AAXX_c33_l4.fastq'): {
                 'mid': None,
                 'ispaired': False,
                 'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                 'flowcell': u'30221AAXX',
-                'target': os.path.join(self.subname,
+                'target': os.path.join(self.result_map['11154'],
                                        u'11154_30221AAXX_c33_l4.fastq'),
             },
-            os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                 'mid': None,
                 'ispaired': True,
                 'flowcell': u'30DY0AAXX',
                 'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                 'mid': 76,
                 'target':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                  u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                 'target_right':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                  u'11154_30DY0AAXX_c151_l8_r2.fastq'),
             }
         }
         for args in srf:
             expected = srf_data[args['target']]
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
-            self.failUnlessEqual(len(args['sources']), 1)
+            self.assertEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(len(args['sources']), 1)
             _, source_filename = os.path.split(args['sources'][0])
-            self.failUnlessEqual(source_filename, expected['sources'][0])
-            self.failUnlessEqual(args['target'], expected['target'])
+            self.assertEqual(source_filename, expected['sources'][0])
+            self.assertEqual(args['target'], expected['target'])
             if args['ispaired']:
-                self.failUnlessEqual(args['target_right'],
+                self.assertEqual(args['target_right'],
                                      expected['target_right'])
             if 'mid' in expected:
-                self.failUnlessEqual(args['mid'], expected['mid'])
+                self.assertEqual(args['mid'], expected['mid'])
 
         qseq_data = {
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                 'istar': True,
                 'ispaired': True,
                 'sources': [
                     u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
             },
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                 'istar': True,
                 'ispaired': True,
                 'sources': [
                     u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
             },
-            os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_61MJTAAXX_c76_l6.fastq'): {
                 'istar': True,
                 'ispaired': False,
                 'sources': [
@@ -311,11 +538,11 @@ class TestCondorFastq(unittest.TestCase):
         }
         for args in qseq:
             expected = qseq_data[args['target']]
-            self.failUnlessEqual(args['istar'], expected['istar'])
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(args['istar'], expected['istar'])
+            self.assertEqual(args['ispaired'], expected['ispaired'])
             for i in range(len(expected['sources'])):
                 _, filename = os.path.split(args['sources'][i])
-                self.failUnlessEqual(filename, expected['sources'][i])
+                self.assertEqual(filename, expected['sources'][i])
 
 
         split_test = dict((( x['target'], x) for x in
@@ -326,64 +553,82 @@ class TestCondorFastq(unittest.TestCase):
             {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                          u'11154_NoIndex_L003_R2_002.fastq.gz'],
              'pyscript': 'desplit_fastq.pyc',
-             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
+             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
+                         u'12345_CGATGT_L003_R1_002.fastq.gz',
+                         u'12345_CGATGT_L003_R1_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
+                         u'12345_CGATGT_L003_R2_002.fastq.gz',
+                         u'12345_CGATGT_L003_R2_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'}
+             ]
          ))
         for arg in split:
             _, target = os.path.split(arg['target'])
             pyscript = split_test[target]['pyscript']
-            self.failUnless(arg['pyscript'].endswith(pyscript))
+            self.assertTrue(arg['pyscript'].endswith(pyscript))
             filename = split_test[target]['target']
-            self.failUnless(arg['target'].endswith(filename))
+            self.assertTrue(arg['target'].endswith(filename))
             for s_index in range(len(arg['sources'])):
                 s1 = arg['sources'][s_index]
                 s2 = split_test[target]['sources'][s_index]
-                self.failUnless(s1.endswith(s2))
-
-        #print '-------commands---------'
-        #pprint (commands)
+                self.assertTrue(s1.endswith(s2))
 
     def test_create_scripts(self):
-        os.chdir(self.tempdir)
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        extract.create_scripts(self.result_map)
-
-        self.failUnless(os.path.exists('srf.condor'))
+        self.extract.create_scripts(self.result_map)
+
+        self.assertTrue(os.path.exists('srf.condor'))
         with open('srf.condor', 'r') as srf:
             arguments = [ l for l in srf if l.startswith('argument') ]
             arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
+            self.assertEqual(len(arguments), 2)
+            self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                             in arguments[0])
-            self.failUnless(
-                '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
+            self.assertTrue(
+                'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
                 arguments[1])
 
-        self.failUnless(os.path.exists('qseq.condor'))
+        self.assertTrue(os.path.exists('qseq.condor'))
         with open('qseq.condor', 'r') as srf:
             arguments = [ l for l in srf if l.startswith('argument') ]
             arguments.sort()
-            self.failUnlessEqual(len(arguments), 3)
-            self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
+            self.assertEqual(len(arguments), 3)
+            self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
                             arguments[0])
-            self.failUnless(
+            self.assertTrue(
                 'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
                 arguments[1])
-            self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
+            self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
                             arguments[2])
 
-        self.failUnless(os.path.exists('split_fastq.condor'))
+        self.assertTrue(os.path.exists('split_fastq.condor'))
         with open('split_fastq.condor', 'r') as split:
             arguments = [ l for l in split if l.startswith('argument') ]
             arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \
+            self.assertEqual(len(arguments), 4)
+            # Library 11154, Lane 3 Read 1
+            self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \
+                            arguments[0])
-            self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \
+            # Library 11154, Lane 3 Read 2
+            self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \
+                            arguments[1])
+            # Library 12345, Lane 3 Read 1
+            self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
+
+            # Library 12345, Lane 3 Read 2
+            self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
+
 
 def suite():
     suite = unittest.makeSuite(TestCondorFastq, 'test')
index 334a71a9203e7ca5d621b634a62bfc9b5fd299a6..95dc1d9064688f4b63ca4c7b1a70866587deb9f7 100644 (file)
@@ -301,7 +301,7 @@ thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
         result_map = results.ResultMap()
         result_dir = os.path.join(self.sourcedir,
                                   test_results.S1_NAME)
-        result_map.add_result('1000', result_dir)
+        result_map['1000'] = result_dir
 
         submission.link_daf(result_map)
 
index 8579c582a5ea134dee51bcfd1be8cb6963a80a79..2d8cd57c30dbc661a88733a5d1af36efd9ecd3e9 100644 (file)
@@ -48,14 +48,13 @@ class TestResultMap(unittest.TestCase):
     def tearDown(self):
         shutil.rmtree(self.tempdir)
 
-
     def test_dict_like(self):
         """Make sure the result map works like an ordered dictionary
         """
         results = ResultMap()
-        results.add_result('1000', 'dir1000')
-        results.add_result('2000', 'dir2000')
-        results.add_result('1500', 'dir1500')
+        results['1000'] = 'dir1000'
+        results['2000'] = 'dir2000'
+        results['1500'] = 'dir1500'
 
         self.failUnlessEqual(results.keys(), ['1000', '2000', '1500'])
         self.failUnlessEqual(list(results.values()),
@@ -69,10 +68,15 @@ class TestResultMap(unittest.TestCase):
         self.failUnlessEqual(results['1500'], 'dir1500')
         self.failUnlessEqual(results['2000'], 'dir2000')
 
+        self.assertTrue(u'2000' in results)
+        self.assertTrue('2000' in results)
+        self.assertFalse(u'77777' in results)
+        self.assertFalse('77777' in results)
+
     def test_make_from(self):
         results = ResultMap()
-        results.add_result('1000', S1_NAME)
-        results.add_result('2000', S2_NAME)
+        results['1000'] =  S1_NAME
+        results['2000'] =  S2_NAME
 
         results.make_tree_from(self.sourcedir, self.resultdir)
 
index e7fcbc12d0f8337e099a8bbe85409ad2e8c18432..de9097ba9962849dde661bbd9ebd38521573f781 100644 (file)
@@ -11,7 +11,7 @@ WHERE {
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        libraryOntology:has_lane ?lane ;
+        libraryOntology:library ?library ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
index 850d99a944ee51009675a381aa5c8a023d521a9d..b4d4b0bc8102db05b84f16472b4508a82db48afa 100644 (file)
@@ -7,18 +7,22 @@ PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
 
 select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
 WHERE {
-  <{{submission}}> a submissionOntology:submission .
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:library ?library ;
+                   submissionOntology:name ?name .
 
   OPTIONAL { <{{submission}}> ucscDaf:control ?control }
   OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
   OPTIONAL { ?library libraryOntology:antibody ?antibody }
   OPTIONAL { ?library libraryOntology:cell_line ?cell .
-             ?cell_line cells:cell ?cell ;
-                        cells:documents ?growthProtocol . }
+             OPTIONAL { ?cell_line cells:cell ?cell ;
+                                   cells:documents ?growthProtocol . }}
   OPTIONAL { ?library ucscDaf:sex ?sex }
   OPTIONAL { ?library libraryOntology:library_id ?library_id }
   OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name }
+  OPTIONAL { ?library libraryOntology:species ?species_name .
+             ?species libraryOntology:species ?species_name ;
+                      libraryOntology:taxon_id ?taxon_id . }
   OPTIONAL { ?library libraryOntology:condition_term ?treatment }
   OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
   OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
@@ -32,8 +36,6 @@ WHERE {
   OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
   OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
 
-  <{{submission}}> submissionOntology:library ?library ;
-                   submissionOntology:name ?name .
   ?species libraryOntology:species ?species_name ;
            libraryOntology:taxon_id ?taxon_id .
 
index 969ff53836bd37b3d4f82747cf3d94c806852ca9..8ae3ed9ef78e0d737f1d7cada296a740e2ff553d 100644 (file)
@@ -1,5 +1,6 @@
-{% for name, value in series %}{{name}}={{value}}
-{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
+{% for name, value in series %}
+{{name}}={{value}}{% endfor %}{% for row in samples %}
+^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
 !Sample_series_id={{ series_id }}
 !Sample_extract_protocol={{ row.extractProtocol|safe }}
 !Sample_data_processing={{ row.dataProtocol|safe }}
 !Sample_molecule_ch1={{ row.extractMolecule }}
-!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1=replicate: {{ row.replicate }}
-{% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1=cell: {{ row.cell }}
-{% endspaceless %}{% endif %}
-{% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1=readType: {{ row.readType }}
-{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1=cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for run in row.run %}
-!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
-!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.image_software }}-{{ run.image_version }}{% endfor %}{% for raw in row.raw %}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}{% if row.replicate %}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}{% endif %}{% if row.cell %}
+!Sample_characteristics_ch1=cell: {{ row.cell }}{% endif %}{% if row.readType %}
+!Sample_characteristics_ch1=readType: {{ row.readType }}{% endif %}{% if row.antibody %}
+!Sample_characteristics_ch1=cell: {{ row.antibody }}{% endif %}{% for run in row.run %}{% if run.image_software %}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}{% endif %}{% if run.basecall_software %}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.basecall_software }}-{{ run.basecall_version }}{% endif %}{% endfor %}{% for raw in row.raw %}
 !Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
 !Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
 !Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
index 29df21ec492beeaee42b54dc65e7216753f6a3a7..89d9e5b4d16290e940878d7cc6844d8b64d80c79 100644 (file)
@@ -2,6 +2,7 @@
 """
 import collections
 from datetime import datetime
+from glob import glob
 from urlparse import urlparse, urlunparse
 from urllib2 import urlopen
 import logging
@@ -14,27 +15,14 @@ import RDF
 
 logger = logging.getLogger(__name__)
 
-# standard ontology namespaces
-owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
-dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
-rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
-rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
-xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
-
-# internal ontologies
-submissionOntology = RDF.NS(
-    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
-dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
-libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
-inventoryOntology = RDF.NS(
-    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
-submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
-geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')
+from htsworkflow.util.rdfns import *
+
+SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
+INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
 
 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
 
-
 def sparql_query(model, query_filename, output_format='text'):
     """Execute sparql query from file
     """
@@ -203,6 +191,14 @@ def simplify_uri(uri):
     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
     'was=foo'
     """
+    if isinstance(uri, RDF.Node):
+        if uri.is_resource():
+            uri = uri.uri
+        else:
+            raise ValueError("Can't simplify an RDF literal")
+    if isinstance(uri, RDF.Uri):
+        uri = str(uri)
+
     parsed = urlparse(uri)
     if len(parsed.query) > 0:
         return parsed.query
@@ -214,7 +210,7 @@ def simplify_uri(uri):
                 return element
     raise ValueError("Unable to simplify %s" % (uri,))
 
-def simplifyUri(namespace, term):
+def stripNamespace(namespace, term):
     """Remove the namespace portion of a term
 
     returns None if they aren't in common
@@ -237,10 +233,10 @@ def get_model(model_name=None, directory=None):
         directory = os.getcwd()
 
     if model_name is None:
-        storage = RDF.MemoryStorage()
+        storage = RDF.MemoryStorage(options_string="contexts='yes'")
         logger.info("Using RDF Memory model")
     else:
-        options = "hash-type='bdb',dir='{0}'".format(directory)
+        options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
         storage = RDF.HashStorage(model_name,
                       options=options)
         logger.info("Using {0} with options {1}".format(model_name, options))
@@ -249,37 +245,110 @@ def get_model(model_name=None, directory=None):
 
 
 def load_into_model(model, parser_name, path, ns=None):
+    if type(ns) in types.StringTypes:
+        ns = RDF.Uri(ns)
+
+    if isinstance(path, RDF.Node):
+        if path.is_resource():
+            path = str(path.uri)
+        else:
+            raise ValueError("url to load can't be a RDF literal")
+
     url_parts = list(urlparse(path))
-    if len(url_parts[0]) == 0:
+    if len(url_parts[0]) == 0 or url_parts[0] == 'file':
         url_parts[0] = 'file'
         url_parts[2] = os.path.abspath(url_parts[2])
+    if parser_name is None or parser_name == 'guess':
+        parser_name = guess_parser_by_extension(path)
     url = urlunparse(url_parts)
-    logger.info("Opening %s" % (url,))
-    req = urlopen(url)
-    logger.debug("request status: %s" % (req.code,))
-    if parser_name is None:
-        content_type = req.headers.get('Content-Type', None)
-        parser_name = guess_parser(content_type, path)
-        logger.debug("Guessed parser: %s" % (parser_name,))
-    data = req.read()
-    load_string_into_model(model, parser_name, data, ns)
+    logger.info("Opening {0} with parser {1}".format(url, parser_name))
+
+    rdf_parser = RDF.Parser(name=parser_name)
 
+    statements = []
+    retries = 3
+    while retries > 0:
+        try:
+            retries -= 1
+            statements = rdf_parser.parse_as_stream(url, ns)
+            retries = 0
+        except RDF.RedlandError, e:
+            errmsg = "RDF.RedlandError: {0} {1} tries remaining"
+            logger.error(errmsg.format(str(e), retries))
+
+    for s in statements:
+        conditionally_add_statement(model, s, ns)
 
 def load_string_into_model(model, parser_name, data, ns=None):
+    ns = fixup_namespace(ns)
+    logger.debug("load_string_into_model parser={0}, len={1}".format(
+        parser_name, len(data)))
+    rdf_parser = RDF.Parser(name=parser_name)
+
+    for s in rdf_parser.parse_string_as_stream(data, ns):
+        conditionally_add_statement(model, s, ns)
+
+
+def fixup_namespace(ns):
     if ns is None:
         ns = RDF.Uri("http://localhost/")
+    elif type(ns) in types.StringTypes:
+        ns = RDF.Uri(ns)
+    elif not(isinstance(ns, RDF.Uri)):
+        errmsg = "Namespace should be string or uri not {0}"
+        raise ValueError(errmsg.format(str(type(ns))))
+    return ns
+
+
+def conditionally_add_statement(model, s, ns):
     imports = owlNS['imports']
-    rdf_parser = RDF.Parser(name=parser_name)
-    for s in rdf_parser.parse_string_as_stream(data, ns):
-        if s.predicate == imports:
-            obj = str(s.object)
-            logger.info("Importing %s" % (obj,))
-            load_into_model(model, None, obj, ns)
-        if s.object.is_literal():
-            value_type = get_node_type(s.object)
-            if value_type == 'string':
-                s.object = sanitize_literal(s.object)
-        model.add_statement(s)
+    if s.predicate == imports:
+        obj = str(s.object)
+        logger.info("Importing %s" % (obj,))
+        load_into_model(model, None, obj, ns)
+    if s.object.is_literal():
+        value_type = get_node_type(s.object)
+        if value_type == 'string':
+            s.object = sanitize_literal(s.object)
+    model.add_statement(s)
+
+
+def add_default_schemas(model, schema_path=None):
+    """Add default schemas to a model
+    Looks for turtle files in either htsworkflow/util/schemas
+    or in the list of directories provided in schema_path
+    """
+
+    if schema_path is None:
+        path, _ = os.path.split(__file__)
+        schema_path = [os.path.join(path, 'schemas')]
+    elif type(schema_path) in types.StringTypes:
+        schema_path = [schema_path]
+
+    for p in schema_path:
+        for f in glob(os.path.join(p, '*.turtle')):
+            add_schema(model, f)
+
+def add_schema(model, filename):
+    """Add a schema to a model.
+
+    Main difference from 'load_into_model' is it tags it with
+    a RDFlib context so I can remove them later.
+    """
+    parser = RDF.Parser(name='turtle')
+    context = RDF.Node(RDF.Uri(SCHEMAS_URL))
+    url = 'file://' + filename
+    for s in parser.parse_as_stream(url):
+        try:
+            model.append(s, context)
+        except RDF.RedlandError as e:
+            logger.error("%s with %s", str(e), str(s))
+
+
+def remove_schemas(model):
+    """Remove statements labeled with our schema context"""
+    context = RDF.Node(RDF.Uri(SCHEMAS_URL))
+    model.context_remove_statements(context)
 
 
 def sanitize_literal(node):
@@ -310,20 +379,23 @@ def sanitize_literal(node):
 
 
 def guess_parser(content_type, pathname):
-    if content_type in ('application/rdf+xml'):
+    if content_type in ('application/rdf+xml',):
         return 'rdfxml'
-    elif content_type in ('application/x-turtle'):
+    elif content_type in ('application/x-turtle',):
         return 'turtle'
-    elif content_type in ('text/html'):
+    elif content_type in ('text/html',):
+        return 'rdfa'
+    elif content_type is None or content_type in ('text/plain',):
+        return guess_parser_by_extension(pathname)
+    return 'guess'
+
+def guess_parser_by_extension(pathname):
+    _, ext = os.path.splitext(pathname)
+    if ext in ('.xml', '.rdf'):
+        return 'rdfxml'
+    elif ext in ('.html',):
         return 'rdfa'
-    elif content_type is None:
-        _, ext = os.path.splitext(pathname)
-        if ext in ('xml', 'rdf'):
-            return 'rdfxml'
-        elif ext in ('html'):
-            return 'rdfa'
-        elif ext in ('turtle'):
-            return 'turtle'
+    elif ext in ('.turtle',):
+        return 'turtle'
     return 'guess'
 
 def get_serializer(name='turtle'):
@@ -331,10 +403,14 @@ def get_serializer(name='turtle'):
     """
     writer = RDF.Serializer(name=name)
     # really standard stuff
-    writer.set_namespace('owl', owlNS._prefix)
     writer.set_namespace('rdf', rdfNS._prefix)
     writer.set_namespace('rdfs', rdfsNS._prefix)
+    writer.set_namespace('owl', owlNS._prefix)
+    writer.set_namespace('dc', dcNS._prefix)
+    writer.set_namespace('xml', xmlNS._prefix)
     writer.set_namespace('xsd', xsdNS._prefix)
+    writer.set_namespace('vs', vsNS._prefix)
+    writer.set_namespace('wot', wotNS._prefix)
 
     # should these be here, kind of specific to an application
     writer.set_namespace('libraryOntology', libraryOntology._prefix)
@@ -343,6 +419,9 @@ def get_serializer(name='turtle'):
     return writer
 
 
-def dump_model(model):
+def dump_model(model, destination=None):
+    if destination is None:
+        destination = sys.stdout
     serializer = get_serializer()
-    print serializer.serialize_model_to_string(model)
+    destination.write(serializer.serialize_model_to_string(model))
+    destination.write(os.linesep)
diff --git a/htsworkflow/util/rdfinfer.py b/htsworkflow/util/rdfinfer.py
new file mode 100644 (file)
index 0000000..8f12f5c
--- /dev/null
@@ -0,0 +1,215 @@
+import logging
+import os
+import sys
+
+import RDF
+
+from htsworkflow.util.rdfns import *
+from htsworkflow.util.rdfhelp import SCHEMAS_URL
+
+INFER_URL='http://jumpgate.caltech.edu/phony/infer'
+
+class Infer(object):
+    """Provide some simple inference.
+
+    Provides a few default rules as methods starting with _rule_
+    """
+    def __init__(self, model):
+        self.model = model
+        self._context = RDF.Node(RDF.Uri(INFER_URL))
+
+
+    def think(self, max_iterations=None):
+        """Update model with inferred statements.
+
+        max_iterations puts a limit on the number of times we
+        run through the loop.
+
+        it will also try to exit if nothing new has been inferred.
+
+        Also this is the naive solution.
+        There's probably better ones out there.
+        """
+        iterations = 0
+        while max_iterations is None or iterations != max_iterations:
+            iterations += 1
+            starting_size = self.model.size()
+
+            for method_name in dir(self):
+                if method_name.startswith('_rule_'):
+                    method = getattr(self, method_name)
+                    method()
+            if self.model.size() == starting_size:
+                # we didn't add anything new
+                return
+
+    def validate(self, destination=None):
+        if destination is None:
+            destination = sys.stdout
+
+        for msg in self.run_validation():
+            destination.write(msg)
+            destination.write(os.linesep)
+
+    def run_validation(self):
+        """Apply validation rules to our model.
+        """
+        for method_name in dir(self):
+            if method_name.startswith('_validate_'):
+                method = getattr(self, method_name)
+                for msg in method():
+                    yield msg
+
+
+    def _rule_class(self):
+        """resolve class chains.
+        e.g. if a is a BClass, and BClass is an AClass
+        then a is both a BClass and AClass.
+        """
+        body = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?obj ?class
+        where  {
+          ?alias a ?class .
+          ?obj a ?alias .
+        }"""
+        query = RDF.SPARQLQuery(body)
+        for r in query.execute(self.model):
+            s = RDF.Statement(r['obj'], rdfNS['type'], r['class'])
+            if s not in self.model:
+                self.model.append(s, self._context)
+
+    def _rule_subclass(self):
+        """A subclass is a parent class
+        """
+        body = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?obj ?subclass ?parent
+        where  {
+          ?subclass rdfs:subClassOf ?parent .
+          ?obj a ?subclass .
+        }"""
+        query = RDF.SPARQLQuery(body)
+        for r in query.execute(self.model):
+            s = RDF.Statement(r['obj'], rdfNS['type'], r['parent'])
+            if s not in self.model:
+                self.model.append(s, self._context)
+
+    def _rule_inverse_of(self):
+        """Add statements computed with inverseOf
+        """
+        body = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?o ?reverse ?s
+        where  {
+            ?s ?term ?o .
+            ?s a ?subject_type .
+            ?o a ?object_type .
+            ?term owl:inverseOf ?reverse .
+            ?term rdfs:domain ?subject_type ;
+                  rdfs:range ?object_type .
+            ?reverse rdfs:domain ?object_type ;
+                  rdfs:range ?subject_type .
+        }"""
+        query = RDF.SPARQLQuery(body)
+
+        # inferred inverse statements are appended directly to the model
+        for r in query.execute(self.model):
+            s = RDF.Statement(r['o'], r['reverse'], r['s'])
+            if s not in self.model:
+                self.model.append(s, self._context)
+
+
+    def _validate_types(self):
+        body = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?subject ?predicate ?object
+        where {
+          ?subject ?predicate ?object
+          OPTIONAL { ?subject a ?class }
+          FILTER(!bound(?class))
+        }
+        """
+        query = RDF.SPARQLQuery(body)
+        errmsg = "Missing type for: {0}"
+        for r in query.execute(self.model):
+            yield errmsg.format(str(r['subject']))
+
+    def _validate_undefined_properties(self):
+        """Find properties that aren't defined.
+        """
+        body = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?subject ?predicate ?object
+        where {
+            ?subject ?predicate ?object
+            OPTIONAL { ?predicate a ?predicate_class }
+            FILTER(!bound(?predicate_class))
+        }"""
+        query = RDF.SPARQLQuery(body)
+        msg = "Undefined property in {0} {1} {2}"
+        for r in query.execute(self.model):
+            yield msg.format(str(r['subject']),
+                             str(r['predicate']),
+                             str(r['object']))
+
+    def _validate_property_types(self):
+        """Check statement subjects and objects against each predicate's declared rdfs:domain and rdfs:range.
+        """
+        property_template = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+        select ?type
+        where {{
+            <{predicate}> a rdf:Property ;
+                        {space} ?type .
+        }}"""
+
+        wrong_domain_type = "Domain of {0} was not {1}"
+        wrong_range_type = "Range of {0} was not {1}"
+
+        # statements tagged with the schema context are definitions, not data
+        schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
+        for s, context in self.model.as_stream_context():
+            if context == schema:
+                continue
+            # check domain
+            query = RDF.SPARQLQuery(property_template.format(
+                predicate=s.predicate,
+                space='rdfs:domain'))
+            for r in query.execute(self.model):
+                if r['type'] == rdfsNS['Resource']:
+                    continue
+                check = RDF.Statement(s.subject, rdfNS['type'], r['type'])
+                if not self.model.contains_statement(check):
+                    yield wrong_domain_type.format(str(s),
+                                                   str(r['type']))
+            # check range
+            query = RDF.SPARQLQuery(property_template.format(
+                predicate=s.predicate,
+                space='rdfs:range'))
+            for r in query.execute(self.model):
+                if r['type'] == rdfsNS['Resource']:
+                    continue
+                check = RDF.Statement(s.object, rdfNS['type'], r['type'])
+                if not self.model.contains_statement(check):
+                    yield wrong_range_type.format(str(s),
+                                                  str(r['type']))
+
+        return
+
diff --git a/htsworkflow/util/rdfns.py b/htsworkflow/util/rdfns.py
new file mode 100644 (file)
index 0000000..d2164ee
--- /dev/null
@@ -0,0 +1,25 @@
+"""Namespace definitions
+
+All in one place to make import rdfns.* work safely
+"""
+from RDF import NS
+
+# standard ontology namespaces
+rdfNS = NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+rdfsNS = NS("http://www.w3.org/2000/01/rdf-schema#")
+owlNS = NS('http://www.w3.org/2002/07/owl#')
+dcNS = NS("http://purl.org/dc/elements/1.1/")
+xmlNS = NS('http://www.w3.org/XML/1998/namespace')
+xsdNS = NS("http://www.w3.org/2001/XMLSchema#")
+vsNS = NS('http://www.w3.org/2003/06/sw-vocab-status/ns#')
+wotNS = NS('http://xmlns.com/wot/0.1/')
+
+# internal ontologies
+submissionOntology = NS(
+    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
+dafTermOntology = NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
+libraryOntology = NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
+inventoryOntology = NS(
+    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
+submissionLog = NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
+geoSoftNS = NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')
diff --git a/htsworkflow/util/schemas/dc.turtle b/htsworkflow/util/schemas/dc.turtle
new file mode 100644 (file)
index 0000000..2aa6337
--- /dev/null
@@ -0,0 +1,28 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix grddl: <http://www.w3.org/2003/g/data-view#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix vs: <http://www.w3.org/2003/06/sw-vocab-status/ns#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix wot: <http://xmlns.com/wot/0.1/> .
+
+# this is just a subset of dublin core
+<http://purl.org/dc/elements/1.1/>
+    dc:title "DCMI Metadata Terms" ;
+    rdfs:comment "Metadata terms maintained by the Dublin Core Metadata Initiative" ;
+    a owl:Ontology ;
+    rdfs:seeAlso <http://dublincore.org/documents/dcmi-terms/> .
+
+dc:title
+    a rdf:Property ;
+    rdfs:comment "A name given to the resource"@en ;
+    rdfs:range rdfs:Literal .
+
+dc:description 
+    a rdf:Property ;
+    rdfs:label "Description"@en ;
+    rdfs:comment "An account of the resource"@en ;
+    rdfs:range rdfs:Literal .
diff --git a/htsworkflow/util/schemas/owl.turtle b/htsworkflow/util/schemas/owl.turtle
new file mode 100644 (file)
index 0000000..fd2c392
--- /dev/null
@@ -0,0 +1,556 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix grddl: <http://www.w3.org/2003/g/data-view#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix vs: <http://www.w3.org/2003/06/sw-vocab-status/ns#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix wot: <http://xmlns.com/wot/0.1/> .
+
+
+<http://www.w3.org/2002/07/owl> a owl:Ontology ;
+     dc:title "The OWL 2 Schema vocabulary (OWL 2)" ;
+     rdfs:comment """
+  This ontology partially describes the built-in classes and
+  properties that together form the basis of the RDF/XML syntax of OWL 2.
+  The content of this ontology is based on Tables 6.1 and 6.2
+  in Section 6.4 of the OWL 2 RDF-Based Semantics specification,
+  available at http://www.w3.org/TR/owl2-rdf-based-semantics/.
+  Please note that those tables do not include the different annotations
+  (labels, comments and rdfs:isDefinedBy links) used in this file.
+  Also note that the descriptions provided in this ontology do not
+  provide a complete and correct formal description of either the syntax
+  or the semantics of the introduced terms (please see the OWL 2
+  recommendations for the complete and normative specifications).
+  Furthermore, the information provided by this ontology may be
+  misleading if not used with care. This ontology SHOULD NOT be imported
+  into OWL ontologies. Importing this file into an OWL 2 DL ontology
+  will cause it to become an OWL 2 Full ontology and may have other,
+  unexpected, consequences.
+   """ ;
+     rdfs:isDefinedBy
+          <http://www.w3.org/TR/owl2-mapping-to-rdf/>,
+          <http://www.w3.org/TR/owl2-rdf-based-semantics/>,
+          <http://www.w3.org/TR/owl2-syntax/> ;
+     rdfs:seeAlso   <http://www.w3.org/TR/owl2-rdf-based-semantics/#table-axiomatic-classes>,
+                    <http://www.w3.org/TR/owl2-rdf-based-semantics/#table-axiomatic-properties> ;
+     owl:imports <http://www.w3.org/2000/01/rdf-schema> ;
+     owl:versionIRI <http://www.w3.org/2002/07/owl> ;
+     owl:versionInfo "$Date: 2009/11/15 10:54:12 $" ;
+     # grddl:namespaceTransformation <http://dev.w3.org/cvsweb/2009/owl-grddl/owx2rdf.xsl>
+     .
+
+
+owl:AllDifferent a rdfs:Class ;
+     rdfs:label "AllDifferent" ;
+     rdfs:comment "The class of collections of pairwise different individuals." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:AllDisjointClasses a rdfs:Class ;
+     rdfs:label "AllDisjointClasses" ;
+     rdfs:comment "The class of collections of pairwise disjoint classes." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:AllDisjointProperties a rdfs:Class ;
+     rdfs:label "AllDisjointProperties" ;
+     rdfs:comment "The class of collections of pairwise disjoint properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:Annotation a rdfs:Class ;
+     rdfs:label "Annotation" ;
+     rdfs:comment "The class of annotated annotations for which the RDF serialization consists of an annotated subject, predicate and object." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:AnnotationProperty a rdfs:Class ;
+     rdfs:label "AnnotationProperty" ;
+     rdfs:comment "The class of annotation properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:AsymmetricProperty a rdfs:Class ;
+     rdfs:label "AsymmetricProperty" ;
+     rdfs:comment "The class of asymmetric properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:Axiom a rdfs:Class ;
+     rdfs:label "Axiom" ;
+     rdfs:comment "The class of annotated axioms for which the RDF serialization consists of an annotated subject, predicate and object." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:Class a rdfs:Class ;
+     rdfs:label "Class" ;
+     rdfs:comment "The class of OWL classes." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Class .
+
+owl:DataRange a rdfs:Class ;
+     rdfs:label "DataRange" ;
+     rdfs:comment "The class of OWL data ranges, which are special kinds of datatypes. Note: The use of the IRI owl:DataRange has been deprecated as of OWL 2. The IRI rdfs:Datatype SHOULD be used instead." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Datatype .
+
+owl:DatatypeProperty a rdfs:Class ;
+     rdfs:label "DatatypeProperty" ;
+     rdfs:comment "The class of data properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:DeprecatedClass a rdfs:Class ;
+     rdfs:label "DeprecatedClass" ;
+     rdfs:comment "The class of deprecated classes." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Class .
+
+owl:DeprecatedProperty a rdfs:Class ;
+     rdfs:label "DeprecatedProperty" ;
+     rdfs:comment "The class of deprecated properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:FunctionalProperty a rdfs:Class ;
+     rdfs:label "FunctionalProperty" ;
+     rdfs:comment "The class of functional properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:InverseFunctionalProperty a rdfs:Class ;
+     rdfs:label "InverseFunctionalProperty" ;
+     rdfs:comment "The class of inverse-functional properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:IrreflexiveProperty a rdfs:Class ;
+     rdfs:label "IrreflexiveProperty" ;
+     rdfs:comment "The class of irreflexive properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:NamedIndividual a rdfs:Class ;
+     rdfs:label "NamedIndividual" ;
+     rdfs:comment "The class of named individuals." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:Thing .
+
+owl:NegativePropertyAssertion a rdfs:Class ;
+     rdfs:label "NegativePropertyAssertion" ;
+     rdfs:comment "The class of negative property assertions." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:Nothing a owl:Class ;
+     rdfs:label "Nothing" ;
+     rdfs:comment "This is the empty class." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:Thing .
+
+owl:ObjectProperty a rdfs:Class ;
+     rdfs:label "ObjectProperty" ;
+     rdfs:comment "The class of object properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:Ontology a rdfs:Class ;
+     rdfs:label "Ontology" ;
+     rdfs:comment "The class of ontologies." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdfs:Resource .
+
+owl:OntologyProperty a rdfs:Class ;
+     rdfs:label "OntologyProperty" ;
+     rdfs:comment "The class of ontology properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf rdf:Property .
+
+owl:ReflexiveProperty a rdfs:Class ;
+     rdfs:label "ReflexiveProperty" ;
+     rdfs:comment "The class of reflexive properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:Restriction a rdfs:Class ;
+     rdfs:label "Restriction" ;
+     rdfs:comment "The class of property restrictions." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:Class .
+
+owl:SymmetricProperty a rdfs:Class ;
+     rdfs:label "SymmetricProperty" ;
+     rdfs:comment "The class of symmetric properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:TransitiveProperty a rdfs:Class ;
+     rdfs:label "TransitiveProperty" ;
+     rdfs:comment "The class of transitive properties." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:subClassOf owl:ObjectProperty .
+
+owl:Thing a owl:Class , rdfs:Class ;
+     rdfs:label "Thing" ;
+     rdfs:comment "The class of OWL individuals." ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> .
+
+owl:allValuesFrom a rdf:Property ;
+     rdfs:label "allValuesFrom" ;
+     rdfs:comment "The property that determines the class that a universal property restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Class .
+
+owl:annotatedProperty a rdf:Property ;
+     rdfs:label "annotatedProperty" ;
+     rdfs:comment "The property that determines the predicate of an annotated axiom or annotated annotation." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:annotatedSource a rdf:Property ;
+     rdfs:label "annotatedSource" ;
+     rdfs:comment "The property that determines the subject of an annotated axiom or annotated annotation." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:annotatedTarget a rdf:Property ;
+     rdfs:label "annotatedTarget" ;
+     rdfs:comment "The property that determines the object of an annotated axiom or annotated annotation." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:assertionProperty a rdf:Property ;
+     rdfs:label "assertionProperty" ;
+     rdfs:comment "The property that determines the predicate of a negative property assertion." ;
+     rdfs:domain owl:NegativePropertyAssertion ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:Property .
+
+owl:backwardCompatibleWith a owl:AnnotationProperty, owl:OntologyProperty ;
+     rdfs:label "backwardCompatibleWith" ;
+     rdfs:comment "The annotation property that indicates that a given ontology is backward compatible with another ontology." ;
+     rdfs:domain owl:Ontology ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Ontology .
+
+owl:bottomDataProperty a owl:DatatypeProperty ;
+     rdfs:label "bottomDataProperty" ;
+     rdfs:comment "The data property that does not relate any individual to any data value." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Literal .
+
+owl:bottomObjectProperty a owl:ObjectProperty ;
+     rdfs:label "bottomObjectProperty" ;
+     rdfs:comment "The object property that does not relate any two individuals." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:cardinality a rdf:Property ;
+     rdfs:label "cardinality" ;
+     rdfs:comment "The property that determines the cardinality of an exact cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:complementOf a rdf:Property ;
+     rdfs:label "complementOf" ;
+     rdfs:comment "The property that determines that a given class is the complement of another class." ;
+     rdfs:domain owl:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Class .
+
+owl:datatypeComplementOf a rdf:Property ;
+     rdfs:label "datatypeComplementOf" ;
+     rdfs:comment "The property that determines that a given data range is the complement of another data range with respect to the data domain." ;
+     rdfs:domain rdfs:Datatype ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Datatype .
+
+owl:deprecated a owl:AnnotationProperty ;
+     rdfs:label "deprecated" ;
+     rdfs:comment "The annotation property that indicates that a given entity has been deprecated." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:differentFrom a rdf:Property ;
+     rdfs:label "differentFrom" ;
+     rdfs:comment "The property that determines that two given individuals are different." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:disjointUnionOf a rdf:Property ;
+     rdfs:label "disjointUnionOf" ;
+     rdfs:comment "The property that determines that a given class is equivalent to the disjoint union of a collection of other classes." ;
+     rdfs:domain owl:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:disjointWith a rdf:Property ;
+     rdfs:label "disjointWith" ;
+     rdfs:comment "The property that determines that two given classes are disjoint." ;
+     rdfs:domain owl:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Class .
+
+owl:distinctMembers a rdf:Property ;
+     rdfs:label "distinctMembers" ;
+     rdfs:comment "The property that determines the collection of pairwise different individuals in a owl:AllDifferent axiom." ;
+     rdfs:domain owl:AllDifferent ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:equivalentClass a rdf:Property ;
+     rdfs:label "equivalentClass" ;
+     rdfs:comment "The property that determines that two given classes are equivalent, and that is used to specify datatype definitions." ;
+     rdfs:domain rdfs:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Class .
+
+owl:equivalentProperty a rdf:Property ;
+     rdfs:label "equivalentProperty" ;
+     rdfs:comment "The property that determines that two given properties are equivalent." ;
+     rdfs:domain rdf:Property ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:Property .
+
+owl:hasKey a rdf:Property ;
+     rdfs:label "hasKey" ;
+     rdfs:comment "The property that determines the collection of properties that jointly build a key." ;
+     rdfs:domain owl:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:hasSelf a rdf:Property ;
+     rdfs:label "hasSelf" ;
+     rdfs:comment "The property that determines the property that a self restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:hasValue a rdf:Property ;
+     rdfs:label "hasValue" ;
+     rdfs:comment "The property that determines the individual that a has-value restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:imports a owl:OntologyProperty ;
+     rdfs:label "imports" ;
+     rdfs:comment "The property that is used for importing other ontologies into a given ontology." ;
+     rdfs:domain owl:Ontology ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Ontology .
+
+owl:incompatibleWith a owl:AnnotationProperty, owl:OntologyProperty ;
+     rdfs:label "incompatibleWith" ;
+     rdfs:comment "The annotation property that indicates that a given ontology is incompatible with another ontology." ;
+     rdfs:domain owl:Ontology ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Ontology .
+
+owl:intersectionOf a rdf:Property ;
+     rdfs:label "intersectionOf" ;
+     rdfs:comment "The property that determines the collection of classes or data ranges that build an intersection." ;
+     rdfs:domain rdfs:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:inverseOf a rdf:Property ;
+     rdfs:label "inverseOf" ;
+     rdfs:comment "The property that determines that two given properties are inverse." ;
+     rdfs:domain owl:ObjectProperty ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:ObjectProperty .
+
+owl:maxCardinality a rdf:Property ;
+     rdfs:label "maxCardinality" ;
+     rdfs:comment "The property that determines the cardinality of a maximum cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:maxQualifiedCardinality a rdf:Property ;
+     rdfs:label "maxQualifiedCardinality" ;
+     rdfs:comment "The property that determines the cardinality of a maximum qualified cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:members a rdf:Property ;
+     rdfs:label "members" ;
+     rdfs:comment "The property that determines the collection of members in either a owl:AllDifferent, owl:AllDisjointClasses or owl:AllDisjointProperties axiom." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:minCardinality a rdf:Property ;
+     rdfs:label "minCardinality" ;
+     rdfs:comment "The property that determines the cardinality of a minimum cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:minQualifiedCardinality a rdf:Property ;
+     rdfs:label "minQualifiedCardinality" ;
+     rdfs:comment "The property that determines the cardinality of a minimum qualified cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:onClass a rdf:Property ;
+     rdfs:label "onClass" ;
+     rdfs:comment "The property that determines the class that a qualified object cardinality restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Class .
+
+owl:onDataRange a rdf:Property ;
+     rdfs:label "onDataRange" ;
+     rdfs:comment "The property that determines the data range that a qualified data cardinality restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Datatype .
+
+owl:onDatatype a rdf:Property ;
+     rdfs:label "onDatatype" ;
+     rdfs:comment "The property that determines the datatype that a datatype restriction refers to." ;
+     rdfs:domain rdfs:Datatype ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Datatype .
+
+owl:oneOf a rdf:Property ;
+     rdfs:label "oneOf" ;
+     rdfs:comment "The property that determines the collection of individuals or data values that build an enumeration." ;
+     rdfs:domain rdfs:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:onProperties a rdf:Property ;
+     rdfs:label "onProperties" ;
+     rdfs:comment "The property that determines the n-tuple of properties that a property restriction on an n-ary data range refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:onProperty a rdf:Property ;
+     rdfs:label "onProperty" ;
+     rdfs:comment "The property that determines the property that a property restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:Property .
+
+owl:priorVersion a owl:AnnotationProperty, owl:OntologyProperty ;
+     rdfs:label "priorVersion" ;
+     rdfs:comment "The annotation property that indicates the predecessor ontology of a given ontology." ;
+     rdfs:domain owl:Ontology ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Ontology .
+
+owl:propertyChainAxiom a rdf:Property ;
+     rdfs:label "propertyChainAxiom" ;
+     rdfs:comment "The property that determines the n-tuple of properties that build a sub property chain of a given property." ;
+     rdfs:domain owl:ObjectProperty ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:propertyDisjointWith a rdf:Property ;
+     rdfs:label "propertyDisjointWith" ;
+     rdfs:comment "The property that determines that two given properties are disjoint." ;
+     rdfs:domain rdf:Property ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:Property .
+
+owl:qualifiedCardinality a rdf:Property ;
+     rdfs:label "qualifiedCardinality" ;
+     rdfs:comment "The property that determines the cardinality of an exact qualified cardinality restriction." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range xsd:nonNegativeInteger .
+
+owl:sameAs a rdf:Property ;
+     rdfs:label "sameAs" ;
+     rdfs:comment "The property that determines that two given individuals are equal." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:someValuesFrom a rdf:Property ;
+     rdfs:label "someValuesFrom" ;
+     rdfs:comment "The property that determines the class that an existential property restriction refers to." ;
+     rdfs:domain owl:Restriction ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Class .
+
+owl:sourceIndividual a rdf:Property ;
+     rdfs:label "sourceIndividual" ;
+     rdfs:comment "The property that determines the subject of a negative property assertion." ;
+     rdfs:domain owl:NegativePropertyAssertion ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:targetIndividual a rdf:Property ;
+     rdfs:label "targetIndividual" ;
+     rdfs:comment "The property that determines the object of a negative object property assertion." ;
+     rdfs:domain owl:NegativePropertyAssertion ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:targetValue a rdf:Property ;
+     rdfs:label "targetValue" ;
+     rdfs:comment "The property that determines the value of a negative data property assertion." ;
+     rdfs:domain owl:NegativePropertyAssertion ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Literal .
+
+owl:topDataProperty a owl:DatatypeProperty ;
+     rdfs:label "topDataProperty" ;
+     rdfs:comment "The data property that relates every individual to every data value." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Literal .
+
+owl:topObjectProperty a owl:ObjectProperty ;
+     rdfs:label "topObjectProperty" ;
+     rdfs:comment "The object property that relates every two individuals." ;
+     rdfs:domain owl:Thing ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Thing .
+
+owl:unionOf a rdf:Property ;
+     rdfs:label "unionOf" ;
+     rdfs:comment "The property that determines the collection of classes or data ranges that build a union." ;
+     rdfs:domain rdfs:Class ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
+
+owl:versionInfo a owl:AnnotationProperty ;
+     rdfs:label "versionInfo" ;
+     rdfs:comment "The annotation property that provides version information for an ontology or another OWL construct." ;
+     rdfs:domain rdfs:Resource ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdfs:Resource .
+
+owl:versionIRI a owl:OntologyProperty ;
+     rdfs:label "versionIRI" ;
+     rdfs:comment "The property that identifies the version IRI of an ontology." ;
+     rdfs:domain owl:Ontology ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range owl:Ontology .
+
+owl:withRestrictions a rdf:Property ;
+     rdfs:label "withRestrictions" ;
+     rdfs:comment "The property that determines the collection of facet-value pairs that define a datatype restriction." ;
+     rdfs:domain rdfs:Datatype ;
+     rdfs:isDefinedBy <http://www.w3.org/2002/07/owl#> ;
+     rdfs:range rdf:List .
diff --git a/htsworkflow/util/schemas/rdf.turtle b/htsworkflow/util/schemas/rdf.turtle
new file mode 100644 (file)
index 0000000..ce337be
--- /dev/null
@@ -0,0 +1,258 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix grddl: <http://www.w3.org/2003/g/data-view#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix vs: <http://www.w3.org/2003/06/sw-vocab-status/ns#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix wot: <http://xmlns.com/wot/0.1/> .
+
+<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    dc:description "This is the RDF Schema for the RDF vocabulary defined in the RDF namespace."@en ;
+    dc:title "The RDF Vocabulary (RDF)"@en ;
+    a owl:Ontology ;
+    rdfs:seeAlso <http://www.w3.org/2000/01/rdf-schema-more> .
+
+rdf:Alt
+    a rdfs:Class ;
+    rdfs:comment "The class of containers of alternatives." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Alt" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:Bag
+    a rdfs:Class ;
+    rdfs:comment "The class of unordered containers." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Bag" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:List
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF Lists." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "List" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:PlainLiteral
+    a rdfs:Datatype ;
+    rdfs:comment "The class of plain (i.e. untyped) literal values." ;
+    rdfs:isDefinedBy <http://www.w3.org/TR/rdf-plain-literal/> ;
+    rdfs:label "PlainLiteral" ;
+    rdfs:subClassOf rdfs:Literal .
+
+rdf:Property
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF properties." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Property" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:Seq
+    a rdfs:Class ;
+    rdfs:comment "The class of ordered containers." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Seq" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:Statement
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF statements." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Statement" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:XMLLiteral
+    a rdfs:Datatype ;
+    rdfs:comment "The class of XML literal values." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "XMLLiteral" ;
+    rdfs:subClassOf rdfs:Literal .
+
+rdf:first
+    a rdf:Property ;
+    rdfs:comment "The first item in the subject RDF list." ;
+    rdfs:domain rdf:List ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "first" ;
+    rdfs:range rdfs:Resource .
+
+rdf:nil
+    a rdf:List ;
+    rdfs:comment "The empty list, with no items in it. If the rest of a list is nil then the list has no more items in it." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "nil" .
+
+rdf:object
+    a rdf:Property ;
+    rdfs:comment "The object of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "object" ;
+    rdfs:range rdfs:Resource .
+
+rdf:predicate
+    a rdf:Property ;
+    rdfs:comment "The predicate of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "predicate" ;
+    rdfs:range rdfs:Resource .
+
+rdf:rest
+    a rdf:Property ;
+    rdfs:comment "The rest of the subject RDF list after the first item." ;
+    rdfs:domain rdf:List ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "rest" ;
+    rdfs:range rdf:List .
+
+rdf:subject
+    a rdf:Property ;
+    rdfs:comment "The subject of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "subject" ;
+    rdfs:range rdfs:Resource .
+
+rdf:type
+    a rdf:Property ;
+    rdfs:comment "The subject is an instance of a class." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "type" ;
+    rdfs:range rdfs:Class .
+
+rdf:value
+    a rdf:Property ;
+    rdfs:comment "Idiomatic property used for structured values." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "value" ;
+    rdfs:range rdfs:Resource .
+
+<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    dc:description "This is the RDF Schema for the RDF vocabulary defined in the RDF namespace."@en ;
+    dc:title "The RDF Vocabulary (RDF)"@en ;
+    a owl:Ontology ;
+    rdfs:seeAlso <http://www.w3.org/2000/01/rdf-schema-more> .
+
+rdf:Alt
+    a rdfs:Class ;
+    rdfs:comment "The class of containers of alternatives." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Alt" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:Bag
+    a rdfs:Class ;
+    rdfs:comment "The class of unordered containers." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Bag" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:List
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF Lists." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "List" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:PlainLiteral
+    a rdfs:Datatype ;
+    rdfs:comment "The class of plain (i.e. untyped) literal values." ;
+    rdfs:isDefinedBy <http://www.w3.org/TR/rdf-plain-literal/> ;
+    rdfs:label "PlainLiteral" ;
+    rdfs:subClassOf rdfs:Literal .
+
+rdf:Property
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF properties." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Property" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:Seq
+    a rdfs:Class ;
+    rdfs:comment "The class of ordered containers." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Seq" ;
+    rdfs:subClassOf rdfs:Container .
+
+rdf:Statement
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF statements." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "Statement" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdf:XMLLiteral
+    a rdfs:Datatype ;
+    rdfs:comment "The class of XML literal values." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "XMLLiteral" ;
+    rdfs:subClassOf rdfs:Literal .
+
+rdf:first
+    a rdf:Property ;
+    rdfs:comment "The first item in the subject RDF list." ;
+    rdfs:domain rdf:List ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "first" ;
+    rdfs:range rdfs:Resource .
+
+rdf:nil
+    a rdf:List ;
+    rdfs:comment "The empty list, with no items in it. If the rest of a list is nil then the list has no more items in it." ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "nil" .
+
+rdf:object
+    a rdf:Property ;
+    rdfs:comment "The object of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "object" ;
+    rdfs:range rdfs:Resource .
+
+rdf:predicate
+    a rdf:Property ;
+    rdfs:comment "The predicate of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "predicate" ;
+    rdfs:range rdfs:Resource .
+
+rdf:rest
+    a rdf:Property ;
+    rdfs:comment "The rest of the subject RDF list after the first item." ;
+    rdfs:domain rdf:List ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "rest" ;
+    rdfs:range rdf:List .
+
+rdf:subject
+    a rdf:Property ;
+    rdfs:comment "The subject of the subject RDF statement." ;
+    rdfs:domain rdf:Statement ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "subject" ;
+    rdfs:range rdfs:Resource .
+
+rdf:type
+    a rdf:Property ;
+    rdfs:comment "The subject is an instance of a class." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "type" ;
+    rdfs:range rdfs:Class .
+
+rdf:value
+    a rdf:Property ;
+    rdfs:comment "Idiomatic property used for structured values." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ;
+    rdfs:label "value" ;
+    rdfs:range rdfs:Resource .
diff --git a/htsworkflow/util/schemas/rdfs.turtle b/htsworkflow/util/schemas/rdfs.turtle
new file mode 100644 (file)
index 0000000..dc642f4
--- /dev/null
@@ -0,0 +1,130 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix grddl: <http://www.w3.org/2003/g/data-view#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix vs: <http://www.w3.org/2003/06/sw-vocab-status/ns#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+@prefix wot: <http://xmlns.com/wot/0.1/> .
+
+<http://www.w3.org/2000/01/rdf-schema#>
+    dc:title "The RDF Schema vocabulary (RDFS)" ;
+    a owl:Ontology ;
+    rdfs:seeAlso <http://www.w3.org/2000/01/rdf-schema-more> .
+
+rdfs:Class
+    a rdfs:Class ;
+    rdfs:comment "The class of classes." ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "Class" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdfs:Container
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF containers." ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "Container" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdfs:ContainerMembershipProperty
+    a rdfs:Class ;
+    rdfs:comment """The class of container membership properties, rdf:_1, rdf:_2, ...,
+                    all of which are sub-properties of 'member'.""" ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "ContainerMembershipProperty" ;
+    rdfs:subClassOf rdf:Property .
+
+rdfs:Datatype
+    a rdfs:Class ;
+    rdfs:comment "The class of RDF datatypes." ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "Datatype" ;
+    rdfs:subClassOf rdfs:Class .
+
+rdfs:Literal
+    a rdfs:Class ;
+    rdfs:comment "The class of literal values, eg. textual strings and integers." ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "Literal" ;
+    rdfs:subClassOf rdfs:Resource .
+
+rdfs:Resource
+    a rdfs:Class ;
+    rdfs:comment "The class resource, everything." ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "Resource" .
+
+rdfs:comment
+    a rdf:Property ;
+    rdfs:comment "A description of the subject resource." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "comment" ;
+    rdfs:range rdfs:Literal .
+
+rdfs:domain
+    a rdf:Property ;
+    rdfs:comment "A domain of the subject property." ;
+    rdfs:domain rdf:Property ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "domain" ;
+    rdfs:range rdfs:Class .
+
+rdfs:isDefinedBy
+    a rdf:Property ;
+    rdfs:comment "The definition of the subject resource." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "isDefinedBy" ;
+    rdfs:range rdfs:Resource ;
+    rdfs:subPropertyOf rdfs:seeAlso .
+
+rdfs:label
+    a rdf:Property ;
+    rdfs:comment "A human-readable name for the subject." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "label" ;
+    rdfs:range rdfs:Literal .
+
+rdfs:member
+    a rdf:Property ;
+    rdfs:comment "A member of the subject resource." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "member" ;
+    rdfs:range rdfs:Resource .
+
+rdfs:range
+    a rdf:Property ;
+    rdfs:comment "A range of the subject property." ;
+    rdfs:domain rdf:Property ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "range" ;
+    rdfs:range rdfs:Class .
+
+rdfs:seeAlso
+    a rdf:Property ;
+    rdfs:comment "Further information about the subject resource." ;
+    rdfs:domain rdfs:Resource ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "seeAlso" ;
+    rdfs:range rdfs:Resource .
+
+rdfs:subClassOf
+    a rdf:Property ;
+    rdfs:comment "The subject is a subclass of a class." ;
+    rdfs:domain rdfs:Class ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "subClassOf" ;
+    rdfs:range rdfs:Class .
+
+rdfs:subPropertyOf
+    a rdf:Property ;
+    rdfs:comment "The subject is a subproperty of a property." ;
+    rdfs:domain rdf:Property ;
+    rdfs:isDefinedBy <http://www.w3.org/2000/01/rdf-schema#> ;
+    rdfs:label "subPropertyOf" ;
+    rdfs:range rdf:Property .
index 34c3200a909de310c8fb9dd48bcb04605aa8a3c9..948bcf407cf976eea74c44b0ff095b475f02770e 100644 (file)
@@ -6,14 +6,22 @@ import types
 from datetime import datetime
 
 from htsworkflow.util.rdfhelp import \
+     add_default_schemas, \
      blankOrUri, \
+     dcNS, \
      dump_model, \
      fromTypedNode, \
      get_model, \
+     guess_parser, \
+     guess_parser_by_extension, \
      load_string_into_model, \
+     owlNS, \
+     rdfNS, \
      rdfsNS, \
+     remove_schemas, \
      toTypedNode, \
-     simplifyUri, \
+     stripNamespace, \
+     simplify_uri, \
      sanitize_literal, \
      xsdNS
 
@@ -102,27 +110,49 @@ try:
             self.assertEqual(fromTypedNode(toTypedNode(long_datetime)),
                              long_datetime)
 
-        def test_simplify_uri(self):
+        def test_strip_namespace_uri(self):
             nsOrg = RDF.NS('example.org/example#')
             nsCom = RDF.NS('example.com/example#')
 
             term = 'foo'
             node = nsOrg[term]
-            self.failUnlessEqual(simplifyUri(nsOrg, node), term)
-            self.failUnlessEqual(simplifyUri(nsCom, node), None)
-            self.failUnlessEqual(simplifyUri(nsOrg, node.uri), term)
+            self.failUnlessEqual(stripNamespace(nsOrg, node), term)
+            self.failUnlessEqual(stripNamespace(nsCom, node), None)
+            self.failUnlessEqual(stripNamespace(nsOrg, node.uri), term)
 
-        def test_simplify_uri_exceptions(self):
+        def test_strip_namespace_exceptions(self):
             nsOrg = RDF.NS('example.org/example#')
             nsCom = RDF.NS('example.com/example#')
 
             node = toTypedNode('bad')
-            self.failUnlessRaises(ValueError, simplifyUri, nsOrg, node)
-            self.failUnlessRaises(ValueError, simplifyUri, nsOrg, nsOrg)
+            self.failUnlessRaises(ValueError, stripNamespace, nsOrg, node)
+            self.failUnlessRaises(ValueError, stripNamespace, nsOrg, nsOrg)
+
+        def test_simplify_uri(self):
+            DATA = [('http://asdf.org/foo/bar', 'bar'),
+                    ('http://asdf.org/foo/bar#bleem', 'bleem'),
+                    ('http://asdf.org/foo/bar/', 'bar'),
+                    ('http://asdf.org/foo/bar?was=foo', 'was=foo')]
+
+            for uri, expected in DATA:
+                self.assertEqual(simplify_uri(uri), expected)
+
+            for uri, expected in DATA:
+                n = RDF.Uri(uri)
+                self.assertEqual(simplify_uri(n), expected)
+
+            for uri, expected in DATA:
+                n = RDF.Node(RDF.Uri(uri))
+                self.assertEqual(simplify_uri(n), expected)
+
+            # decoding literals is questionable
+            n = toTypedNode('http://foo/bar')
+            self.assertRaises(ValueError, simplify_uri, n)
 
         def test_owl_import(self):
             path, name = os.path.split(__file__)
-            loc = 'file://'+os.path.abspath(path)+'/'
+            #loc = 'file://'+os.path.abspath(path)+'/'
+            loc = os.path.abspath(path)+'/'
             model = get_model()
             fragment = '''
 @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@@ -172,6 +202,59 @@ _:a owl:imports "{loc}extra.turtle" .
             hostile_result = """hi <b>there</b>"""
             self.failUnlessEqual(str(hostile_sanitized), hostile_result)
 
+        def test_guess_parser_from_file(self):
+            DATA = [
+                ('/a/b/c.rdf', 'rdfxml'),
+                ('/a/b/c.xml', 'rdfxml'),
+                ('/a/b/c.html', 'rdfa'),
+                ('/a/b/c.turtle', 'turtle'),
+                ('http://foo.bar/bleem.turtle', 'turtle')]
+            for path, parser in DATA:
+                self.assertEqual(guess_parser_by_extension(path), parser)
+                self.assertEqual(guess_parser(None, path), parser)
+
+            DATA = [
+                ('application/rdf+xml', 'http://a.org/b/c', 'rdfxml'),
+                ('application/x-turtle', 'http://a.org/b/c', 'turtle'),
+                ('text/html', 'http://a.org/b/c', 'rdfa'),
+                ('text/html', 'http://a.org/b/c.html', 'rdfa'),
+                ('text/plain', 'http://a.org/b/c.turtle', 'turtle'),
+                ('text/plain', 'http://a.org/b/c', 'guess')
+            ]
+            for contenttype, url, parser in DATA:
+                self.assertEqual(guess_parser(contenttype, url), parser)
+
+    class TestRDFSchemas(unittest.TestCase):
+        def test_rdf_schema(self):
+            """Does it basically work?
+            """
+            model = get_model()
+            self.assertEqual(model.size(), 0)
+            add_default_schemas(model)
+            self.assertGreater(model.size(), 0)
+            remove_schemas(model)
+            self.assertEqual(model.size(), 0)
+
+        def test_included_schemas(self):
+            model = get_model()
+            add_default_schemas(model)
+
+            # rdf test
+            s = RDF.Statement(rdfNS[''], dcNS['title'], None)
+            title = model.get_target(rdfNS[''], dcNS['title'])
+            self.assertTrue(title is not None)
+
+            s = RDF.Statement(rdfNS['Property'], rdfNS['type'], rdfsNS['Class'])
+            self.assertTrue(model.contains_statement(s))
+
+            # rdfs test
+            s = RDF.Statement(rdfsNS['Class'], rdfNS['type'], rdfsNS['Class'])
+            self.assertTrue(model.contains_statement(s))
+
+            s = RDF.Statement(owlNS['inverseOf'], rdfNS['type'],
+                              rdfNS['Property'])
+            self.assertTrue(model.contains_statement(s))
+
 
     def suite():
         return unittest.makeSuite(TestRDFHelp, 'test')
diff --git a/htsworkflow/util/test/test_rdfinfer.py b/htsworkflow/util/test/test_rdfinfer.py
new file mode 100644 (file)
index 0000000..4ed2316
--- /dev/null
@@ -0,0 +1,196 @@
+import unittest
+
+import RDF
+
+from htsworkflow.util.rdfhelp import get_model, \
+     add_default_schemas, add_schema, load_string_into_model, dump_model
+from htsworkflow.util.rdfns import *
+from htsworkflow.util.rdfinfer import Infer
+
+foafNS = RDF.NS('http://xmlns.com/foaf/0.1/')
+
+MINI_FOAF_ONTOLOGY = """
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+
+
+foaf:Agent
+     a rdfs:Class, owl:Class ;
+     rdfs:comment "An agent (person, group, software or physical artifact)."@en;
+     rdfs:label "Agent" .
+
+foaf:Person
+     a rdfs:Class, owl:Class, foaf:Agent ;
+     rdfs:label "Person" .
+
+foaf:age
+     a rdf:Property, owl:DatatypeProperty, owl:FunctionalProperty ;
+     rdfs:comment "The age in years of some agent." ;
+     rdfs:domain foaf:Agent ;
+     rdfs:label "age";
+     rdfs:range rdfs:Literal .
+
+foaf:familyName
+     a rdf:Property, owl:DatatypeProperty ;
+     rdfs:comment "Family name of some person." ;
+     rdfs:label "familyName" ;
+     rdfs:domain foaf:Person ;
+     rdfs:range rdfs:Literal .
+
+foaf:firstName
+     a rdf:Property, owl:DatatypeProperty ;
+     rdfs:comment "the first name of a person." ;
+     rdfs:domain foaf:Person ;
+     rdfs:label "firstname" ;
+     rdfs:range rdfs:Literal .
+
+foaf:Document
+     a rdfs:Class, owl:Class ;
+     rdfs:comment "A document." .
+
+foaf:Image
+     a rdfs:Class, owl:Class ;
+     rdfs:comment "An image." ;
+     rdfs:subClassOf foaf:Document .
+
+foaf:depicts
+     a rdf:Property, owl:ObjectProperty ;
+     rdfs:comment "A thing depicted in this representation." ;
+     rdfs:domain foaf:Image ;
+     rdfs:range owl:Thing ;
+     owl:inverseOf foaf:depiction .
+
+foaf:depiction
+     a rdf:Property, owl:ObjectProperty ;
+     rdfs:comment "Depiction of some thing." ;
+     rdfs:domain owl:Thing ;
+     rdfs:range foaf:Image ;
+     owl:inverseOf foaf:depicts .
+"""
+
+FOAF_DATA = """
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix foaf: <http://xmlns.com/foaf/0.1/> .
+
+_:me
+     foaf:firstName "Diane" ;
+     foaf:familyName "Trout" ;
+     a foaf:Person, owl:Thing ;
+     <http://example.org/other_literal> "value" ;
+     <http://example.org/other_resource> <http://example.org/resource> .
+
+<http://example.org/me.jpg>
+     a foaf:Image, owl:Thing ;
+     foaf:depicts _:me .
+"""
+
+class TestInfer(unittest.TestCase):
+    def setUp(self):
+        self.model = get_model()
+        add_default_schemas(self.model)
+        load_string_into_model(self.model, 'turtle', MINI_FOAF_ONTOLOGY)
+
+    def test_class(self):
+        fooNS = RDF.NS('http://example.org/')
+        load_string_into_model(self.model, 'turtle', FOAF_DATA)
+        inference = Infer(self.model)
+
+        s = RDF.Statement(fooNS['me.jpg'], rdfNS['type'], rdfsNS['Class'])
+        found = list(self.model.find_statements(s))
+        self.assertEqual(len(found), 0)
+        inference._rule_class()
+        s = RDF.Statement(fooNS['me.jpg'], rdfNS['type'], rdfsNS['Class'])
+        found = list(self.model.find_statements(s))
+        self.assertEqual(len(found), 1)
+
+    def test_inverse_of(self):
+        fooNS = RDF.NS('http://example.org/')
+        load_string_into_model(self.model, 'turtle', FOAF_DATA)
+        inference = Infer(self.model)
+        depiction = RDF.Statement(None,
+                                  foafNS['depiction'],
+                                  fooNS['me.jpg'])
+        size = self.model.size()
+        found_statements = list(self.model.find_statements(depiction))
+        self.assertEqual(len(found_statements), 0)
+        inference._rule_inverse_of()
+        found_statements = list(self.model.find_statements(depiction))
+        self.assertEqual(len(found_statements), 1)
+
+        # we should've added one statement.
+        self.assertEqual(self.model.size(), size + 1)
+
+        size = self.model.size()
+        inference._rule_inverse_of()
+        # we should already have both versions in our model
+        self.assertEqual(self.model.size(), size)
+
+    def test_validate_types(self):
+        fooNS = RDF.NS('http://example.org/')
+        load_string_into_model(self.model, 'turtle', FOAF_DATA)
+        inference = Infer(self.model)
+
+        errors = list(inference._validate_types())
+        self.assertEqual(len(errors), 0)
+
+        s = RDF.Statement(fooNS['document'],
+                          dcNS['title'],
+                          RDF.Node("bleem"))
+        self.model.append(s)
+        errors = list(inference._validate_types())
+        self.assertEqual(len(errors), 1)
+
+    def test_validate_undefined_properties(self):
+        fooNS = RDF.NS('http://example.org/')
+        inference = Infer(self.model)
+
+        errors = list(inference._validate_undefined_properties())
+        self.assertEqual(len(errors), 0)
+
+        load_string_into_model(self.model, 'turtle', FOAF_DATA)
+
+        errors = list(inference._validate_undefined_properties())
+        self.assertEqual(len(errors), 2)
+
+
+    def test_validate_property_types(self):
+        fooNS = RDF.NS('http://example.org/')
+        foafNS = RDF.NS('http://xmlns.com/foaf/0.1/')
+        load_string_into_model(self.model, 'turtle', FOAF_DATA)
+        inference = Infer(self.model)
+
+        errors = list(inference._validate_property_types())
+        self.assertEqual(len(errors), 0)
+
+        s = RDF.Statement(fooNS['me.jpg'],
+                          foafNS['firstName'],
+                          RDF.Node("name"))
+        self.model.append(s)
+        errors = list(inference._validate_property_types())
+        self.assertEqual(len(errors), 1)
+        self.assertTrue(errors[0].startswith('Domain of http://example.org'))
+        del self.model[s]
+
+        errors = list(inference._validate_property_types())
+        self.assertEqual(len(errors), 0)
+        s = RDF.Statement(fooNS['foo.txt'], rdfNS['type'], foafNS['Document'])
+        self.model.append(s)
+        s = RDF.Statement(fooNS['me.jpg'],
+                          foafNS['depicts'],
+                          foafNS['foo.txt'])
+        self.model.append(s)
+
+        errors = list(inference._validate_property_types())
+        self.assertEqual(len(errors), 1)
+        self.assertTrue(errors[0].startswith('Range of http://example.org'))
+        del self.model[s]
+
+def suite():
+    return unittest.makeSuite(TestInfer, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
index 0c1a203d01999d2c8af669aae16642d8012ca768..5bbd5e9236b81f7bb57bb9896405d5de9215562e 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-from setuptools import setup
+from setuptools import setup, find_packages
 from version import get_git_version
 
 setup(
@@ -7,19 +7,7 @@ setup(
     description="Utilities to help manage high-through-put sequencing",
     author="Diane Trout, Brandon King",
     author_email="diane@caltech.edu",
-    packages=["htsworkflow", 
-              "htsworkflow.automation",
-              "htsworkflow.pipelines",
-              "htsworkflow.util",
-              # django site
-              "htsworkflow.frontend",
-              "htsworkflow.frontend.analysis",
-              "htsworkflow.frontend.eland_config",
-              "htsworkflow.frontend.experiments",
-              "htsworkflow.frontend.inventory",
-              "htsworkflow.frontend.reports",
-              "htsworkflow.frontend.samples",
-              ],
+    packages=find_packages(),
     scripts=[
         "scripts/htsw-copier",
         "scripts/htsw-eland2bed",
@@ -36,4 +24,7 @@ setup(
         "scripts/htsw-update-archive",
         "scripts/htsw-validate",
         ],
+    package_data = {
+        '': ['*.turtle']
+        },
     )