Further attempts to validate RDF models.

author Diane Trout <diane@caltech.edu>

Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)

committer Diane Trout <diane@caltech.edu>

Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)
author Diane Trout <diane@caltech.edu>
Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)
committer Diane Trout <diane@caltech.edu>
Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index acd100575744c94984d7b4b1ab5074188145844e..87212dddbb0acda92894cf49fec9937b7f2d9e21 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -453,6 +453,10 @@ def update_model_sequence_library(model, base_url):
                  library = guess_library_from_model(model, base_url,
                                                     flowcell,
                                                     lane_id)
+                if library is None:
+                    LOGGER.error("Unable to decypher: %s %s",
+                                 str(flowcell), str(lane_id))
+                    continue
                  library_id = toTypedNode(simplify_uri(library))
                  LOGGER.debug("Adding file (%s) to library (%s) link",
                               str(filenode),
@@ -478,11 +482,13 @@ def guess_library_from_model(model, base_url, flowcell, lane_id):
      where {{
        <{flowcell}> libns:has_lane ?lane ;
                     a libns:IlluminaFlowcell .
-      ?lane libns:lane_number "{lane_id}" ;
+      ?lane libns:lane_number ?lane_id ;
              libns:library ?library .
+      FILTER(str(?lane_id) = "{lane_id}")
      }}
      """
      lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id)
+    LOGGER.debug("guess_library_from_model: %s", lane_body)
      lanes = []
      tries = 3
      while len(lanes) == 0 and tries > 0:
@@ -503,5 +509,3 @@ def guess_library_from_model(model, base_url, flowcell, lane_id):
          else:
              # try grabbing data
              model.load(flowcellNode.uri, name="rdfa")
-
-
diff --git a/htsworkflow/util/rdfinfer.py b/htsworkflow/util/rdfinfer.py

index b36fe6293883a394ada8f005b37d78503acd79e2..baaa2e4bd3016119d0bebba0d43faa1b57085287 100644 (file)
--- a/htsworkflow/util/rdfinfer.py
+++ b/htsworkflow/util/rdfinfer.py
@@ -8,6 +8,7 @@ from htsworkflow.util.rdfns import *
  from htsworkflow.util.rdfhelp import SCHEMAS_URL
  
  INFER_URL='http://jumpgate.caltech.edu/phony/infer'
+LOGGER = logging.getLogger(__name__)
  
  class Infer(object):
      """Provide some simple inference.
@@ -36,6 +37,7 @@ class Infer(object):
  
              for method_name in dir(self):
                  if method_name.startswith('_rule_'):
+                    LOGGER.info("Running: %s", method_name)
                      method = getattr(self, method_name)
                      method()
              if self.model.size() == starting_size:
@@ -55,6 +57,7 @@ class Infer(object):
          """
          for method_name in dir(self):
              if method_name.startswith('_validate_'):
+                LOGGER.info("Running: %s", method_name)
                  method = getattr(self, method_name)
                  for msg in method():
                      yield msg
@@ -143,7 +146,7 @@ class Infer(object):
          query = RDF.SPARQLQuery(body)
          errmsg = "Missing type for: {0}"
          for r in query.execute(self.model):
-            yield errmsg.format(str(r['subject'].uri))
+            yield errmsg.format(str(r['subject']))
  
      def _validate_undefined_properties(self):
          """Find properties that aren't defined.
@@ -185,15 +188,32 @@ class Infer(object):
              e.g. is a subject (node) the domain (space) of this property
              and is the object (node) the range of of this property.
              """
+            resource_error = "Expected resource for {0} in range {1}"
+            type_error = "Type of {0} was {1} not {2}"
              # check domain
              query = RDF.SPARQLQuery(property_template.format(
                  predicate=predicate.uri,
                  space=space))
-            seen = []
+            seen = set()
              for r in query.execute(self.model):
+                # Make sure we have a resource if we're expecting one
                  if r['type'] == rdfsNS['Resource']:
+                    if not node.is_resource():
+                        return resource_error.format(str(node), space)
                      continue
-                seen.append(str(r['type'].uri))
+                seen.add(str(r['type'].uri))
+                if node.is_literal():
+                    # literal is a generic type.
+                    nodetype = node.literal_value['datatype']
+                    if nodetype is None:
+                        # lets default to string
+                        nodetype = xsdNS['string'].uri
+                    if r['type'] == rdfsNS['Literal']:
+                        pass
+                    elif nodetype != r['type'].uri:
+                        return type_error.format(
+                            str(node), nodetype, r['type'])
+                # check that node is the expected class type
                  check = RDF.Statement(node, rdfNS['type'], r['type'])
                  if self.model.contains_statement(check):
                      return
@@ -221,5 +241,3 @@ class Infer(object):
                                     wrong_range_type.format(str(s)))
              if msg is not None: yield msg
          return
-
-
diff --git a/htsworkflow/util/schemas/htsworkflow.turtle b/htsworkflow/util/schemas/htsworkflow.turtle

index 92ed6e64e1dca4b62a47b9b5cd086aba9520a760..12c1f55cfdc98b321f9780dfd37f514375ac6ad2 100644 (file)
--- a/htsworkflow/util/schemas/htsworkflow.turtle
+++ b/htsworkflow/util/schemas/htsworkflow.turtle
@@ -14,10 +14,12 @@
      dc:title "HTS-Workflow ontology" ;
      a owl:Ontology .
  
-htswlib:Class a rdfs:Class .
+htswlib:Class rdfs:subClassOf rdfs:Class ;
+              a rdfs:Class .
+rdfs:Resource a rdfs:Class.
  
  htswlib:IlluminaFlowcell
-    a rdfs:Class, htswlib:Class ;
+    a rdfs:Class, htswlib:Class;
      rdfs:comment "information about a illumina flowcell" ;
      rdfs:label "Flowcell" .
  
@@ -69,7 +71,7 @@ htswlib:date
      rdfs:label "made on" ;
      rdfs:domain htswlib:IlluminaFlowcell ;
      rdfs:domain htswlib:Library ;
-    rdfs:range rdfs:Literal .
+    rdfs:range xsd:dateTime .
  
  htswlib:total_unique_locations
      a rdf:Property ;
@@ -77,7 +79,7 @@ htswlib:total_unique_locations
      rdfs:label "Unique locations" ;
      rdfs:domain htswlib:Library ;
      rdfs:domain htswlib:IlluminaLane ;
-    rdfs:range rdfs:Literal .
+    rdfs:range xsd:integer .
  
  htswlib:has_mappings
      a rdf:Property ;
@@ -272,7 +274,7 @@ htswlib:gel_cut
      rdfs:comment "The estimated fragment sizes cut from gel";
      rdfs:label "Gel Cut" ;
      rdfs:domain htswlib:Library ;
-    rdfs:range rdfs:Literal .
+    rdfs:range xsd:decimal .
  
  htswlib:made_by
      a rdf:Property ;
@@ -330,7 +332,7 @@ htswlib:lane_number
      rdfs:comment "Which lane were we run in" ;
      rdfs:label "lane id" ;
      rdfs:domain htswlib:IlluminaLane ;
-    rdfs:range rdfs:Literal .
+    rdfs:range xsd:string .
  
  # FIXME: should this be note?
  htswlib:comment
diff --git a/scripts/rdfcheck.py b/scripts/rdfcheck.py

new file mode 100644 (file)

index 0000000..7f5a6d1
--- /dev/null
+++ b/scripts/rdfcheck.py
@@ -0,0 +1,30 @@
+from argparse import ArgumentParser
+import logging
+from htsworkflow.util import rdfhelp, rdfinfer
+
+def main(cmdline=None):
+    """Validate the RDF sources named on the command line.
+
+    cmdline is the argument list handed to the parser; when it is None
+    argparse reads the process arguments instead.
+    """
+    options = make_parser().parse_args(cmdline)
+
+    # INFO level, set up once the arguments have been parsed.
+    logging.basicConfig(level=logging.INFO)
+
+    validate_urls(options.urls)
+
+def make_parser():
+    """Build the command line parser.
+
+    The parser accepts zero or more positional arguments, collected
+    into the ``urls`` attribute of the parsed namespace.
+    """
+    cmdline_parser = ArgumentParser()
+    cmdline_parser.add_argument('urls', nargs='*')
+    return cmdline_parser
+
+def validate_urls(urls):
+    """Load every URL into a fresh model and report validation messages.
+
+    urls is an iterable of locations; each one is passed to
+    rdfhelp.load_into_model after the default schemas have been added.
+    Every message produced by the inference engine's validate() is
+    printed. Nothing is returned.
+    """
+    model = rdfhelp.get_model()
+    rdfhelp.add_default_schemas(model)
+
+    for u in urls:
+        rdfhelp.load_into_model(model, None, u)
+
+    engine = rdfinfer.Infer(model)
+    # The inference pass is left disabled, as before.
+    #engine.think()
+    # validate() yields its messages lazily: the checks only run while
+    # the generator is being iterated, so a bare call reports nothing.
+    for msg in engine.validate():
+        print(msg)
+
+# Run the checker only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
author	Diane Trout <diane@caltech.edu>
	Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)
committer	Diane Trout <diane@caltech.edu>
	Fri, 16 Nov 2012 00:01:04 +0000 (16:01 -0800)
htsworkflow/pipelines/sequences.py		patch \| blob \| history
htsworkflow/util/rdfinfer.py		patch \| blob \| history
htsworkflow/util/schemas/htsworkflow.turtle		patch \| blob \| history
scripts/rdfcheck.py	[new file with mode: 0644]	patch \| blob