Stop copying attributes from library to a submission node
authorDiane Trout <diane@caltech.edu>
Wed, 20 Jul 2011 00:45:01 +0000 (17:45 -0700)
committerDiane Trout <diane@caltech.edu>
Wed, 20 Jul 2011 00:45:01 +0000 (17:45 -0700)
Instead just query the library directly.

This involved updating the sparql query in make_ddf a bit.

Associated with doing that I also changed when I import
data from the RDFa pages, so I only connect to the web server
once per library, instead of once per scanned file per attribute.

That provided a significant performance improvement.

extra/ucsc_encode_submission/ucsc_gather.py
htsworkflow/submission/daf.py
htsworkflow/submission/test/test_daf.py

index 37169c075d7c5dbdfefa908a86247917c92e0aae..f7cdcf71b938d06306264ec296fa8ff4bf472ffe 100755 (executable)
@@ -40,6 +40,7 @@ logger = logging.getLogger('ucsc_gather')
 def main(cmdline=None):
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
+    submission_uri = None
     
     if opts.debug:
         logging.basicConfig(level = logging.DEBUG )
@@ -53,12 +54,14 @@ def main(cmdline=None):
     model = get_model(opts.load_model)
     if opts.name:
         mapper = DAFMapper(opts.name, opts.daf,  model)
+        if opts.library_url is not None:
+            mapper.library_url = opts.library_url
         submission_uri = get_submission_uri(opts.name)
-
-    if opts.library_url is not None:
-        mapper.library_url = opts.library_url
         
+
     if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
         load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
 
     if opts.make_ddf and opts.daf is None:
@@ -210,7 +213,7 @@ def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None)
 PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 
-select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?controlId ?labExpId ?labVersion ?treatment ?protocol
+select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength
 WHERE {
   ?file ucscDaf:filename ?files ;
         ucscDaf:md5sum ?md5sum .
@@ -218,17 +221,19 @@ WHERE {
               ucscDaf:view ?dafView ;
               ucscDaf:submission <%(submission)s> .
   ?dafView ucscDaf:name ?view .
-  <%(submission)s> submissionOntology:library ?library .
-
-  OPTIONAL { ?submitView ucscDaf:antibody ?antibody }
-  OPTIONAL { ?submitView ucscDaf:cell ?cell }
-  OPTIONAL { ?submitView ucscDaf:control ?control }
-  OPTIONAL { ?library ucscDaf:controlId ?controlId }
-  OPTIONAL { ?submitView ucscDaf:sex ?sex }
-  OPTIONAL { ?submitView ucscDaf:labVersion ?labExpId }
-  OPTIONAL { ?submitView ucscDaf:labVersion ?labVersion }
-  OPTIONAL { ?library ucscDaf:treatment ?treatment }
-  OPTIONAL { ?submitView ucscDaf:protocol ?protocol }
+  <%(submission)s> submissionOntology:library ?library ;
+
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell }
+  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
+  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
+  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
+  OPTIONAL { ?library libraryOntology:condition ?treatment }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
 }
 ORDER BY  ?submitView""" 
     dag_fragments = []
@@ -253,7 +258,8 @@ ORDER BY  ?submitView"""
     variables = ['files']
     # filename goes first
     variables.extend(view_map.get_daf_variables())
-    variables += ['controlId', 'labExpId', 'md5sum']
+    # 'controlId',
+    variables += [ 'labExpId', 'md5sum']
     output.write('\t'.join(variables))
     output.write(os.linesep)
     
@@ -337,7 +343,7 @@ queue
     for f in files:
         pathname = os.path.join(outdir, f)
         if not os.path.exists(pathname):
-            raise RuntimeError("Missing %s" % (f,))
+            raise RuntimeError("Missing %s from %s" % (f,outdir))
 
     context = {'archivename': make_submission_name(name),
                'filelist': " ".join(files),
index 75c8868ea9f0716f0cfc775853d144cc556eac93..1c19339243290f77af81c9a7006d011894d2ce84 100644 (file)
@@ -236,6 +236,8 @@ class DAFMapper(object):
         #attributes = get_filename_attribute_map(paired)
         libNode = self.libraryNS[library_id + "/"]
         
+        self._add_library_details_to_model(libNode)
+        
         submission_files = os.listdir(submission_dir)
         for f in submission_files:
             self.construct_file_attributes(submission_dir, libNode, f)
@@ -250,6 +252,7 @@ class DAFMapper(object):
         """
         path, filename = os.path.split(pathname)
 
+        logger.debug("Searching for view")
         view = self.find_view(filename)
         if view is None:
             logger.warn("Unrecognized file: %s" % (pathname,))
@@ -260,17 +263,23 @@ class DAFMapper(object):
         submission_name = self.make_submission_name(submission_dir)
         submissionNode = self.get_submission_node(submission_dir)
         submission_uri = str(submissionNode.uri)
-        view_name = str(fromTypedNode(self.model.get_target(view, dafTermOntology['name'])))
+        view_name = fromTypedNode(self.model.get_target(view, dafTermOntology['name']))
+        if view_name is None:
+            logging.warning('Could not find view name for {0}'.format(str(view)))
+            return
+        
+        view_name = str(view_name)
         submissionView = RDF.Node(RDF.Uri(submission_uri + '/' + view_name))
 
         self.model.add_statement(
             RDF.Statement(self.submissionSet, dafTermOntology['has_submission'], submissionNode))
-
+        logger.debug("Adding statements to {0}".format(str(submissionNode)))
         self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['has_view'], submissionView))
         self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['name'], toTypedNode(submission_name)))
         self.model.add_statement(RDF.Statement(submissionNode, rdfNS['type'], submissionOntology['submission']))
         self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['library'], libNode))
-        
+
+        logger.debug("Adding statements to {0}".format(str(submissionView)))
         # add trac specific information
         self.model.add_statement(
             RDF.Statement(submissionView, dafTermOntology['view'], view))
@@ -285,13 +294,8 @@ class DAFMapper(object):
                  ]
         terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
 
-        # Add everything I can find
-        for term in terms:
-            value = str(self._get_library_attribute(libNode, term))
-            if value is not None:
-                self.model.add_statement(RDF.Statement(submissionView, term, value))
-
         # add file specific information
+        logger.debug("Updating file md5sum")
         fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
         submission_pathname = os.path.join(submission_dir, filename)
         md5 = make_md5sum(submission_pathname)
@@ -306,19 +310,16 @@ class DAFMapper(object):
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
-            
+        logger.debug("Done.")
+        
     def _add_library_details_to_model(self, libNode):
         parser = RDF.Parser(name='rdfa')
         new_statements = parser.parse_as_stream(libNode.uri)
         for s in new_statements:
             # don't override things we already have in the model
-            q = RDF.Statement(s.subject, s.predicate, None)
-            if len(list(self.model.find_statements(q))) == 0:
+            targets = list(self.model.get_targets(s.subject, s.predicate))
+            if len(targets) == 0:
                 self.model.append(s)
-            
-        statements = list(self.model.find_statements(q))
-        if len(statements) == 0:
-            logger.warning("Nothing known about %s" % (str(libNode),))
 
     def get_daf_variables(self):
         """Returns simple variables names that to include in the ddf
@@ -349,21 +350,25 @@ class DAFMapper(object):
         if not isinstance(attribute, RDF.Node):
             attribute = libraryOntology[attribute]
 
-        # search through the model twice (adding in data from website)
-        for i in xrange(2):
-            targets = list(self.model.get_targets(libNode, attribute))
-            if len(targets) > 0:
-                return self._format_library_attribute(targets)
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
+        else:
+            return None
 
-            targets = self._search_same_as(libNode, attribute)
-            if targets is not None:
-                return self._format_library_attribute(targets)
-            
-            # we don't know anything about this attribute
-            self._add_library_details_to_model(libNode)
+        #targets = self._search_same_as(libNode, attribute)
+        #if targets is not None:
+        #    return self._format_library_attribute(targets)
+        
+        # we don't know anything about this attribute
+        self._add_library_details_to_model(libNode)
+
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
 
         return None
-            
+    
     def _format_library_attribute(self, targets):
         if len(targets) == 0:
             return None
@@ -402,7 +407,17 @@ class DAFMapper(object):
         else:
             return None
         
-
+    def get_view_name(self, view):
+        names = list(self.model.get_targets(view, submissionOntology['view_name']))
+        if len(names) == 1:
+            return fromTypedNode(names[0])
+        else:
+            msg = "Found wrong number of view names for {0} len = {1}"
+            msg = msg.format(str(view), len(names))
+            logger.error(msg)
+            raise RuntimeError(msg)
+        
+            
     def _get_filename_view_map(self):
         """Query our model for filename patterns
 
index e4ee4859928180d77970199431fcfce6b007bc21..913b0813c8bffcf1c37aa33b082ab8f1c69902b2 100644 (file)
@@ -83,14 +83,17 @@ class TestDAF(unittest.TestCase):
         name = model.get_target(signal_view_node, dafTermOntology['name'])
         self.failUnlessEqual(fromTypedNode(name), u'Signal')
 
-def load_daf_mapper(name, extra_statements=None):
+def load_daf_mapper(name, extra_statements=None, ns=None):
     """Load test model in
     """
     model = get_model()
+    if ns is None:
+        ns="http://extra"
+        
     if extra_statements is not None:
         parser = RDF.Parser(name='turtle')
         parser.parse_string_into_model(model, extra_statements,
-                                       'http://extra.extra')
+                                       ns)
         
     test_daf_stream = StringIO(test_daf)
     mapper = daf.DAFMapper(name, daf_file = test_daf_stream, model=model)
@@ -120,30 +123,31 @@ class TestDAFMapper(unittest.TestCase):
         #self.failUnlessEqual(search[0].object.literal_value['string'], pattern)
 
     def test_find_one_view(self):
+        name='testfind'
         extra = '''@prefix dafTerm:<http://jumpgate.caltech.edu/wiki/UcscDaf#> .
+@prefix thisView: <http://jumpgate.caltech.edu/wiki/SubmissionsLog/{0}/view/> .
 
-<%(submissionLog)s/testfind/view/Signal> dafTerm:filename_re ".*\\\\.bam" .
-<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*_r1\\\\.fastq" .
-''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog'}
-
-        daf_mapper = load_daf_mapper('testfind', extra_statements = extra)
+thisView:Signal dafTerm:filename_re ".*\\\\.bam" .
+thisView:FastqRd1 dafTerm:filename_re ".*_r1\\\\.fastq" .
+'''.format(name)
+        daf_mapper = load_daf_mapper(name, extra_statements = extra)
 
         view = daf_mapper.find_view('filename_r1.fastq')
-        self.failUnlessEqual(str(view),
-                             str(submissionLog['testfind/view/FastqRd1']))
-
-        #writer = get_serializer()
-        #turtle =  writer.serialize_model_to_string(model)
-        #print turtle
+        
+        # dump_model(daf_mapper.model)
+        view_root = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/{0}/view/'
+        view_root = view_root.format(name)
+        self.failUnlessEqual(str(view), '<{0}{1}>'.format(view_root,'FastqRd1'))
 
     def test_find_overlapping_view(self):
+        name = 'testfind'
         extra = '''@prefix dafTerm:<http://jumpgate.caltech.edu/wiki/UcscDaf#> .
+@prefix thisView: <http://jumpgate.caltech.edu/wiki/SubmissionsLog/{0}/view/> .
 
-<%(submissionLog)s/testfind/view/fastq> dafTerm:filename_re ".*\\\\.fastq" .
-<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*_r1\\\\.fastq" .
-''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog'}
-
-        daf_mapper = load_daf_mapper('testfind', extra_statements = extra)
+thisView:fastq dafTerm:filename_re ".*\\\\.fastq" .
+thisView:FastqRd1 dafTerm:filename_re ".*_r1\\\\.fastq" .
+'''.format(name)
+        daf_mapper = load_daf_mapper(name, extra_statements = extra)
 
         self.failUnlessRaises(daf.ModelException,
                               daf_mapper.find_view,
@@ -153,13 +157,16 @@ class TestDAFMapper(unittest.TestCase):
         lib_id = '11204'
         lib_url = 'http://jumpgate.caltech.edu/library/%s/' %(lib_id)
         extra = '''@prefix dafTerm: <http://jumpgate.caltech.edu/wiki/UcscDaf#> .
+@prefix submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
+@prefix thisView: <http://jumpgate.caltech.edu/wiki/SubmissionsLog/testfind/view/> .
 @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
 
-<%(submissionLog)s/testfind/view/Signal> dafTerm:filename_re ".*\\\\.bam" .
-<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*\\\\.fastq" .
+thisView:Signal dafTerm:filename_re ".*\\\\.bam" ;
+      submissionOntology:view_name "Signal" .
+thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
+        submissionOntology:view_name "FastqRd1" .
 <%(libUrl)s> <%(libraryOntology)sgel_cut> "100"^^xsd:decimal . 
-''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog',
-       'libraryOntology': 'http://jumpgate.caltech.edu/wiki/LibraryOntology#',
+''' % {'libraryOntology': 'http://jumpgate.caltech.edu/wiki/LibraryOntology#',
        'libUrl': lib_url}
 
         daf_mapper = load_daf_mapper('testfind', extra)
@@ -181,16 +188,18 @@ class TestDAFMapper(unittest.TestCase):
                                                      libNode,
                                                      filename)
             
+        #dump_model(daf_mapper.model)
+        
         sub_root = "http://jumpgate.caltech.edu/wiki/SubmissionsLog/testfind/"
         submission_name = sub_root + analysis_name
         source = daf_mapper.model.get_source(rdfNS['type'], submissionOntology['submission'])
-
         self.failUnlessEqual(str(source.uri), submission_name)
 
         view_name = submission_name + '/Signal'
         view = daf_mapper.model.get_target(source, submissionOntology['has_view'])
         self.failUnlessEqual(str(view.uri), view_name)
 
+        
     def test_library_url(self):
         daf_mapper = load_daf_mapper('urltest')
 
@@ -215,6 +224,7 @@ def mktempfile(suffix='', prefix='tmp', dir=None):
     os.close(fd)
     os.unlink(pathname)
     print "unmade", pathname
+
     
 def suite():
     suite = unittest.makeSuite(TestDAF, 'test')