From ff3269caa0d45e10282952b0d4b8030c9b01cfd1 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Tue, 19 Jul 2011 17:45:01 -0700 Subject: [PATCH] Stop copying attributes from library to a submission node Instead just query the library directly. This involved updating the sparql query in make_ddf a bit. Associated with doing that I also changed when I import data from the RDFa pages, so I only connect to the web server once per library, instead of once per scanned file per attribute. That provided a significant performance improvement. --- extra/ucsc_encode_submission/ucsc_gather.py | 40 ++++++----- htsworkflow/submission/daf.py | 73 +++++++++++++-------- htsworkflow/submission/test/test_daf.py | 56 +++++++++------- 3 files changed, 100 insertions(+), 69 deletions(-) diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py index 37169c0..f7cdcf7 100755 --- a/extra/ucsc_encode_submission/ucsc_gather.py +++ b/extra/ucsc_encode_submission/ucsc_gather.py @@ -40,6 +40,7 @@ logger = logging.getLogger('ucsc_gather') def main(cmdline=None): parser = make_parser() opts, args = parser.parse_args(cmdline) + submission_uri = None if opts.debug: logging.basicConfig(level = logging.DEBUG ) @@ -53,12 +54,14 @@ def main(cmdline=None): model = get_model(opts.load_model) if opts.name: mapper = DAFMapper(opts.name, opts.daf, model) + if opts.library_url is not None: + mapper.library_url = opts.library_url submission_uri = get_submission_uri(opts.name) - - if opts.library_url is not None: - mapper.library_url = opts.library_url + if opts.load_rdf is not None: + if submission_uri is None: + parser.error("Please specify the submission name") load_into_model(model, 'turtle', opts.load_rdf, submission_uri) if opts.make_ddf and opts.daf is None: @@ -210,7 +213,7 @@ def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None) PREFIX submissionOntology: PREFIX ucscDaf: -select ?submitView ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?controlId ?labExpId ?labVersion ?treatment ?protocol +select ?submitView ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength WHERE { ?file ucscDaf:filename ?files ; ucscDaf:md5sum ?md5sum . @@ -218,17 +221,19 @@ WHERE { ucscDaf:view ?dafView ; ucscDaf:submission <%(submission)s> . ?dafView ucscDaf:name ?view . - <%(submission)s> submissionOntology:library ?library . - - OPTIONAL { ?submitView ucscDaf:antibody ?antibody } - OPTIONAL { ?submitView ucscDaf:cell ?cell } - OPTIONAL { ?submitView ucscDaf:control ?control } - OPTIONAL { ?library ucscDaf:controlId ?controlId } - OPTIONAL { ?submitView ucscDaf:sex ?sex } - OPTIONAL { ?submitView ucscDaf:labVersion ?labExpId } - OPTIONAL { ?submitView ucscDaf:labVersion ?labVersion } - OPTIONAL { ?library ucscDaf:treatment ?treatment } - OPTIONAL { ?submitView ucscDaf:protocol ?protocol } + <%(submission)s> submissionOntology:library ?library ; + + OPTIONAL { ?library libraryOntology:antibody ?antibody } + OPTIONAL { ?library libraryOntology:cell_line ?cell } + OPTIONAL { <%(submission)s> ucscDaf:control ?control } + OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId } + OPTIONAL { ?library ucscDaf:sex ?sex } + OPTIONAL { ?library libraryOntology:library_id ?labExpId } + OPTIONAL { ?library libraryOntology:library_id ?labVersion } + OPTIONAL { ?library libraryOntology:condition ?treatment } + OPTIONAL { ?library ucscDaf:protocol ?protocol } + OPTIONAL { ?library ucscDaf:readType ?readType } + OPTIONAL { ?library libraryOntology:insert_size ?insertLength } } ORDER BY ?submitView""" dag_fragments = [] @@ -253,7 +258,8 @@ ORDER BY ?submitView""" variables = ['files'] # filename goes first variables.extend(view_map.get_daf_variables()) - variables += ['controlId', 'labExpId', 'md5sum'] + # 'controlId', + variables += [ 'labExpId', 'md5sum'] output.write('\t'.join(variables)) output.write(os.linesep) @@ -337,7 +343,7 @@ queue for f in files: pathname = os.path.join(outdir, f) if not os.path.exists(pathname): - raise RuntimeError("Missing %s" % (f,)) + raise RuntimeError("Missing %s from %s" % (f,outdir)) context = {'archivename': make_submission_name(name), 'filelist': " ".join(files), diff --git a/htsworkflow/submission/daf.py b/htsworkflow/submission/daf.py index 75c8868..1c19339 100644 --- a/htsworkflow/submission/daf.py +++ b/htsworkflow/submission/daf.py @@ -236,6 +236,8 @@ class DAFMapper(object): #attributes = get_filename_attribute_map(paired) libNode = self.libraryNS[library_id + "/"] + self._add_library_details_to_model(libNode) + submission_files = os.listdir(submission_dir) for f in submission_files: self.construct_file_attributes(submission_dir, libNode, f) @@ -250,6 +252,7 @@ class DAFMapper(object): """ path, filename = os.path.split(pathname) + logger.debug("Searching for view") view = self.find_view(filename) if view is None: logger.warn("Unrecognized file: %s" % (pathname,)) @@ -260,17 +263,23 @@ class DAFMapper(object): submission_name = self.make_submission_name(submission_dir) submissionNode = self.get_submission_node(submission_dir) submission_uri = str(submissionNode.uri) - view_name = str(fromTypedNode(self.model.get_target(view, dafTermOntology['name']))) + view_name = fromTypedNode(self.model.get_target(view, dafTermOntology['name'])) + if view_name is None: + logging.warning('Could not find view name for {0}'.format(str(view))) + return + + view_name = str(view_name) submissionView = RDF.Node(RDF.Uri(submission_uri + '/' + view_name)) self.model.add_statement( RDF.Statement(self.submissionSet, dafTermOntology['has_submission'], submissionNode)) - + logger.debug("Adding statements to {0}".format(str(submissionNode))) self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['has_view'], submissionView)) self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['name'], toTypedNode(submission_name))) self.model.add_statement(RDF.Statement(submissionNode, rdfNS['type'], submissionOntology['submission'])) self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['library'], libNode)) - + + logger.debug("Adding statements to {0}".format(str(submissionView))) # add trac specific information self.model.add_statement( RDF.Statement(submissionView, dafTermOntology['view'], view)) @@ -285,13 +294,8 @@ class DAFMapper(object): ] terms.extend((dafTermOntology[v] for v in self.get_daf_variables())) - # Add everything I can find - for term in terms: - value = str(self._get_library_attribute(libNode, term)) - if value is not None: - self.model.add_statement(RDF.Statement(submissionView, term, value)) - # add file specific information + logger.debug("Updating file md5sum") fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename)) submission_pathname = os.path.join(submission_dir, filename) md5 = make_md5sum(submission_pathname) @@ -306,19 +310,16 @@ class DAFMapper(object): self.model.add_statement( RDF.Statement(fileNode, dafTermOntology['md5sum'], md5)) - + logger.debug("Done.") + def _add_library_details_to_model(self, libNode): parser = RDF.Parser(name='rdfa') new_statements = parser.parse_as_stream(libNode.uri) for s in new_statements: # don't override things we already have in the model - q = RDF.Statement(s.subject, s.predicate, None) - if len(list(self.model.find_statements(q))) == 0: + targets = list(self.model.get_targets(s.subject, s.predicate)) + if len(targets) == 0: self.model.append(s) - - statements = list(self.model.find_statements(q)) - if len(statements) == 0: - logger.warning("Nothing known about %s" % (str(libNode),)) def get_daf_variables(self): """Returns simple variables names that to include in the ddf @@ -349,21 +350,25 @@ class DAFMapper(object): if not isinstance(attribute, RDF.Node): attribute = libraryOntology[attribute] - # search through the model twice (adding in data from website) - for i in xrange(2): - targets = list(self.model.get_targets(libNode, attribute)) - if len(targets) > 0: - return self._format_library_attribute(targets) + targets = list(self.model.get_targets(libNode, attribute)) + if len(targets) > 0: + return self._format_library_attribute(targets) + else: + return None - targets = self._search_same_as(libNode, attribute) - if targets is not None: - return self._format_library_attribute(targets) - - # we don't know anything about this attribute - self._add_library_details_to_model(libNode) + #targets = self._search_same_as(libNode, attribute) + #if targets is not None: + # return self._format_library_attribute(targets) + + # we don't know anything about this attribute + self._add_library_details_to_model(libNode) + + targets = list(self.model.get_targets(libNode, attribute)) + if len(targets) > 0: + return self._format_library_attribute(targets) return None - + def _format_library_attribute(self, targets): if len(targets) == 0: return None @@ -402,7 +407,17 @@ class DAFMapper(object): else: return None - + def get_view_name(self, view): + names = list(self.model.get_targets(view, submissionOntology['view_name'])) + if len(names) == 1: + return fromTypedNode(names[0]) + else: + msg = "Found wrong number of view names for {0} len = {1}" + msg = msg.format(str(view), len(names)) + logger.error(msg) + raise RuntimeError(msg) + + def _get_filename_view_map(self): """Query our model for filename patterns diff --git a/htsworkflow/submission/test/test_daf.py b/htsworkflow/submission/test/test_daf.py index e4ee485..913b081 100644 --- a/htsworkflow/submission/test/test_daf.py +++ b/htsworkflow/submission/test/test_daf.py @@ -83,14 +83,17 @@ class TestDAF(unittest.TestCase): name = model.get_target(signal_view_node, dafTermOntology['name']) self.failUnlessEqual(fromTypedNode(name), u'Signal') -def load_daf_mapper(name, extra_statements=None): +def load_daf_mapper(name, extra_statements=None, ns=None): """Load test model in """ model = get_model() + if ns is None: + ns="http://extra" + if extra_statements is not None: parser = RDF.Parser(name='turtle') parser.parse_string_into_model(model, extra_statements, - 'http://extra.extra') + ns) test_daf_stream = StringIO(test_daf) mapper = daf.DAFMapper(name, daf_file = test_daf_stream, model=model) @@ -120,30 +123,31 @@ class TestDAFMapper(unittest.TestCase): #self.failUnlessEqual(search[0].object.literal_value['string'], pattern) def test_find_one_view(self): + name='testfind' extra = '''@prefix dafTerm: . +@prefix thisView: . -<%(submissionLog)s/testfind/view/Signal> dafTerm:filename_re ".*\\\\.bam" . -<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*_r1\\\\.fastq" . -''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog'} - - daf_mapper = load_daf_mapper('testfind', extra_statements = extra) +thisView:Signal dafTerm:filename_re ".*\\\\.bam" . +thisView:FastqRd1 dafTerm:filename_re ".*_r1\\\\.fastq" . +'''.format(name) + daf_mapper = load_daf_mapper(name, extra_statements = extra) view = daf_mapper.find_view('filename_r1.fastq') - self.failUnlessEqual(str(view), - str(submissionLog['testfind/view/FastqRd1'])) - - #writer = get_serializer() - #turtle = writer.serialize_model_to_string(model) - #print turtle + + # dump_model(daf_mapper.model) + view_root = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/{0}/view/' + view_root = view_root.format(name) + self.failUnlessEqual(str(view), '<{0}{1}>'.format(view_root,'FastqRd1')) def test_find_overlapping_view(self): + name = 'testfind' extra = '''@prefix dafTerm: . +@prefix thisView: . -<%(submissionLog)s/testfind/view/fastq> dafTerm:filename_re ".*\\\\.fastq" . -<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*_r1\\\\.fastq" . -''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog'} - - daf_mapper = load_daf_mapper('testfind', extra_statements = extra) +thisView:fastq dafTerm:filename_re ".*\\\\.fastq" . +thisView:FastqRd1 dafTerm:filename_re ".*_r1\\\\.fastq" . +'''.format(name) + daf_mapper = load_daf_mapper(name, extra_statements = extra) self.failUnlessRaises(daf.ModelException, daf_mapper.find_view, @@ -153,13 +157,16 @@ class TestDAFMapper(unittest.TestCase): lib_id = '11204' lib_url = 'http://jumpgate.caltech.edu/library/%s/' %(lib_id) extra = '''@prefix dafTerm: . +@prefix submissionOntology: . +@prefix thisView: . @prefix xsd: . -<%(submissionLog)s/testfind/view/Signal> dafTerm:filename_re ".*\\\\.bam" . -<%(submissionLog)s/testfind/view/FastqRd1> dafTerm:filename_re ".*\\\\.fastq" . +thisView:Signal dafTerm:filename_re ".*\\\\.bam" ; + submissionOntology:view_name "Signal" . +thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ; + submissionOntology:view_name "FastqRd1" . <%(libUrl)s> <%(libraryOntology)sgel_cut> "100"^^xsd:decimal . -''' % {'submissionLog': 'http://jumpgate.caltech.edu/wiki/SubmissionsLog', - 'libraryOntology': 'http://jumpgate.caltech.edu/wiki/LibraryOntology#', +''' % {'libraryOntology': 'http://jumpgate.caltech.edu/wiki/LibraryOntology#', 'libUrl': lib_url} daf_mapper = load_daf_mapper('testfind', extra) @@ -181,16 +188,18 @@ class TestDAFMapper(unittest.TestCase): libNode, filename) + #dump_model(daf_mapper.model) + sub_root = "http://jumpgate.caltech.edu/wiki/SubmissionsLog/testfind/" submission_name = sub_root + analysis_name source = daf_mapper.model.get_source(rdfNS['type'], submissionOntology['submission']) - self.failUnlessEqual(str(source.uri), submission_name) view_name = submission_name + '/Signal' view = daf_mapper.model.get_target(source, submissionOntology['has_view']) self.failUnlessEqual(str(view.uri), view_name) + def test_library_url(self): daf_mapper = load_daf_mapper('urltest') @@ -215,6 +224,7 @@ def mktempfile(suffix='', prefix='tmp', dir=None): os.close(fd) os.unlink(pathname) print "unmade", pathname + def suite(): suite = unittest.makeSuite(TestDAF, 'test') -- 2.30.2