From: Diane Trout
Date: Fri, 25 Feb 2011 23:46:15 +0000 (-0800)
Subject: Only pull web data when it's missing from the rdf model.
X-Git-Tag: 0.5.2~56
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=2960690795da5c7c28da4563d8fdc2a091b14d48

Only pull web data when it's missing from the rdf model.

This involved altering the RDF model. I changed the namespace to
something a bit more descriptive. The information grabbed from the ddf
is now attached to the submission status node, as changing the ddf will
change the status.
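As a rough sketch of the reshaped graph (prefix names follow the new
.sparql files below; the URNs and literal values are invented for
illustration, and nodes are typed with rdfsNS['type'] as the script
does):

    <http://encodesubmit.ucsc.edu/pipeline/show/1234>
        rdfs:type submitOnt:Submission ;
        submitOnt:name "1234 example submission" ;
        submitOnt:library_urn <http://jumpgate.caltech.edu/library/10001/> ;
        submitOnt:has_status _:status .

    # ddf-derived statements hang off the status blank node, since a
    # new ddf implies a new status
    _:status
        submitOnt:status "reviewing" ;
        submitOnt:last_modify_date "2011-02-25T12:00:00" ;
        submitOnt:has_file _:file .

    _:file
        rdfs:type ddf:file ;
        ddf:filename "example.bam" .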
---

diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
index b4d6c5d..df51119 100644
--- a/extra/ucsc_encode_submission/encode_find.py
+++ b/extra/ucsc_encode_submission/encode_find.py
@@ -55,8 +55,7 @@ def main(cmdline=None):
     if opts.update:
         cookie = login(cookie=cookie)
         load_my_submissions(model, cookie=cookie)
-        update_submission_detail(model, cookie=cookie)
-        load_libraries(model, htswapi)
+        load_encode_libraries(model, htswapi)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
@@ -123,49 +122,44 @@ def load_my_submissions(model, cookie=None):
     tr = p.findNext('tr') # first record is header
     tr = tr.findNext()
-    ClassP = rdfsNS['Class']
-    NameP = submitOntologyNS['name']
-    StatusP = submitOntologyNS['status']
-    LastModifyP = submitOntologyNS['last_modify_date']
-    SpeciesP = submitOntologyNS['species']
+    TypeN = rdfsNS['type']
+    NameN = submitOntologyNS['name']
+    SpeciesN = submitOntologyNS['species']
     LibraryURN = submitOntologyNS['library_urn']
-    # typing saving
-    add_stmt = model.add_statement
-    Stmt = RDF.Statement
+
     while tr is not None:
         td = tr.findAll('td')
         if td is not None and len(td) > 1:
-            subIdText = td[0].contents[0].contents[0].encode(CHARSET)
-            subId = submissionNS[subIdText]
-            submission_stmt = Stmt(subId, ClassP,
-                                   submitOntologyNS['Submission'])
-            if model.contains_statement(submission_stmt):
-                logger.debug("Have {0}".format(str(submission_stmt)))
-            else:
-                logger.info("New submission {0}".format(str(submission_stmt)))
-                add_stmt(submission_stmt)
+            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
+            subUrn = submissionNS[subUrnText]
+
+            add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])
 
-            name = get_contents(td[4])
-            add_stmt(Stmt(subId, NameP, name))
+            name = get_contents(td[4])
+            add_stmt(model, subUrn, NameN, name)
 
-            status = get_contents(td[6]).strip()
-            add_stmt(Stmt(subId, StatusP, status))
-
-            last_mod_datetime = get_date_contents(td[8])
-            last_mod = last_mod_datetime.isoformat()
-            add_stmt(Stmt(subId, LastModifyP, last_mod))
-
-            species = get_contents(td[2])
-            if species is not None:
-                add_stmt(Stmt(subId, SpeciesP, species))
-
-            library_id = get_library_id(name)
-            if library_id is not None:
-                add_submission_to_library_urn(model,
-                                              subId,
-                                              LibraryURN,
-                                              library_id)
+            species = get_contents(td[2])
+            if species is not None:
+                add_stmt(model, subUrn, SpeciesN, species)
+
+            library_id = get_library_id(name)
+            if library_id is not None:
+                add_submission_to_library_urn(model,
+                                              subUrn,
+                                              LibraryURN,
+                                              library_id)
+            add_submission_creation_date(model, subUrn, cookie)
+
+            # grab changing attributes
+            status = get_contents(td[6]).strip()
+            last_mod_datetime = get_date_contents(td[8])
+            last_mod = last_mod_datetime.isoformat()
+
+            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
+
+            logger.info("Processed {0}".format(subUrn))
+
         tr = tr.findNext('tr')
@@ -179,15 +173,23 @@ def add_submission_to_library_urn(model,
                                   submissionUrn, predicate, library_id):
         logger.info("Adding Sub -> Lib link: {0}".format(link))
         model.add_statement(link)
     else:
-        logger.info("Found: {0}".format(str(result[0])))
+        logger.debug("Found: {0}".format(str(query)))
 
 def find_submissions_with_no_library(model):
-    p = os.path.abspath(__file__)
-    sourcedir = os.path.dirname(p)
-    no_lib = open(os.path.join(sourcedir, "no-lib.sparql"),'r').read()
-    query = RDF.SPARQLQuery(no_lib)
-    results = query.execute(model)
+    missing_lib_query = RDF.SPARQLQuery("""
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT
+ ?subid ?name
+WHERE {{
+  ?subid submissionOntology:name ?name
+  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
+  FILTER (!bound(?libid))
+}}""".format(submissionOntology=submitOntologyNS[''].uri)
+)
+
+    results = missing_lib_query.execute(model)
     for row in results:
         subid = row['subid']
         name = row['name']
@@ -196,22 +198,12 @@ def find_submissions_with_no_library(model):
         print " encodeSubmit:library_urn ."
         print ""
 
-def update_submission_detail(model, cookie=None):
-    """Look for submission IDs in our model and go get their ddfs
-    """
-    submissions = model.get_sources(rdfsNS['Class'],
-                                    submitOntologyNS['Submission'])
-    for subUrn in submissions:
-        logging.info("Updating detail for: {0}".format(str(subUrn)))
-        update_submission_creation_date(model, subUrn, cookie)
-        download_ddf(model, subUrn, cookie=cookie)
-
-def update_submission_creation_date(model, subUrn, cookie):
+def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
-    creationDateP = libNS['date']
+    creationDateN = libOntNS['date']
     dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateP, None)
+    query = RDF.Statement(subUrn, creationDateN, None)
     creation_dates = list(model.find_statements(query))
     if len(creation_dates) == 0:
         logger.info("Getting creation date for: {0}".format(str(subUrn)))
@@ -221,108 +213,113 @@
         created_date = get_date_contents(created_label.next)
         created_date_node = RDF.Node(literal=created_date.isoformat(),
                                      datatype=dateTimeType.uri)
-        model.add_statement(
-            RDF.Statement(subUrn, creationDateP, created_date_node)
-        )
+        add_stmt(model, subUrn, creationDateN, created_date_node)
+    else:
+        logger.debug("Found creation date for: {0}".format(str(subUrn)))
+
+def update_submission_detail(model, subUrn, status, recent_update, cookie):
+    HasStatusN = submitOntologyNS['has_status']
+    StatusN = submitOntologyNS['status']
+    LastModifyN = submitOntologyNS['last_modify_date']
+
+    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
+    status_nodes = list(model.find_statements(status_nodes_query))
+
+    if len(status_nodes) == 0:
+        # has no status node, add one
+        logger.info("Adding status node to {0}".format(subUrn))
+        status_blank = RDF.Node()
+        add_stmt(model, subUrn, HasStatusN, status_blank)
+        # class term for status nodes; 'Status' is an assumed name
+        add_stmt(model, status_blank, rdfsNS['type'], submitOntologyNS['Status'])
+        add_stmt(model, status_blank, StatusN, status)
+        add_stmt(model, status_blank, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_blank, cookie=cookie)
+    else:
+        logger.info("Found {0} status blanks".format(len(status_nodes)))
+        for status_statement in status_nodes:
+            status_blank = status_statement.object
+            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
+            last_mod_nodes = model.find_statements(last_modified_query)
+            for last_mod_statement in last_mod_nodes:
+                last_mod_date = str(last_mod_statement.object)
+                if recent_update == str(last_mod_date):
+                    update_ddf(model, subUrn, status_blank, cookie=cookie)
+                    break
 
-def download_ddf(model, subId, cookie=None):
-    """Read a DDF
-    """
-    if cookie is None:
-        cookie = login()
-
-    download_ddf_url = str(subId).replace('show', 'download_ddf')
-    ddf = get_url_as_text(download_ddf_url, 'GET', cookie)
+def update_ddf(model, subUrn, statusNode, cookie):
+    TypeN = rdfsNS['type']
+
+    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
-    query = RDF.Statement(ddfUrn, rdfsNS['Class'], ddfNS['ddf'])
-    if not model.contains_statement(query):
-        statements = parse_ddf(subId, ddf)
-        for s in statements:
-            model.add_statement(s)
+
+    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
+    if not model.contains_statement(status_is_ddf):
+        logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
+        add_ddf_statements(model, statusNode, ddf_text)
+        model.add_statement(status_is_ddf)
 
-def parse_ddf(subId, ddf_blob):
+def add_ddf_statements(model, statusNode, ddf_string):
     """Convert a ddf text file into RDF Statements
     """
-    ddf_data = ddf_blob.split('\n')
+    ddf_lines = ddf_string.split('\n')
     # first line is header
-    header = ddf_data[0].split()
+    header = ddf_lines[0].split()
     attributes = [ ddfNS[x] for x in header ]
     statements = []
-    subIdUri = str(subId.uri)
-    # force it to look like a namespace
-    if subIdUri[-1] != '/':
-        subIdUri += '/'
-    subIdNS = RDF.NS(subIdUri)
-    for ddf_line in ddf_data[1:]:
+
+    for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
         if len(ddf_line) == 0:
             continue
         if ddf_line.startswith("#"):
             continue
-        ddf_records = ddf_line.split('\t')
-        files = ddf_records[0].split(',')
-        file_attributes = ddf_records[1:]
+        ddf_record = ddf_line.split('\t')
+        files = ddf_record[0].split(',')
+        file_attributes = ddf_record[1:]
         for f in files:
-            blank = RDF.Node()
-            statements += [RDF.Statement(subId,
-                                         submitOntologyNS['has_file'],
-                                         blank)]
-            statements += [RDF.Statement(blank, rdfsNS['Class'],
-                                         submitOntologyNS['File'])]
-            statements += [RDF.Statement(blank, ddfNS['filename'], f)]
-            file_uri_list = [ blank ] * len(file_attributes)
-            for s,p,o in zip(file_uri_list, attributes[1:], file_attributes):
-                statements += [RDF.Statement(s,p,o)]
-
-    return statements
-
-def load_libraries(model, htswapi):
+            fileNode = RDF.Node()
+            add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
+            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
+            add_stmt(model, fileNode, ddfNS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
+                add_stmt(model, fileNode, predicate, object)
+
+
+def load_encode_libraries(model, htswapi):
+    """Get libraries associated with encode.
+    """
+    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
+    rdfaParser = RDF.Parser(name='rdfa')
+    print encodeUrl
+    rdfaParser.parse_into_model(model, encodeUrl)
+    query = RDF.Statement(None, libOntNS['library_id'], None)
+    libraries = model.find_statements(query)
+    for statement in libraries:
+        libraryUrn = statement.subject
+        load_library_detail(model, libraryUrn)
+
+
+def load_library_detail(model, libraryUrn):
+    """Grab detail information from library page
     """
-    query = RDF.SPARQLQuery("""
-    SELECT distinct ?library_urn
-    WHERE {
-      ?subid ?library_urn .
- }""") - results = query.execute(model) - #newmodel = get_model() - newmodel = model - for row in results: - lib_id = row['library_urn'] - lib_uri = str(row['library_urn'].uri) - short_lib_id = lib_uri.replace(libraryNS._prefix,"") - logging.info("Loading library info: {0}".format(short_lib_id)) - if short_lib_id.startswith("SL"): - continue - lib_info = htswapi.get_library(short_lib_id) - - for lib_k, lib_v in lib_info.items(): - if lib_k != 'lane_set': - attribute = lib_k.encode(CHARSET) - newmodel.append( - RDF.Statement(lib_id, - submitOntologyNS[attribute], - str(lib_v))) - else: - for flowcell in lib_v: - blank = RDF.Node() - newmodel.append( - RDF.Statement(lib_id, - submitOntologyNS['has_lane'], - blank)) - for fc_k, fc_v in flowcell.items(): - newmodel.append( - RDF.Statement(blank, - submitOntologyNS[fc_k.encode(CHARSET)], - str(fc_v))) - - #serializer = RDF.Serializer('turtle') - #print serializer.serialize_model_to_string(newmodel) - + rdfaParser = RDF.Parser(name='rdfa') + query = RDF.Statement(libraryUrn, libOntNS['date'], None) + results = list(model.find_statements(query)) + if len(results) == 0: + logger.info("Loading {0}".format(str(libraryUrn))) + rdfaParser.parse_into_model(model, libraryUrn.uri) + elif len(results) == 1: + pass # Assuming that a loaded dataset has one record + else: + logging.warning("Many dates for {0}".format(libraryUrn)) + def get_library_id(name): """Guess library ID from library name """ @@ -375,7 +372,13 @@ def load_into_model(model, parser_name, filename): ns_uri = submitOntologyNS[''].uri rdf_parser.parse_string_into_model(model, data, ns_uri) - +def add_stmt(model, subject, predicate, object): + """Convienence create RDF Statement and add to a model + """ + return model.add_statement( + RDF.Statement(subject, predicate, object) + ) + def login(cookie=None): """Login if we don't have a cookie """ diff --git a/extra/ucsc_encode_submission/failed-submissions.sparql b/extra/ucsc_encode_submission/failed-submissions.sparql new file mode 100644 index 0000000..af4af4e --- /dev/null +++ b/extra/ucsc_encode_submission/failed-submissions.sparql @@ -0,0 +1,22 @@ +## +## Find submissions that are currently "failed" +## + +PREFIX rdfs: +PREFIX submitOnt: +PREFIX libOntNS: + +#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/") +#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/") +#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#") + +SELECT + ?subid ?subname ?liburn ?status +WHERE { + ?subid submitOnt:name ?subname . + ?subid submitOnt:library_urn ?liburn . + ?subid submitOnt:has_status ?statusNode . + ?statusNode submitOnt:status ?status . + ?statusNode submitOnt:last_modify_date ?last_modify . + FILTER (regex(?status, "failed", "i")) +} diff --git a/extra/ucsc_encode_submission/find-lib-by-cell.sparql b/extra/ucsc_encode_submission/find-lib-by-cell.sparql new file mode 100644 index 0000000..1342dac --- /dev/null +++ b/extra/ucsc_encode_submission/find-lib-by-cell.sparql @@ -0,0 +1,14 @@ +# Produce list of submissions associated with a cell/replicate + +PREFIX rdfs: +PREFIX encodeSubmit: +PREFIX libraryOntology: + +SELECT distinct ?liburn ?cell ?replicate ?subid +WHERE { + ?subid encodeSubmit:library_urn ?liburn ; + encodeSubmit:name ?name . 
+  ?liburn libraryOntology:cell_line ?cell ;
+          libraryOntology:replicate ?replicate
+}
+ORDER BY ?cell ?replicate ?liburn
diff --git a/extra/ucsc_encode_submission/no-lib.sparql b/extra/ucsc_encode_submission/no-lib.sparql
deleted file mode 100644
index 22e36ac..0000000
--- a/extra/ucsc_encode_submission/no-lib.sparql
+++ /dev/null
@@ -1,18 +0,0 @@
-## Find submissions that don't have a library_id
-##
-## The code to generate names scans the name for the ID
-## and some of the older names lack an included library ID
-##
-## So I need some manual way of adding in the submission to
-## our library_id map
-
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX encodeSubmit:
-
-SELECT
- ?subid ?name
-WHERE {
-  ?subid encodeSubmit:name ?name
-  OPTIONAL { ?subid encodeSubmit:library_urn ?libid }
-  FILTER (!bound(?libid))
-}
\ No newline at end of file