Fix namespace issues in encode_find after the conversion

[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py

index 3ac4f0420d435b66439b9d2b8488ae5999e522e2..9955e296a8451316424d218daf3036f81e293e2d 100644 (file)
--- a/extra/ucsc_encode_submission/encode_find.py
+++ b/extra/ucsc_encode_submission/encode_find.py
@@ -14,24 +14,33 @@ import re
  import RDF 
  import sys
  import urllib
+import urlparse
  
  from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dublinCoreNS, \
+     get_model, \
+     get_serializer, \
+     sparql_query, \
+     submissionOntology, \
+     libraryOntology, \
+     load_into_model, \
+     rdfNS, \
+     rdfsNS, \
+     xsdNS
+
+# URL mappings
+libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
  
+from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
+download_ddf = UCSCEncodePipeline+"download_ddf#"
+ddfNS = RDF.NS(download_ddf)
+               
  DBDIR = os.path.expanduser("~diane/proj/submission")
  
  logger = logging.getLogger("encode_find")
  
-libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
-submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
-submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
-ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
-libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
-
-dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
-rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
-rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
-xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
-
  LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
  USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
  
@@ -49,10 +58,11 @@ def main(cmdline=None):
      htswapi = api.HtswApi(opts.host, htsw_authdata)
      
      cookie = None
-    model = get_model(opts.load_model)
+    model = get_model(opts.load_model, DBDIR)
      
      if opts.load_rdf is not None:
-        load_into_model(model, opts.rdf_parser_name, opts.load_rdf)
+        ns_uri = submissionOntology[''].uri
+        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
          
      if opts.update:
          cookie = login(cookie=cookie)
@@ -66,7 +76,7 @@ def main(cmdline=None):
          missing = find_submissions_with_no_library(model)
                  
      if opts.print_rdf:
-        serializer = RDF.Serializer(name=opts.rdf_parser_name)
+        serializer = get_serializer(name=opts.rdf_parser_name)
          print serializer.serialize_model_to_string(model)
  
  
@@ -107,15 +117,6 @@ def make_parser():
  
      return parser
  
-def get_model(model_name=None):
-    if model_name is None:
-        storage = RDF.MemoryStorage()
-    else:
-        storage = RDF.HashStorage(model_name,
-                      options="hash-type='bdb',dir='{0}'".format(DBDIR))
-    model = RDF.Model(storage)
-    return model
-        
  def load_my_submissions(model, cookie=None):
      if cookie is None:
          cookie = login()
@@ -126,17 +127,17 @@ def load_my_submissions(model, cookie=None):
      # first record is header
      tr = tr.findNext()
      TypeN = rdfsNS['type']
-    NameN = submitOntologyNS['name']
-    SpeciesN = submitOntologyNS['species']
-    LibraryURN = submitOntologyNS['library_urn']
+    NameN = submissionOntology['name']
+    SpeciesN = submissionOntology['species']
+    LibraryURN = submissionOntology['library_urn']
  
      while tr is not None:
          td = tr.findAll('td')
          if td is not None and len(td) > 1:
              subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
-            subUrn = submissionNS[subUrnText]
+            subUrn = RDF.Uri(submission_view_url(subUrnText))
  
-            add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])
+            add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
                  
              name = get_contents(td[4])
              add_stmt(model, subUrn, NameN, name)
@@ -169,10 +170,10 @@ def load_my_submissions(model, cookie=None):
  def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
      """Add a link from a UCSC submission to woldlab library if needed
      """
-    libraryUrn = libraryNS[library_id]
+    libraryUrn = libraryNS[library_id+'/']
      query = RDF.Statement(submissionUrn, predicate, libraryUrn)
      if not model.contains_statement(query):
-        link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
+        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
          logger.info("Adding Sub -> Lib link: {0}".format(link))
          model.add_statement(link)
      else:
@@ -189,7 +190,7 @@ WHERE {{
    ?subid submissionOntology:name ?name
    OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
    FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submitOntologyNS[''].uri)
+}}""".format(submissionOntology=submissionOntology[''].uri)
  )    
  
      results = missing_lib_query.execute(model)
@@ -204,13 +205,13 @@ WHERE {{
  
  def add_submission_creation_date(model, subUrn, cookie):
      # in theory the submission page might have more information on it.
-    creationDateN = libOntNS['date']
+    creationDateN = libraryOntology['date']
      dateTimeType = xsdNS['dateTime']
      query = RDF.Statement(subUrn, creationDateN, None)
      creation_dates = list(model.find_statements(query))
      if len(creation_dates) == 0:
          logger.info("Getting creation date for: {0}".format(str(subUrn)))
-        soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
+        soup = get_url_as_soup(str(subUrn), 'GET', cookie)
          created_label = soup.find(text="Created: ")
          if created_label:
              created_date = get_date_contents(created_label.next)
@@ -221,9 +222,9 @@ def add_submission_creation_date(model, subUrn, cookie):
          logger.debug("Found creation date for: {0}".format(str(subUrn)))
  
  def update_submission_detail(model, subUrn, status, recent_update, cookie):
-    HasStatusN = submitOntologyNS['has_status']
-    StatusN = submitOntologyNS['status']
-    LastModifyN = submitOntologyNS['last_modify_date']
+    HasStatusN = submissionOntology['has_status']
+    StatusN = submissionOntology['status']
+    LastModifyN = submissionOntology['last_modify_date']
  
      status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
      status_nodes = list(model.find_statements(status_nodes_query))
@@ -257,7 +258,7 @@ def update_ddf(model, subUrn, statusNode, cookie):
      download_ddf_url = str(subUrn).replace('show', 'download_ddf')
      ddfUrn = RDF.Uri(download_ddf_url)
      
-    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
+    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
      if not model.contains_statement(status_is_ddf):
          logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
          ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -287,7 +288,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
  
          for f in files:
              fileNode = RDF.Node()
-            add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
+            add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
              add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
              add_stmt(model, fileNode, ddfNS['filename'], f)
  
@@ -302,7 +303,7 @@ def load_encode_libraries(model, htswapi):
      rdfaParser = RDF.Parser(name='rdfa')
      print encodeUrl
      rdfaParser.parse_into_model(model, encodeUrl)
-    query = RDF.Statement(None, libOntNS['library_id'], None)
+    query = RDF.Statement(None, libraryOntology['library_id'], None)
      libraries = model.find_statements(query)
      for statement in libraries:
          libraryUrn = statement.subject
@@ -313,7 +314,7 @@ def load_library_detail(model, libraryUrn):
      """Grab detail information from library page
      """
      rdfaParser = RDF.Parser(name='rdfa')
-    query = RDF.Statement(libraryUrn, libOntNS['date'], None)
+    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
      results = list(model.find_statements(query))
      if len(results) == 0:
          logger.info("Loading {0}".format(str(libraryUrn)))
@@ -353,28 +354,7 @@ def get_date_contents(element):
      else:
          return None
  
-def sparql_query(model, query_filename):
-    """Execute sparql query from file
-    """
-    query_body = open(query_filename,'r').read()
-    query = RDF.SPARQLQuery(query_body)
-    results = query.execute(model)
-    for row in results:
-        output = []
-        for k,v in row.items()[::-1]:
-            print "{0}: {1}".format(k,v)
-        print 
-
          
-def load_into_model(model, parser_name, filename):
-    if not os.path.exists(filename):
-        raise IOError("Can't find {0}".format(filename))
-    
-    data = open(filename, 'r').read()
-    rdf_parser = RDF.Parser(name=parser_name)
-    ns_uri = submitOntologyNS[''].uri
-    rdf_parser.parse_string_into_model(model, data, ns_uri)
-
  def add_stmt(model, subject, predicate, object):
      """Convienence create RDF Statement and add to a model
      """
@@ -382,6 +362,7 @@ def add_stmt(model, subject, predicate, object):
          RDF.Statement(subject, predicate, object)
      )
  
+
  def login(cookie=None):
      """Login if we don't have a cookie
      """
@@ -502,7 +483,7 @@ def library_to_freeze(selected_libraries):
          for d in freezes:
              report.append('<td>')
              for s in batched.get(d, []):
-                show_url = submissionNS[s.subid].uri
+                show_url = submission_view_url(s.subid)
                  subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                  report.append("{0}:{1}".format(subid, s.status))
              report.append('</td>')