mark the example submission rule files as being raw, so the escapes don't get confused
diff --git a/encode_submission/encode_find.py b/encode_submission/encode_find.py
index 6608a05de61154713d9e47bd8ab37b71424c520d..7589f5487aef5ef85a6c4ec65072aa3d025ef5bc 100644
--- a/encode_submission/encode_find.py
+++ b/encode_submission/encode_find.py
@@ -20,21 +20,20 @@ import sys
 import urllib
 import urlparse
 
+if 'DJANGO_SETTINGS_MODULE' not in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
 from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
+from htsworkflow.util.rdfns import *
 from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     dublinCoreNS, \
      get_model, \
      get_serializer, \
      sparql_query, \
      submissionOntology, \
      libraryOntology, \
-     load_into_model, \
-     rdfNS, \
-     rdfsNS, \
-     xsdNS
+     load_into_model
 TYPE_N = rdfNS['type']
 CREATION_DATE = libraryOntology['date']
 
@@ -44,7 +43,7 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
-     get_ucsc_file_index, \
+     get_encodedcc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
@@ -60,8 +59,10 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
-                   "{genome}/encodeDCC/{composite}/"
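+# map SL-prefixed library names to their five digit library IDs
+# (see get_library_id below)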
+SL_MAP = {'SL2970': '02970',
+          'SL2971': '02971',
+          'SL2973': '02973',}
+
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -76,12 +77,14 @@ def main(cmdline=None):
         logging.basicConfig(level=logging.DEBUG)
     elif opts.verbose:
         logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.ERROR)
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
 
     cookie = None
-    model = get_model(opts.load_model, DBDIR)
+    model = get_model(opts.model, DBDIR)
 
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
@@ -92,25 +95,38 @@ def main(cmdline=None):
     else:
         limit = args
 
+    if opts.reload_libraries:
+        reload_libraries(model, args)
+        return
+
     if opts.update:
+        opts.update_submission = True
+        opts.update_libraries = True
+        opts.update_ucsc_downloads = True
+
+    if opts.update_submission:
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
-        load_encode_libraries(model, htswapi)
+
+    if opts.update_libraries:
+        load_encode_assigned_libraries(model, htswapi)
+        load_unassigned_submitted_libraries(model)
+
+    if opts.update_ucsc_downloads:
         our_tracks = [
             {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
-            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            #{'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
         ]
         for track_info in our_tracks:
-            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
-
+            load_encodedcc_files(model, **track_info)
 
     if opts.sparql is not None:
-        sparql_query(model, opts.sparql)
+        sparql_query(model, opts.sparql, 'html')
 
     if opts.find_submission_with_no_library:
-        find_submissions_with_no_library(model)
+        report_submissions_with_no_library(model)
 
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
@@ -122,21 +138,27 @@ def make_parser():
     """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
-    commands.add_option('--load-model', default=None,
+    commands.add_option('--model', default=None,
       help="Load model database")
     commands.add_option('--load-rdf', default=None,
       help="load rdf statements into model")
     commands.add_option('--print-rdf', action="store_true", default=False,
       help="print ending model state")
     commands.add_option('--update', action="store_true", default=False,
-      help="Query remote data sources and update our database")
-    #commands.add_option('--update-ucsc-status', default=None,
-    #  help="download status from ucsc, requires filename for extra rules")
-    #commands.add_option('--update-ddfs', action="store_true", default=False,
-    #  help="download ddf information for known submission")
-    #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, "\
-    #       "requires filename for extra rules")
+      help="Do all updates")
+    commands.add_option('--update-submission', action="store_true",
+                        default=False,
+      help="download status from ucsc")
+    commands.add_option('--update-ucsc-downloads', action="store_true",
+                        default=False,
+      help="Update download locations from UCSC")
+    commands.add_option('--update-libraries', action="store_true",
+                        default=False,
+      help="download library info from htsw")
+    commands.add_option('--reload-libraries', action="store_true",
+                        default=False,
+                        help="Delete and redownload library information. "\
+                             "Optionally list specific library IDs.")
     parser.add_option_group(commands)
 
     queries = OptionGroup(parser, "Queries")
@@ -160,7 +182,7 @@ def make_parser():
 
 
 def load_my_submissions(model, limit=None, cookie=None):
-    """Parse all the submissions from UCSC into model
+    """Parse all of my submissions from encodesubmit into model
     It will look at the global USER_URL to figure out who to scrape
     cookie contains the session cookie, if none, will attempt to login
     """
@@ -234,6 +256,17 @@ def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
         LOGGER.debug("Found: {0}".format(str(query)))
 
 
+def report_submissions_with_no_library(model):
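+    """Print a Turtle stub for each submission that has no library attached
+    """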
+    missing = find_submissions_with_no_library(model)
+    for row in missing:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn "\
+              "<http://jumpgate.caltech.edu/library/> ."
+        print ""
+
 def find_submissions_with_no_library(model):
     missing_lib_query_text = """
 PREFIX submissionOntology:<{submissionOntology}>
@@ -247,15 +280,39 @@ WHERE {{
 }}""".format(submissionOntology=submissionOntology[''].uri)
     missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
 
-    results = missing_lib_query.execute(model)
-    for row in results:
-        subid = row['subid']
-        name = row['name']
-        print "# {0}".format(name)
-        print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn "\
-              "<http://jumpgate.caltech.edu/library/> ."
-        print ""
+    return missing_lib_query.execute(model)
+
+
+def find_unscanned_submitted_libraries(model):
+    """Scan model for libraries that don't have library details loaded
+    """
+    unscanned_libraries = """
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT distinct ?submission ?library_urn
+WHERE {{
+  ?submission submissionOntology:library_urn ?library_urn .
+  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
+  FILTER(!BOUND(?library_type))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+    query = RDF.SPARQLQuery(unscanned_libraries)
+    return query.execute(model)
+
+def find_all_libraries(model):
+    """Scan model for every library marked as
+    """
+    libraries = """
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX libraryOntology:<{libraryOntology}>
+
+SELECT distinct ?library_urn
+WHERE {{
+  ?library_urn rdf:type ?library_type .
+  FILTER(regex(str(?library_type), "^{libraryOntology}"))
+}}""".format(libraryOntology=libraryOntology[''].uri)
+    query = RDF.SPARQLQuery(libraries)
+    return query.execute(model)
 
 
 def add_submission_creation_date(model, subUrn, cookie):
@@ -264,15 +321,17 @@ def add_submission_creation_date(model, subUrn, cookie):
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        parse_submission_page(model, cells, subUrn)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+
 def get_creation_dates(model, subUrn):
     query = RDF.Statement(subUrn, CREATION_DATE, None)
     creation_dates = list(model.find_statements(query))
     return creation_dates
 
+
 def parse_submission_page(model, submissionTree, subUrn):
     cells = submissionTree.findall('.//td')
     dateTimeType = xsdNS['dateTime']
@@ -384,7 +443,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
                 add_stmt(model, fileNode, predicate, object)
 
 
-def load_encode_libraries(model, htswapi):
+def load_encode_assigned_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
@@ -400,21 +459,99 @@ def load_encode_libraries(model, htswapi):
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
-def load_encodedcc_files(model, base_url):
-    if base_url[-1] != '/':
-        base_url += '/'
+def load_unassigned_submitted_libraries(model):
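+    """Load library details for submitted libraries that haven't been scanned yet
+    """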
+    unassigned = find_unscanned_submitted_libraries(model)
+    for query_record in unassigned:
+        library_urn = query_record['library_urn']
+        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
+        load_library_detail(model, library_urn)
 
-    file_index = ucsc.get_ucsc_file_index(base_url)
+def reload_libraries(model, library_list):
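+    """Delete and re-download detail for the given libraries, or all if none listed
+    """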
+    if len(library_list) == 0:
+        # reload everything.
+        queryset = find_all_libraries(model)
+        libraries = (s['library_urn'] for s in queryset)
+    else:
+        libraries = (user_library_id_to_library_urn(lib) for lib in library_list)
+
+    for library_urn in libraries:
+        delete_library(model, library_urn)
+        load_library_detail(model, library_urn)
+
+def user_library_id_to_library_urn(library_id):
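+    """Convert a user-supplied library ID into an RDF node, defaulting to LIBRARY_NS
+    """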
+    split_url = urlparse.urlsplit(library_id)
+    if len(split_url.scheme) == 0:
+        return LIBRARY_NS[library_id]
+    else:
+        return RDF.Node(RDF.Uri(library_id))
+
+def delete_library(model, library_urn):
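+    """Remove a library's lanes and then all of its own statements from the model
+    """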
+    if not isinstance(library_urn, RDF.Node):
+        raise ValueError("library urn must be a RDF.Node")
+
+    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
+    lane_query = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
+    for lane in model.find_statements(lane_query):
+        delete_lane(model, lane.object)
+    library_attrib_query = RDF.Statement(library_urn, None, None)
+    for library_attrib in model.find_statements(library_attrib_query):
+        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
+        del model[library_attrib]
+
+
+def delete_lane(model, lane_urn):
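+    """Remove a lane's mappings and then all of its own statements from the model
+    """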
+    if not isinstance(lane_urn, RDF.Node):
+        raise ValueError("lane urn must be a RDF.Node")
+
+    delete_lane_mapping(model, lane_urn)
+    lane_attrib_query = RDF.Statement(lane_urn, None, None)
+    for lane_attrib in model.find_statements(lane_attrib_query):
+        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
+        del model[lane_attrib]
+
+
+def delete_lane_mapping(model, lane_urn):
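+    """Remove the statements describing every mapping attached to a lane
+    """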
+    if not isinstance(lane_urn, RDF.Node):
+        raise ValueError("lane urn must be a RDF.Node")
+
+    lane_mapping_query = RDF.Statement(lane_urn,
+                                       libraryOntology['has_mappings'],
+                                       None)
+    for lane_mapping in model.find_statements(lane_mapping_query):
+        mapping_attrib_query = RDF.Statement(lane_mapping.object,
+                                             None,
+                                             None)
+        for mapping_attrib in model.find_statements(mapping_attrib_query):
+            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
+            del model[mapping_attrib]
+
+
+def load_encodedcc_files(model, genome, composite):
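+    """Record the files encodeDCC publishes for a composite track as RDF statements
+    """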
+    file_index = ucsc.get_encodedcc_file_index(genome, composite)
+    if file_index is None:
+        return
+
+    lib_term = submissionOntology['library_urn']
+    sub_term = submissionOntology['submission_urn']
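+    # besides storing each attribute, link files to their library (LabExpId)
+    # and submission (SubId) pages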
     for filename, attributes in file_index.items():
-        s = RDF.Node(RDF.Uri(base_url + filename))
+        s = RDF.Node(RDF.Uri(filename))
+        model.add_statement(
+            RDF.Statement(s, TYPE_N, submissionOntology['ucsc_track']))
         for name, value in attributes.items():
             p = RDF.Node(DCC_NS[name])
             o = RDF.Node(value)
             model.add_statement(RDF.Statement(s,p,o))
+            if name.lower() == 'labexpid':
+                model.add_statement(
+                    RDF.Statement(s, lib_term, LIBRARY_NS[value+'/']))
+            elif name.lower() == 'subid':
+                sub_url = RDF.Uri(submission_view_url(value))
+                model.add_statement(
+                    RDF.Statement(s, sub_term, sub_url))
+
 
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
@@ -426,7 +563,11 @@ def load_library_detail(model, libraryUrn):
     LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
         LOGGER.info("Loading {0}".format(str(libraryUrn)))
-        rdfaParser.parse_into_model(model, libraryUrn.uri)
+        try:
+            body = get_url_as_text(str(libraryUrn.uri), 'GET')
+            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
+        except httplib2.HttpLib2ErrorWithResponse, e:
+            LOGGER.error(str(e))
     elif len(results) == 1:
         pass  # Assuming that a loaded dataset has one record
     else:
@@ -440,11 +581,15 @@ def get_library_id(name):
     '11039'
     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
     '10150'
+    >>> get_library_id('2x75-GM12892-rep2-SL2970')
+    '02970'
     """
     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
     library_id = None
     if match is not None:
         library_id = match.group('id')
+    if library_id in SL_MAP:
+        library_id = SL_MAP[library_id]
     return library_id
 
 
@@ -522,6 +667,7 @@ def get_url_as_tree(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 
 def get_url_as_text(url, method, cookie=None):
@@ -536,6 +682,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 ################
 #  old stuff