Added scanning for libraries that aren't attached to an encode affiliation.
author    Diane Trout <diane@caltech.edu>  Wed, 25 Jan 2012 19:47:46 +0000 (11:47 -0800)
committer Diane Trout <diane@caltech.edu>  Wed, 25 Jan 2012 19:47:46 +0000 (11:47 -0800)
Also, I'm now checking both the test and live servers for goldenPath links,
and I added a hack to convert SL-style library IDs such as SL2971 to 02971.
Lastly, I brought back updating specific data sources individually, in
addition to updating everything with --update.
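
For example (hypothetical invocations; option names as defined in
make_parser in the diff below):

    encode_find.py --update                  # run all three updates
    encode_find.py --update-submission       # only refresh submission status from UCSC
    encode_find.py --update-libraries        # only refresh library detail from htsw
    encode_find.py --update-ucsc-downloads   # only refresh encodeDCC download locations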

encode_submission/encode_find.py
htsworkflow/submission/test/test_ucsc.py
htsworkflow/submission/ucsc.py

index 6608a05de61154713d9e47bd8ab37b71424c520d..14d5bdfe8e3b900ce81ab37418d9757f661cf98a 100644 (file)
@@ -44,7 +44,7 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
-     get_ucsc_file_index, \
+     get_encodedcc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
@@ -60,8 +60,10 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
-                   "{genome}/encodeDCC/{composite}/"
+SL_MAP = {'SL2970': '02970',
+          'SL2971': '02971',
+          'SL2973': '02973',}
+
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -76,12 +78,14 @@ def main(cmdline=None):
         logging.basicConfig(level=logging.DEBUG)
     elif opts.verbose:
         logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.ERROR)
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
 
     cookie = None
-    model = get_model(opts.load_model, DBDIR)
+    model = get_model(opts.model, DBDIR)
 
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
@@ -93,24 +97,33 @@ def main(cmdline=None):
         limit = args
 
     if opts.update:
+        opts.update_submission = True
+        opts.update_libraries = True
+        opts.update_ucsc_downloads = True
+
+    if opts.update_submission:
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
-        load_encode_libraries(model, htswapi)
+
+    if opts.update_libraries:
+        load_encode_assigned_libraries(model, htswapi)
+        load_unassigned_submitted_libraries(model)
+
+    if opts.update_ucsc_downloads:
         our_tracks = [
             {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
-            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            #{'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
         ]
         for track_info in our_tracks:
-            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
-
+            load_encodedcc_files(model, **track_info)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
 
     if opts.find_submission_with_no_library:
-        find_submissions_with_no_library(model)
+        report_submissions_with_no_library(model)
 
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
@@ -122,21 +135,23 @@ def make_parser():
     """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
-    commands.add_option('--load-model', default=None,
+    commands.add_option('--model', default=None,
       help="Load model database")
     commands.add_option('--load-rdf', default=None,
       help="load rdf statements into model")
     commands.add_option('--print-rdf', action="store_true", default=False,
       help="print ending model state")
     commands.add_option('--update', action="store_true", default=False,
-      help="Query remote data sources and update our database")
-    #commands.add_option('--update-ucsc-status', default=None,
-    #  help="download status from ucsc, requires filename for extra rules")
-    #commands.add_option('--update-ddfs', action="store_true", default=False,
-    #  help="download ddf information for known submission")
-    #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, "\
-    #       "requires filename for extra rules")
+      help="Do all updates")
+    commands.add_option('--update-submission', action="store_true",
+                        default=False,
+      help="download status from ucsc")
+    commands.add_option('--update-ucsc-downloads', action="store_true",
+                        default=False,
+      help="Update download locations from UCSC")
+    commands.add_option('--update-libraries', action="store_true",
+                        default=False,
+      help="download library info from htsw")
     parser.add_option_group(commands)
 
     queries = OptionGroup(parser, "Queries")
@@ -234,6 +249,17 @@ def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
         LOGGER.debug("Found: {0}".format(str(query)))
 
 
+def report_submissions_with_no_library(model):
+    missing = find_submissions_with_no_library(model)
+    for row in missing:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn "\
+              "<http://jumpgate.caltech.edu/library/> ."
+        print ""
+
 def find_submissions_with_no_library(model):
     missing_lib_query_text = """
 PREFIX submissionOntology:<{submissionOntology}>
@@ -247,15 +273,24 @@ WHERE {{
 }}""".format(submissionOntology=submissionOntology[''].uri)
     missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
 
-    results = missing_lib_query.execute(model)
-    for row in results:
-        subid = row['subid']
-        name = row['name']
-        print "# {0}".format(name)
-        print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn "\
-              "<http://jumpgate.caltech.edu/library/> ."
-        print ""
+    return missing_lib_query.execute(model)
+
+
+def find_unscanned_submitted_libraries(model):
+    """Scan model for libraries that don't have library details loaded
+    """
+    unscanned_libraries = """
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT distinct ?submission ?library_urn
+WHERE {{
+  ?submission submissionOntology:library_urn ?library_urn .
+  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
+  FILTER(!BOUND(?library_type))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+    query = RDF.SPARQLQuery(unscanned_libraries)
+    return query.execute(model)
 
 
 def add_submission_creation_date(model, subUrn, cookie):
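
The OPTIONAL/!BOUND pattern in find_unscanned_submitted_libraries keeps only
library_urn nodes that have no rdf:type statement yet, i.e. submitted
libraries whose detail pages were never loaded. A minimal sketch of consuming
the result rows (model assumed to already be populated, as in main() above):

    for row in find_unscanned_submitted_libraries(model):
        print row['submission'], row['library_urn']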
@@ -264,15 +299,17 @@ def add_submission_creation_date(model, subUrn, cookie):
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        parse_submission_page(model, cells, subUrn)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+
 def get_creation_dates(model, subUrn):
     query = RDF.Statement(subUrn, CREATION_DATE, None)
     creation_dates = list(model.find_statements(query))
     return creation_dates
 
+
 def parse_submission_page(model, submissionTree, subUrn):
     cells = submissionTree.findall('.//td')
     dateTimeType = xsdNS['dateTime']
@@ -384,7 +421,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
                 add_stmt(model, fileNode, predicate, object)
 
 
-def load_encode_libraries(model, htswapi):
+def load_encode_assigned_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
@@ -400,22 +437,30 @@ def load_encode_libraries(model, htswapi):
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
-def load_encodedcc_files(model, base_url):
-    if base_url[-1] != '/':
-        base_url += '/'
+def load_unassigned_submitted_libraries(model):
+    unassigned = find_unscanned_submitted_libraries(model)
+    for query_record in unassigned:
+        library_urn = query_record['library_urn']
+        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
+        load_library_detail(model, library_urn)
+
+
+def load_encodedcc_files(model, genome, composite):
+    file_index = ucsc.get_encodedcc_file_index(genome, composite)
+    if file_index is None:
+        return
 
-    file_index = ucsc.get_ucsc_file_index(base_url)
     for filename, attributes in file_index.items():
-        s = RDF.Node(RDF.Uri(base_url + filename))
+        s = RDF.Node(RDF.Uri(filename))
         for name, value in attributes.items():
             p = RDF.Node(DCC_NS[name])
             o = RDF.Node(value)
             model.add_statement(RDF.Statement(s,p,o))
 
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -426,7 +471,11 @@ def load_library_detail(model, libraryUrn):
     LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
         LOGGER.info("Loading {0}".format(str(libraryUrn)))
-        rdfaParser.parse_into_model(model, libraryUrn.uri)
+        try:
+            body = get_url_as_text(str(libraryUrn.uri), 'GET')
+            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
+        except httplib2.HttpLib2ErrorWithResponse, e:
+            LOGGER.error(str(e))
     elif len(results) == 1:
         pass  # Assuming that a loaded dataset has one record
     else:
@@ -440,11 +489,15 @@ def get_library_id(name):
     '11039'
     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
     '10150'
+    >>> get_library_id('2x75-GM12892-rep2-SL2970')
+    '02970'
     """
     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
     library_id = None
     if match is not None:
         library_id = match.group('id')
+    if library_id in SL_MAP:
+        library_id = SL_MAP[library_id]
     return library_id
 
 
@@ -522,6 +575,7 @@ def get_url_as_tree(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 
 def get_url_as_text(url, method, cookie=None):
@@ -536,6 +590,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 ################
 #  old stuff
index a003594310069b5108073890abe1d99ec409f2bf..53ac287a5f0b876eb18df0638277493e7e106604 100644 (file)
@@ -12,7 +12,7 @@ wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep1.fastq.tgz  project=wgEncode; g
 class TestUCSCInfo(unittest.TestCase):
     def test_parse_encodedcc_file(self):
         stream = StringIO(ENCODE_FILES)
-        file_index = ucsc.parse_ucsc_file_index(stream)
+        file_index = ucsc.parse_ucsc_file_index(stream, 'http://example.com/files')
         self.assertEquals(len(file_index), 2)
 
         for attributes in file_index.values():
index 9181830e34983724e586a906a7194d96fc9507eb..f7734adbd972b26db9150b155a7cdeb39818e0c1 100644 (file)
@@ -1,10 +1,18 @@
 """Utilities for extracting information from the ENCODE DCC
 """
+import logging
 import urlparse
 import urllib2
 
+LOGGER = logging.getLogger(__name__)
+
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+GOLDEN_PATHS = ["http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                "{genome}/encodeDCC/{composite}/",
+                "http://hgdownload.cse.ucsc.edu/goldenPath/"\
+                "{genome}/encodeDCC/{composite}/"]
+
 
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
@@ -36,21 +44,41 @@ def submission_view_url(submission_id):
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
 
-def get_ucsc_file_index(base_url):
+def get_encodedcc_file_index(genome, composite):
     """Get index of files for a ENCODE collection
+
+    returns None on error
     """
-    if base_url[-1] != '/': base_url += '/'
-    request = urllib2.urlopen(base_url + 'files.txt')
-    file_index = parse_ucsc_file_index(request)
-    return file_index
+    err = None
+    params = {'genome': genome,
+              'composite': composite}
+
+    for path in GOLDEN_PATHS:
+        base_url = path.format(**params)
+        request_url = base_url + 'files.txt'
+
+        try:
+            request = urllib2.urlopen(request_url)
+            file_index = parse_ucsc_file_index(request, base_url)
+            return file_index
+        except urllib2.HTTPError, e:
+            err = e
+            pass
+
+    if err is not None:
+        errmsg = "get_encodedcc_file_index <{0}>: {1}"
+        LOGGER.error(errmsg.format(request_url, str(err)))
+
+    return None
 
 
-def parse_ucsc_file_index(stream):
+def parse_ucsc_file_index(stream, base_url):
     """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
     """
     file_index = {}
     for line in stream:
         filename, attribute_line = line.split('\t')
+        filename = base_url + filename
         attributes = {}
         for assignment in attribute_line.split(';'):
             name, value = assignment.split('=')
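
A minimal sketch of the new entry point (network access assumed; returns None
if neither golden path responds):

    file_index = get_encodedcc_file_index('mm9', 'wgEncodeCaltechTfbs')
    if file_index is not None:
        for url, attributes in file_index.items():
            # keys are absolute file URLs; values are the parsed
            # name=value attributes from files.txt
            print url, attributes.get('project')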