Added scanning for libraries that aren't attached to an encode affiliation.
author    Diane Trout <diane@caltech.edu>  Wed, 25 Jan 2012 19:47:46 +0000 (11:47 -0800)
committer Diane Trout <diane@caltech.edu>  Wed, 25 Jan 2012 19:47:46 +0000 (11:47 -0800)
Also, I'm now checking both the test and live servers for goldenPath links,
and I added a hack to convert SL-style library IDs such as SL2971 to 02971.
Lastly, I brought back updating specific data sources individually, in
addition to updating everything with --update.
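
For example (hypothetical invocations; option names as defined in
make_parser in the diff below):

    encode_find.py --update                  # run all three updates
    encode_find.py --update-submission       # only refresh submission status from UCSC
    encode_find.py --update-libraries        # only refresh library detail from htsw
    encode_find.py --update-ucsc-downloads   # only refresh encodeDCC download locations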

encode_submission/encode_find.py
htsworkflow/submission/test/test_ucsc.py
htsworkflow/submission/ucsc.py

index 6608a05de61154713d9e47bd8ab37b71424c520d..14d5bdfe8e3b900ce81ab37418d9757f661cf98a 100644 (file)
@@ -44,7 +44,7 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
-     get_ucsc_file_index, \
+     get_encodedcc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
@@ -60,8 +60,10 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
-                   "{genome}/encodeDCC/{composite}/"
+SL_MAP = {'SL2970': '02970',
+          'SL2971': '02971',
+          'SL2973': '02973',}
+
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -76,12 +78,14 @@ def main(cmdline=None):
         logging.basicConfig(level=logging.DEBUG)
     elif opts.verbose:
         logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.ERROR)
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
 
     cookie = None
-    model = get_model(opts.load_model, DBDIR)
+    model = get_model(opts.model, DBDIR)
 
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
@@ -93,24 +97,33 @@ def main(cmdline=None):
         limit = args
 
     if opts.update:
+        opts.update_submission = True
+        opts.update_libraries = True
+        opts.update_ucsc_downloads = True
+
+    if opts.update_submission:
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
-        load_encode_libraries(model, htswapi)
+
+    if opts.update_libraries:
+        load_encode_assigned_libraries(model, htswapi)
+        load_unassigned_submitted_libraries(model)
+
+    if opts.update_ucsc_downloads:
         our_tracks = [
             {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
-            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            #{'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
             {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
         ]
         for track_info in our_tracks:
-            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
-
+            load_encodedcc_files(model, **track_info)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
 
     if opts.find_submission_with_no_library:
-        find_submissions_with_no_library(model)
+        report_submissions_with_no_library(model)
 
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
@@ -122,21 +135,23 @@ def make_parser():
     """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
-    commands.add_option('--load-model', default=None,
+    commands.add_option('--model', default=None,
       help="Load model database")
     commands.add_option('--load-rdf', default=None,
       help="load rdf statements into model")
     commands.add_option('--print-rdf', action="store_true", default=False,
       help="print ending model state")
     commands.add_option('--update', action="store_true", default=False,
-      help="Query remote data sources and update our database")
-    #commands.add_option('--update-ucsc-status', default=None,
-    #  help="download status from ucsc, requires filename for extra rules")
-    #commands.add_option('--update-ddfs', action="store_true", default=False,
-    #  help="download ddf information for known submission")
-    #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, "\
-    #       "requires filename for extra rules")
+      help="Do all updates")
+    commands.add_option('--update-submission', action="store_true",
+                        default=False,
+      help="download status from ucsc")
+    commands.add_option('--update-ucsc-downloads', action="store_true",
+                        default=False,
+      help="Update download locations from UCSC")
+    commands.add_option('--update-libraries', action="store_true",
+                        default=False,
+      help="download library info from htsw")
     parser.add_option_group(commands)
 
     queries = OptionGroup(parser, "Queries")
@@ -234,6 +249,17 @@ def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
         LOGGER.debug("Found: {0}".format(str(query)))
 
 
+def report_submissions_with_no_library(model):
+    missing = find_submissions_with_no_library(model)
+    for row in missing:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn "\
+              "<http://jumpgate.caltech.edu/library/> ."
+        print ""
+
 def find_submissions_with_no_library(model):
     missing_lib_query_text = """
 PREFIX submissionOntology:<{submissionOntology}>
@@ -247,15 +273,24 @@ WHERE {{
 }}""".format(submissionOntology=submissionOntology[''].uri)
     missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
 
-    results = missing_lib_query.execute(model)
-    for row in results:
-        subid = row['subid']
-        name = row['name']
-        print "# {0}".format(name)
-        print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn "\
-              "<http://jumpgate.caltech.edu/library/> ."
-        print ""
+    return missing_lib_query.execute(model)
+
+
+def find_unscanned_submitted_libraries(model):
+    """Scan model for libraries that don't have library details loaded
+    """
+    unscanned_libraries = """
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT distinct ?submission ?library_urn
+WHERE {{
+  ?submission submissionOntology:library_urn ?library_urn .
+  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
+  FILTER(!BOUND(?library_type))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+    query = RDF.SPARQLQuery(unscanned_libraries)
+    return query.execute(model)
 
 
 def add_submission_creation_date(model, subUrn, cookie):
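
The OPTIONAL/!BOUND pattern in find_unscanned_submitted_libraries keeps only
library_urn nodes that have no rdf:type statement yet, i.e. submitted
libraries whose detail pages were never loaded. A minimal sketch of consuming
the result rows (model assumed to already be populated, as in main() above):

    for row in find_unscanned_submitted_libraries(model):
        print row['submission'], row['library_urn']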
@@ -264,15 +299,17 @@ def add_submission_creation_date(model, subUrn, cookie):
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        parse_submission_page(model, cells, subUrn)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+
 def get_creation_dates(model, subUrn):
     query = RDF.Statement(subUrn, CREATION_DATE, None)
     creation_dates = list(model.find_statements(query))
     return creation_dates
 
+
 def parse_submission_page(model, submissionTree, subUrn):
     cells = submissionTree.findall('.//td')
     dateTimeType = xsdNS['dateTime']
@@ -384,7 +421,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
                 add_stmt(model, fileNode, predicate, object)
 
 
-def load_encode_libraries(model, htswapi):
+def load_encode_assigned_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
@@ -400,22 +437,30 @@ def load_encode_libraries(model, htswapi):
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
-def load_encodedcc_files(model, base_url):
-    if base_url[-1] != '/':
-        base_url += '/'
+def load_unassigned_submitted_libraries(model):
+    unassigned = find_unscanned_submitted_libraries(model)
+    for query_record in unassigned:
+        library_urn = query_record['library_urn']
+        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
+        load_library_detail(model, library_urn)
+
+
+def load_encodedcc_files(model, genome, composite):
+    file_index = ucsc.get_encodedcc_file_index(genome, composite)
+    if file_index is None:
+        return
 
-    file_index = ucsc.get_ucsc_file_index(base_url)
     for filename, attributes in file_index.items():
-        s = RDF.Node(RDF.Uri(base_url + filename))
+        s = RDF.Node(RDF.Uri(filename))
         for name, value in attributes.items():
             p = RDF.Node(DCC_NS[name])
             o = RDF.Node(value)
             model.add_statement(RDF.Statement(s,p,o))
 
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -426,7 +471,11 @@ def load_library_detail(model, libraryUrn):
     LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
         LOGGER.info("Loading {0}".format(str(libraryUrn)))
-        rdfaParser.parse_into_model(model, libraryUrn.uri)
+        try:
+            body = get_url_as_text(str(libraryUrn.uri), 'GET')
+            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
+        except httplib2.HttpLib2ErrorWithResponse, e:
+            LOGGER.error(str(e))
     elif len(results) == 1:
         pass  # Assuming that a loaded dataset has one record
     else:
@@ -440,11 +489,15 @@ def get_library_id(name):
     '11039'
     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
     '10150'
+    >>> get_library_id('2x75-GM12892-rep2-SL2970')
+    '02970'
     """
     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
     library_id = None
     if match is not None:
         library_id = match.group('id')
+    if library_id in SL_MAP:
+        library_id = SL_MAP[library_id]
     return library_id
 
 
@@ -522,6 +575,7 @@ def get_url_as_tree(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 
 def get_url_as_text(url, method, cookie=None):
@@ -536,6 +590,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+        raise e
 
 ################
 #  old stuff
index a003594310069b5108073890abe1d99ec409f2bf..53ac287a5f0b876eb18df0638277493e7e106604 100644 (file)
@@ -12,7 +12,7 @@ wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep1.fastq.tgz  project=wgEncode; g
 class TestUCSCInfo(unittest.TestCase):
     def test_parse_encodedcc_file(self):
         stream = StringIO(ENCODE_FILES)
-        file_index = ucsc.parse_ucsc_file_index(stream)
+        file_index = ucsc.parse_ucsc_file_index(stream, 'http://example.com/files')
         self.assertEquals(len(file_index), 2)
 
         for attributes in file_index.values():
index 9181830e34983724e586a906a7194d96fc9507eb..f7734adbd972b26db9150b155a7cdeb39818e0c1 100644 (file)
@@ -1,10 +1,18 @@
 """Utilities for extracting information from the ENCODE DCC
 """
+import logging
 import urlparse
 import urllib2
 
+LOGGER = logging.getLogger(__name__)
+
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+GOLDEN_PATHS = ["http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                "{genome}/encodeDCC/{composite}/",
+                "http://hgdownload.cse.ucsc.edu/goldenPath/"\
+                "{genome}/encodeDCC/{composite}/"]
+
 
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
@@ -36,21 +44,41 @@ def submission_view_url(submission_id):
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
 
-def get_ucsc_file_index(base_url):
+def get_encodedcc_file_index(genome, composite):
     """Get index of files for a ENCODE collection
+
+    returns None on error
     """
-    if base_url[-1] != '/': base_url += '/'
-    request = urllib2.urlopen(base_url + 'files.txt')
-    file_index = parse_ucsc_file_index(request)
-    return file_index
+    err = None
+    params = {'genome': genome,
+              'composite': composite}
+
+    for path in GOLDEN_PATHS:
+        base_url = path.format(**params)
+        request_url = base_url + 'files.txt'
+
+        try:
+            request = urllib2.urlopen(request_url)
+            file_index = parse_ucsc_file_index(request, base_url)
+            return file_index
+        except urllib2.HTTPError, e:
+            err = e
+            pass
+
+    if err is not None:
+        errmsg = "get_encodedcc_file_index <{0}>: {1}"
+        LOGGER.error(errmsg.format(request_url, str(err)))
+
+    return None
 
 
-def parse_ucsc_file_index(stream):
+def parse_ucsc_file_index(stream, base_url):
     """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
     """
     file_index = {}
     for line in stream:
         filename, attribute_line = line.split('\t')
+        filename = base_url + filename
         attributes = {}
         for assignment in attribute_line.split(';'):
             name, value = assignment.split('=')
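
A minimal sketch of the new entry point (network access assumed; returns None
if neither golden path responds):

    file_index = get_encodedcc_file_index('mm9', 'wgEncodeCaltechTfbs')
    if file_index is not None:
        for url, attributes in file_index.items():
            # keys are absolute file URLs; values are the parsed
            # name=value attributes from files.txt
            print url, attributes.get('project')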