Try to figure out where UCSC is hosting our submitted files
author    Diane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 00:03:44 +0000 (16:03 -0800)
committer Diane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 00:03:44 +0000 (16:03 -0800)
This patch reads the download file index (files.txt) from UCSC's test
server for our tracks and adds the listed files to my ENCODE metadata
RDF database.
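
For illustration, a rough sketch of what the new parse_ucsc_file_index()
in htsworkflow/submission/ucsc.py expects from that index; the filename
and attribute values below are invented:

    # one tab-separated record per file: <filename>\t<name=value; name=value; ...>
    line = "wgEncodeCaltechRnaSeqExample.bam\tproject=wgEncode; cell=GM12878; type=bam"
    filename, attribute_line = line.split('\t')
    attributes = {}
    for assignment in attribute_line.split(';'):
        name, value = assignment.split('=')
        attributes[name.strip()] = value.strip()
    # attributes == {'project': 'wgEncode', 'cell': 'GM12878', 'type': 'bam'}
    # load_encodedcc_files() then adds one statement per attribute, roughly:
    #   <base_url + filename>  DCC_NS['cell']  "GM12878"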

In addition, there's a bug fix: the status URN wasn't being constructed
correctly; it ended up as just the timestamp instead of the submission
URN plus the timestamp.
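
To illustrate (submission id and timestamp borrowed from the new test),
urlparse.urljoin() appears to treat the timestamp as an absolute URI
because of its colons, so the old code dropped the submission part:

    import urlparse
    sub = 'http://encodesubmit.ucsc.edu/pipeline/show/5136'
    ts = '2011-12-19T12:42:53.048956'
    urlparse.urljoin(sub, ts)  # old code -> '2011-12-19T12:42:53.048956'
    sub + '/' + ts             # fixed    -> '.../show/5136/2011-12-19T12:42:53.048956'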

extra/ucsc_encode_submission/encode_find.py
extra/ucsc_encode_submission/test_encode_find.py [new file with mode: 0644]
htsworkflow/submission/ucsc.py
htsworkflow/util/rdfhelp.py

index 508291244ffea73c8cda46b7a74b875bc3ca1af3..6608a05de61154713d9e47bd8ab37b71424c520d 100644 (file)
@@ -20,7 +20,7 @@ import sys
 import urllib
 import urlparse
 
-from htsworkflow.submission import daf
+from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
@@ -36,6 +36,7 @@ from htsworkflow.util.rdfhelp import \
      rdfsNS, \
      xsdNS
 TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
 
 # URL mappings
 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
@@ -43,11 +44,11 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
+     get_ucsc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
 
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
@@ -59,7 +60,8 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -94,6 +96,15 @@ def main(cmdline=None):
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
@@ -171,7 +182,14 @@ def load_my_submissions(model, limit=None, cookie=None):
             if limit is None or submission_id in limit:
                 subUrn = RDF.Uri(submission_view_url(submission_id))
 
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
 
                 name = str(cell[4].text_content())
                 add_stmt(model, subUrn, name_n, name)
@@ -203,8 +221,6 @@ def load_my_submissions(model, limit=None, cookie=None):
                 LOGGER.info("Processed {0}".format(subUrn))
 
 
-
-
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
@@ -244,28 +260,34 @@ WHERE {{
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
+    creation_dates = get_creation_dates(model, subUrn)
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -322,7 +344,7 @@ def update_ddf(model, subUrn, statusNode, cookie):
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
 
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
     if not model.contains_statement(status_is_ddf):
         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -336,7 +358,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
+    attributes = [DCC_NS[x] for x in header]
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -355,8 +377,8 @@ def add_ddf_statements(model, statusNode, ddf_string):
                      statusNode,
                      submissionOntology['has_file'],
                      fileNode)
-            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-            add_stmt(model, fileNode, DDF_NS['filename'], f)
+            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+            add_stmt(model, fileNode, DCC_NS['filename'], f)
 
             for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
@@ -382,6 +404,18 @@ def load_encode_libraries(model, htswapi):
             load_library_detail(model, libraryUrn)
 
 
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s,p,o))
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -429,7 +463,9 @@ def get_contents(element):
 
 def create_status_node(submission_uri, timestamp):
     submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
     return RDF.Node(RDF.Uri(status_uri))
 
 
diff --git a/extra/ucsc_encode_submission/test_encode_find.py b/extra/ucsc_encode_submission/test_encode_find.py
new file mode 100644 (file)
index 0000000..98bdb46
--- /dev/null
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+print SOURCE_PATH
+
+class TestEncodeFind(unittest.TestCase):
+    def test_create_status_node_with_uri(self):
+        subURL = submission_view_url('5136')
+        submissionUri = RDF.Uri(subURL)
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_create_status_node_with_str(self):
+        subURL = submission_view_url('5136')
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(subURL, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_parse_submission_page(self):
+        timestamp = '2011-12-19T12:42:53.048956'
+        subURL = submission_view_url('5136')
+        subNode = encode_find.create_status_node(subURL, timestamp)
+        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
+        from lxml.html import parse
+        tree = parse(test_file)
+        model = get_model()
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 0)
+        encode_find.parse_submission_page(model, tree, subNode)
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 1)
+        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
+def suite():
+    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
index f80629a9cda462ac6ce8e45f58949b4ffb9037f9..9181830e34983724e586a906a7194d96fc9507eb 100644 (file)
@@ -1,7 +1,11 @@
+"""Utilities for extracting information from the ENCODE DCC
+"""
 import urlparse
+import urllib2
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -11,6 +15,7 @@ def ddf_download_url(submission_id):
     fragment = 'download_ddf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -20,6 +25,7 @@ def daf_download_url(submission_id):
     fragment = 'download_daf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def submission_view_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -28,3 +34,27 @@ def submission_view_url(submission_id):
     """
     fragment = 'show/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_ucsc_file_index(base_url):
+    """Get index of files for a ENCODE collection
+    """
+    if base_url[-1] != '/': base_url += '/'
+    request = urllib2.urlopen(base_url + 'files.txt')
+    file_index = parse_ucsc_file_index(request)
+    return file_index
+
+
+def parse_ucsc_file_index(stream):
+    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
+    """
+    file_index = {}
+    for line in stream:
+        filename, attribute_line = line.split('\t')
+        attributes = {}
+        for assignment in attribute_line.split(';'):
+            name, value = assignment.split('=')
+            attributes[name.strip()] = value.strip()
+
+        file_index[filename] = attributes
+    return file_index
index 9fcd3311ee77215fd8ae4ed732e7a22b287d4a62..8fb1424960571d4ab327ab2c4261b293eb1efb56 100644 (file)
@@ -33,7 +33,7 @@ def sparql_query(model, query_filename):
         output = []
         for k,v in row.items()[::-1]:
             print "{0}: {1}".format(k,v)
-        print 
+        print
 
 
 def blankOrUri(value=None):
@@ -99,11 +99,11 @@ def fromTypedNode(node):
 
     return literal
 
-    
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
-        
+
     if model_name is None:
         storage = RDF.MemoryStorage()
         logger.info("Using RDF Memory model")
@@ -114,12 +114,12 @@ def get_model(model_name=None, directory=None):
         logger.info("Using {0} with options {1}".format(model_name, options))
     model = RDF.Model(storage)
     return model
-        
+
 
 def load_into_model(model, parser_name, filename, ns=None):
     if not os.path.exists(filename):
         raise IOError("Can't find {0}".format(filename))
-    
+
     data = open(filename, 'r').read()
     load_string_into_model(model, parser_name, data, ns)
 
@@ -127,7 +127,7 @@ def load_into_model(model, parser_name, filename, ns=None):
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
         ns = "http://localhost/"
-        
+
     rdf_parser = RDF.Parser(name=parser_name)
     rdf_parser.parse_string_into_model(model, data, ns)
 
@@ -148,3 +148,6 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+def dump_model(model):
+    serializer = get_serializer()
+    print serializer.serialize_model_to_string(model)