From: Diane Trout
Date: Tue, 20 Dec 2011 00:03:44 +0000 (-0800)
Subject: Try to figure out where UCSC is hosting our submitted files
X-Git-Tag: v0.5.5~76^2~5
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=8b3c93dd6aff4b5b6bffd5164dc098eafbcd808b

Try to figure out where UCSC is hosting our submitted files

This patch reads the download-files index from UCSC's test server for our
tracks and adds the entries to my ENCODE metadata RDF database.

It also fixes a bug in status URN construction: the URN was being built
from the timestamp alone instead of the submission URN plus the timestamp.
---
diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
index 5082912..6608a05 100644
--- a/extra/ucsc_encode_submission/encode_find.py
+++ b/extra/ucsc_encode_submission/encode_find.py
@@ -20,7 +20,7 @@ import sys
 import urllib
 import urlparse
 
-from htsworkflow.submission import daf
+from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
@@ -36,6 +36,7 @@ from htsworkflow.util.rdfhelp import \
      rdfsNS, \
      xsdNS
 TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
 
 # URL mappings
 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
@@ -43,11 +44,11 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
+     get_ucsc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
 
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
@@ -59,7 +60,8 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -94,6 +96,15 @@ def main(cmdline=None):
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+            {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
+            {'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
+            {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
@@ -171,7 +182,14 @@ def load_my_submissions(model, limit=None, cookie=None):
 
             if limit is None or submission_id in limit:
                 subUrn = RDF.Uri(submission_view_url(submission_id))
 
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
 
                 name = str(cell[4].text_content())
                 add_stmt(model, subUrn, name_n, name)
@@ -203,8 +221,6 @@ def load_my_submissions(model, limit=None, cookie=None):
 
     LOGGER.info("Processed {0}".format(subUrn))
 
-
-
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
@@ -244,28 +260,34 @@ WHERE {{
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
+    creation_dates = get_creation_dates(model, subUrn)
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -322,7 +344,7 @@ def update_ddf(model, subUrn, statusNode, cookie):
 
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
 
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
     if not model.contains_statement(status_is_ddf):
         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -336,7 +358,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
+    attributes = [DCC_NS[x] for x in header]
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -355,8 +377,8 @@
                      statusNode,
                      submissionOntology['has_file'],
                      fileNode)
-        add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-        add_stmt(model, fileNode, DDF_NS['filename'], f)
+        add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+        add_stmt(model, fileNode, DCC_NS['filename'], f)
 
         for predicate, object in zip(attributes[1:], file_attributes):
             add_stmt(model, fileNode, predicate, object)
@@ -382,6 +404,18 @@ def load_encode_libraries(model, htswapi):
 
         load_library_detail(model, libraryUrn)
 
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s,p,o))
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -429,7 +463,9 @@ def get_contents(element):
 
 def create_status_node(submission_uri, timestamp):
     submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
     return RDF.Node(RDF.Uri(status_uri))
 
 
diff --git a/extra/ucsc_encode_submission/test_encode_find.py b/extra/ucsc_encode_submission/test_encode_find.py
new file mode 100644
index 0000000..98bdb46
--- /dev/null
+++ b/extra/ucsc_encode_submission/test_encode_find.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+print SOURCE_PATH
+
+class TestEncodeFind(unittest.TestCase):
+    def test_create_status_node_with_uri(self):
+        subURL = submission_view_url('5136')
+        submissionUri = RDF.Uri(subURL)
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_create_status_node_with_str(self):
+        subURL = submission_view_url('5136')
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(subURL, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_parse_submission_page(self):
+        timestamp = '2011-12-19T12:42:53.048956'
+        subURL = submission_view_url('5136')
+        subNode = encode_find.create_status_node(subURL, timestamp)
+        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
+        from lxml.html import parse
+        tree = parse(test_file)
+        model = get_model()
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 0)
+        encode_find.parse_submission_page(model, tree, subNode)
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 1)
+        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
+def suite():
+    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/htsworkflow/submission/ucsc.py b/htsworkflow/submission/ucsc.py
index f80629a..9181830 100644
--- a/htsworkflow/submission/ucsc.py
+++ b/htsworkflow/submission/ucsc.py
@@ -1,7 +1,11 @@
+"""Utilities for extracting information from the ENCODE DCC
+"""
 import urlparse
+import urllib2
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -11,6 +15,7 @@ def ddf_download_url(submission_id):
     fragment = 'download_ddf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -20,6 +25,7 @@ def daf_download_url(submission_id):
     fragment = 'download_daf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def submission_view_url(submission_id):
     """Return url to view a submission
 
@@ -28,3 +34,27 @@
     """
     fragment = 'show/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_ucsc_file_index(base_url):
+    """Get the index of files for an ENCODE collection
+    """
+    if base_url[-1] != '/': base_url += '/'
+    request = urllib2.urlopen(base_url + 'files.txt')
+    file_index = parse_ucsc_file_index(request)
+    return file_index
+
+
+def parse_ucsc_file_index(stream):
+    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
+    """
+    file_index = {}
+    for line in stream:
+        filename, attribute_line = line.split('\t')
+        attributes = {}
+        for assignment in attribute_line.split(';'):
+            name, value = assignment.split('=')
+            attributes[name.strip()] = value.strip()
+
+        file_index[filename] = attributes
+    return file_index
diff --git a/htsworkflow/util/rdfhelp.py b/htsworkflow/util/rdfhelp.py
index 9fcd331..8fb1424 100644
--- a/htsworkflow/util/rdfhelp.py
+++ b/htsworkflow/util/rdfhelp.py
@@ -33,7 +33,7 @@ def sparql_query(model, query_filename):
         output = []
         for k,v in row.items()[::-1]:
             print "{0}: {1}".format(k,v)
-        print 
+        print
 
 
 def blankOrUri(value=None):
@@ -99,11 +99,11 @@ def fromTypedNode(node):
     return literal
 
 
-    
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
-        
+
     if model_name is None:
         storage = RDF.MemoryStorage()
         logger.info("Using RDF Memory model")
@@ -114,12 +114,12 @@
         logger.info("Using {0} with options {1}".format(model_name, options))
         model = RDF.Model(storage)
     return model
-    
+
 
 def load_into_model(model, parser_name, filename, ns=None):
     if not os.path.exists(filename):
         raise IOError("Can't find {0}".format(filename))
-    
+
     data = open(filename, 'r').read()
     load_string_into_model(model, parser_name, data, ns)
 
@@ -127,7 +127,7 @@ def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
         ns = "http://localhost/"
-    
+
     rdf_parser = RDF.Parser(name=parser_name)
     rdf_parser.parse_string_into_model(model, data, ns)
 
@@ -148,3 +148,6 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+def dump_model(model):
+    serializer = get_serializer()
+    print serializer.serialize_model_to_string(model)
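
A files.txt listing under goldenPath/{genome}/encodeDCC/{composite}/ has one
file per line: a filename, a tab, then semicolon-separated name=value
attributes. A minimal sketch of the new parser, with this patch applied; the
filename and attribute names below are invented for illustration:

    from StringIO import StringIO
    from htsworkflow.submission.ucsc import parse_ucsc_file_index

    # One made-up files.txt line: filename, tab, "name=value; ..." pairs.
    listing = StringIO("wgEncodeCaltechRnaSeqExample.bam\t"
                       "project=wgEncode; lab=Caltech; type=bam\n")
    index = parse_ucsc_file_index(listing)
    print index['wgEncodeCaltechRnaSeqExample.bam']['lab']
    # Caltech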
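
The status-URN bug fixed in create_status_node, sketched with the submission
id used by the new tests: the urlparse shipped with Python 2 misreads the
colons in an ISO timestamp as a URL scheme, so the old urljoin call returned
just the timestamp.

    import urlparse

    subUrl = 'http://encodesubmit.ucsc.edu/pipeline/show/5136'
    timestamp = '2011-12-19T12:42:53.048956'

    # old: urljoin treats "2011-12-19T12" as a scheme and returns the
    # timestamp unchanged, dropping the submission URN entirely
    print urlparse.urljoin(subUrl, timestamp)   # 2011-12-19T12:42:53.048956

    # new: append the timestamp as a path segment under the submission URN
    print subUrl + '/' + timestamp
    # http://encodesubmit.ucsc.edu/pipeline/show/5136/2011-12-19T12:42:53.048956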