Try to figure out where UCSC is hosting our submitted files
author    Diane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 00:03:44 +0000 (16:03 -0800)
committer Diane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 00:03:44 +0000 (16:03 -0800)
This patch reads the download file index (files.txt) from UCSC's test
server for our tracks and adds the listed files to my ENCODE metadata
RDF database.
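
For illustration, a rough sketch of what the new parse_ucsc_file_index()
in htsworkflow/submission/ucsc.py expects from that index; the filename
and attribute values below are invented:

    # one tab-separated record per file: <filename>\t<name=value; name=value; ...>
    line = "wgEncodeCaltechRnaSeqExample.bam\tproject=wgEncode; cell=GM12878; type=bam"
    filename, attribute_line = line.split('\t')
    attributes = {}
    for assignment in attribute_line.split(';'):
        name, value = assignment.split('=')
        attributes[name.strip()] = value.strip()
    # attributes == {'project': 'wgEncode', 'cell': 'GM12878', 'type': 'bam'}
    # load_encodedcc_files() then adds one statement per attribute, roughly:
    #   <base_url + filename>  DCC_NS['cell']  "GM12878"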

In addition, there's a bug fix: the status URN wasn't being constructed
correctly; it ended up as just the timestamp instead of the submission
URN plus the timestamp.
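
To illustrate (submission id and timestamp borrowed from the new test),
urlparse.urljoin() appears to treat the timestamp as an absolute URI
because of its colons, so the old code dropped the submission part:

    import urlparse
    sub = 'http://encodesubmit.ucsc.edu/pipeline/show/5136'
    ts = '2011-12-19T12:42:53.048956'
    urlparse.urljoin(sub, ts)  # old code -> '2011-12-19T12:42:53.048956'
    sub + '/' + ts             # fixed    -> '.../show/5136/2011-12-19T12:42:53.048956'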

extra/ucsc_encode_submission/encode_find.py
extra/ucsc_encode_submission/test_encode_find.py [new file with mode: 0644]
htsworkflow/submission/ucsc.py
htsworkflow/util/rdfhelp.py

index 508291244ffea73c8cda46b7a74b875bc3ca1af3..6608a05de61154713d9e47bd8ab37b71424c520d 100644 (file)
@@ -20,7 +20,7 @@ import sys
 import urllib
 import urlparse
 
-from htsworkflow.submission import daf
+from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
@@ -36,6 +36,7 @@ from htsworkflow.util.rdfhelp import \
      rdfsNS, \
      xsdNS
 TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
 
 # URL mappings
 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
@@ -43,11 +44,11 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
+     get_ucsc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
 
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
@@ -59,7 +60,8 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -94,6 +96,15 @@ def main(cmdline=None):
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
@@ -171,7 +182,14 @@ def load_my_submissions(model, limit=None, cookie=None):
             if limit is None or submission_id in limit:
                 subUrn = RDF.Uri(submission_view_url(submission_id))
 
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
 
                 name = str(cell[4].text_content())
                 add_stmt(model, subUrn, name_n, name)
@@ -203,8 +221,6 @@ def load_my_submissions(model, limit=None, cookie=None):
                 LOGGER.info("Processed {0}".format(subUrn))
 
 
-
-
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
@@ -244,28 +260,34 @@ WHERE {{
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
+    creation_dates = get_creation_dates(model, subUrn)
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -322,7 +344,7 @@ def update_ddf(model, subUrn, statusNode, cookie):
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
 
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
     if not model.contains_statement(status_is_ddf):
         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -336,7 +358,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
+    attributes = [DCC_NS[x] for x in header]
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -355,8 +377,8 @@ def add_ddf_statements(model, statusNode, ddf_string):
                      statusNode,
                      submissionOntology['has_file'],
                      fileNode)
-            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-            add_stmt(model, fileNode, DDF_NS['filename'], f)
+            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+            add_stmt(model, fileNode, DCC_NS['filename'], f)
 
             for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
@@ -382,6 +404,18 @@ def load_encode_libraries(model, htswapi):
             load_library_detail(model, libraryUrn)
 
 
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s,p,o))
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -429,7 +463,9 @@ def get_contents(element):
 
 def create_status_node(submission_uri, timestamp):
     submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
     return RDF.Node(RDF.Uri(status_uri))
 
 
diff --git a/extra/ucsc_encode_submission/test_encode_find.py b/extra/ucsc_encode_submission/test_encode_find.py
new file mode 100644 (file)
index 0000000..98bdb46
--- /dev/null
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+print SOURCE_PATH
+
+class TestEncodeFind(unittest.TestCase):
+    def test_create_status_node_with_uri(self):
+        subURL = submission_view_url('5136')
+        submissionUri = RDF.Uri(subURL)
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_create_status_node_with_str(self):
+        subURL = submission_view_url('5136')
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(subURL, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_parse_submission_page(self):
+        timestamp = '2011-12-19T12:42:53.048956'
+        subURL = submission_view_url('5136')
+        subNode = encode_find.create_status_node(subURL, timestamp)
+        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
+        from lxml.html import parse
+        tree = parse(test_file)
+        model = get_model()
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 0)
+        encode_find.parse_submission_page(model, tree, subNode)
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 1)
+        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
+def suite():
+    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
index f80629a9cda462ac6ce8e45f58949b4ffb9037f9..9181830e34983724e586a906a7194d96fc9507eb 100644 (file)
@@ -1,7 +1,11 @@
+"""Utilities for extracting information from the ENCODE DCC
+"""
 import urlparse
+import urllib2
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -11,6 +15,7 @@ def ddf_download_url(submission_id):
     fragment = 'download_ddf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -20,6 +25,7 @@ def daf_download_url(submission_id):
     fragment = 'download_daf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def submission_view_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -28,3 +34,27 @@ def submission_view_url(submission_id):
     """
     fragment = 'show/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_ucsc_file_index(base_url):
+    """Get index of files for a ENCODE collection
+    """
+    if base_url[-1] != '/': base_url += '/'
+    request = urllib2.urlopen(base_url + 'files.txt')
+    file_index = parse_ucsc_file_index(request)
+    return file_index
+
+
+def parse_ucsc_file_index(stream):
+    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
+    """
+    file_index = {}
+    for line in stream:
+        filename, attribute_line = line.split('\t')
+        attributes = {}
+        for assignment in attribute_line.split(';'):
+            name, value = assignment.split('=')
+            attributes[name.strip()] = value.strip()
+
+        file_index[filename] = attributes
+    return file_index
index 9fcd3311ee77215fd8ae4ed732e7a22b287d4a62..8fb1424960571d4ab327ab2c4261b293eb1efb56 100644 (file)
@@ -33,7 +33,7 @@ def sparql_query(model, query_filename):
         output = []
         for k,v in row.items()[::-1]:
             print "{0}: {1}".format(k,v)
-        print 
+        print
 
 
 def blankOrUri(value=None):
@@ -99,11 +99,11 @@ def fromTypedNode(node):
 
     return literal
 
-    
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
-        
+
     if model_name is None:
         storage = RDF.MemoryStorage()
         logger.info("Using RDF Memory model")
@@ -114,12 +114,12 @@ def get_model(model_name=None, directory=None):
         logger.info("Using {0} with options {1}".format(model_name, options))
     model = RDF.Model(storage)
     return model
-        
+
 
 def load_into_model(model, parser_name, filename, ns=None):
     if not os.path.exists(filename):
         raise IOError("Can't find {0}".format(filename))
-    
+
     data = open(filename, 'r').read()
     load_string_into_model(model, parser_name, data, ns)
 
@@ -127,7 +127,7 @@ def load_into_model(model, parser_name, filename, ns=None):
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
         ns = "http://localhost/"
-        
+
     rdf_parser = RDF.Parser(name=parser_name)
     rdf_parser.parse_string_into_model(model, data, ns)
 
@@ -148,3 +148,6 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+def dump_model(model):
+    serializer = get_serializer()
+    print serializer.serialize_model_to_string(model)