Try to figure out where UCSC is hosting our submitted files
diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
index 508291244ffea73c8cda46b7a74b875bc3ca1af3..6608a05de61154713d9e47bd8ab37b71424c520d 100644
--- a/extra/ucsc_encode_submission/encode_find.py
+++ b/extra/ucsc_encode_submission/encode_find.py
@@ -20,7 +20,7 @@ import sys
 import urllib
 import urlparse
 
-from htsworkflow.submission import daf
+from htsworkflow.submission import daf, ucsc
 
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
@@ -36,6 +36,7 @@ from htsworkflow.util.rdfhelp import \
      rdfsNS, \
      xsdNS
 TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
 
 # URL mappings
 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
@@ -43,11 +44,11 @@ LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
 from htsworkflow.submission.ucsc import \
      daf_download_url, \
      ddf_download_url, \
+     get_ucsc_file_index, \
      submission_view_url, \
      UCSCEncodePipeline
 
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
 
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
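The DOWNLOAD_DDF/DDF_NS pair collapses into a single DCC_NS namespace rooted at the pipeline's download_ddf URL. A minimal sketch of what the namespace yields, assuming UCSCEncodePipeline expands to the submission host's base URL (the same host as USER_URL below; the actual value lives in htsworkflow.submission.ucsc):

    # hypothetical expansion, assuming
    # UCSCEncodePipeline == 'http://encodesubmit.ucsc.edu/pipeline/'
    DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
    DCC_NS['filename']
    # -> node for <http://encodesubmit.ucsc.edu/pipeline/download_ddf#filename>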
@@ -59,7 +60,10 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
-
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
 def main(cmdline=None):
     """
     Parse command line arguments
@@ -94,6 +96,15 @@ def main(cmdline=None):
         cookie = login(cookie=cookie)
         load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
+            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHist'},
+            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHistone'},
+            {'genome': 'mm9',  'composite': 'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
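Each track_info dict is splatted into the GOLDEN_PATH_TEST template, so the loop probes one hgdownload-test directory per composite:

    GOLDEN_PATH_TEST.format(genome='mm9', composite='wgEncodeCaltechTfbs')
    # -> 'http://hgdownload-test.cse.ucsc.edu/goldenPath/mm9/encodeDCC/wgEncodeCaltechTfbs/'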
@@ -171,7 +182,14 @@ def load_my_submissions(model, limit=None, cookie=None):
             if limit is None or submission_id in limit:
                 subUrn = RDF.Uri(submission_view_url(submission_id))
 
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
 
                 name = str(cell[4].text_content())
                 add_stmt(model, subUrn, name_n, name)
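The new DCC_NS['subId'] triple stores each submission's numeric id as a plain literal on the submission node. A sketch of reading the ids back out of the model, using the same find_statements pattern this file already relies on:

    # sketch: list every submission id recorded in the model
    query = RDF.Statement(None, DCC_NS['subId'], None)
    for statement in model.find_statements(query):
        print statement.subject, statement.object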
@@ -203,8 +221,6 @@ def load_my_submissions(model, limit=None, cookie=None):
                 LOGGER.info("Processed {0}".format(subUrn))
 
 
-
-
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
@@ -244,28 +260,36 @@ WHERE {{
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
+    creation_dates = get_creation_dates(model, subUrn)
     if len(creation_dates) == 0:
         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
     else:
         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
 
+
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
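parse_submission_page now owns the screen-scrape: it finds the <td> whose text starts with 'Created' and hands the next cell to get_date_contents. A minimal sketch of the markup it expects, assuming get_url_as_tree returns an lxml tree (the date value is hypothetical):

    from lxml import html

    tree = html.fromstring(
        '<table><tr><td>Created: </td><td>2011-05-02 14:02</td></tr></table>')
    cells = tree.findall('.//td')
    created = [x for x in cells if x.text_content().startswith('Created')]
    date_cell = created[0].getnext()  # the sibling <td> holding the timestamp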
@@ -322,7 +344,7 @@ def update_ddf(model, subUrn, statusNode, cookie):
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
 
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
     if not model.contains_statement(status_is_ddf):
         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -336,7 +358,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
+    attributes = [DCC_NS[x] for x in header]
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -355,8 +377,8 @@ def add_ddf_statements(model, statusNode, ddf_string):
                      statusNode,
                      submissionOntology['has_file'],
                      fileNode)
-            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-            add_stmt(model, fileNode, DDF_NS['filename'], f)
+            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+            add_stmt(model, fileNode, DCC_NS['filename'], f)
 
             for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
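The first header token names the file column and the rest become DCC_NS predicates, so a hypothetical two-column DDF such as

    files                            view
    wgEncodeCaltechRnaSeqRep1.fastq  Fastq

produces attributes [DCC_NS['files'], DCC_NS['view']]; each filename then gets rdf:type DCC_NS['file'], a DCC_NS['filename'] literal, and one statement per remaining column.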
@@ -382,6 +404,19 @@ def load_encode_libraries(model, htswapi):
             load_library_detail(model, libraryUrn)
 
 
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s, p, o))
+
+
 def load_library_detail(model, libraryUrn):
     """Grab detail information from library page
     """
@@ -429,7 +463,9 @@ def get_contents(element):
 
 def create_status_node(submission_uri, timestamp):
     submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
     return RDF.Node(RDF.Uri(status_uri))
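The urlparse.urljoin call was dropped, presumably because urljoin resolves the timestamp against the parent of a URI that lacks a trailing slash, silently discarding the final path segment (URL illustrative):

    >>> import urlparse
    >>> urlparse.urljoin('http://encodesubmit.ucsc.edu/pipeline/show/1234',
    ...                  '2011-05-02T14:02:00')
    'http://encodesubmit.ucsc.edu/pipeline/show/2011-05-02T14:02:00'

Ensuring a trailing '/' and concatenating keeps the submission id in the status URI.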