Initial port to python3
[htsworkflow.git] / htsworkflow / submission / ucsc.py
index f80629a9cda462ac6ce8e45f58949b4ffb9037f9..3fd83484096858207be4a7e056a56632ae049bb7 100644 (file)
@@ -1,7 +1,19 @@
-import urlparse
+"""Utilities for extracting information from the ENCODE DCC
+"""
+import logging
+import urllib.parse
+import urllib.request, urllib.error, urllib.parse
+
+LOGGER = logging.getLogger(__name__)
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+GOLDEN_PATHS = ["http://hgdownload-test.cse.ucsc.edu/goldenPath/"
+                "{genome}/encodeDCC/{composite}/",
+                "http://hgdownload.cse.ucsc.edu/goldenPath/"
+                "{genome}/encodeDCC/{composite}/"]
+
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -9,7 +21,8 @@ def ddf_download_url(submission_id):
     'http://encodesubmit.ucsc.edu/pipeline/download_ddf/1234'
     """
     fragment = 'download_ddf/%s' % (submission_id,)
-    return urlparse.urljoin(UCSCEncodePipeline, fragment)
+    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
+
 
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
@@ -18,7 +31,8 @@ def daf_download_url(submission_id):
     'http://encodesubmit.ucsc.edu/pipeline/download_daf/1234'
     """
     fragment = 'download_daf/%s' % (submission_id,)
-    return urlparse.urljoin(UCSCEncodePipeline, fragment)
+    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
+
 
 def submission_view_url(submission_id):
     """Return url to download a DAF for a submission
@@ -27,4 +41,48 @@ def submission_view_url(submission_id):
     'http://encodesubmit.ucsc.edu/pipeline/show/1234'
     """
     fragment = 'show/%s' % (submission_id,)
-    return urlparse.urljoin(UCSCEncodePipeline, fragment)
+    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_encodedcc_file_index(genome, composite):
+    """Get index of files for a ENCODE collection
+
+    Tries each GOLDEN_PATHS mirror in order and parses the first
+    files.txt that downloads; returns None if every mirror fails.
+    """
+    err = None
+    params = {'genome': genome,
+              'composite': composite}
+
+    for path in GOLDEN_PATHS:
+        base_url = path.format(**params)
+        request_url = base_url + 'files.txt'
+        try:
+            # the with block closes the HTTP response once parsed
+            with urllib.request.urlopen(request_url) as request:
+                return parse_ucsc_file_index(request, base_url)
+        except urllib.error.HTTPError as e:
+            # python3 unbinds 'e' after this clause; keep it in err
+            err = e
+
+    if err is not None:
+        errmsg = "get_ucsc_file_index <{0}>: {1}"
+        LOGGER.error(errmsg.format(request_url, str(err)))
+
+    return None
+
+
+def parse_ucsc_file_index(stream, base_url):
+    """Map each file URL in a UCSC DCC files.txt to its attribute dict
+
+    Accepts str or bytes lines: "filename<TAB>name=value; name=value".
+    """
+    file_index = {}
+    for line in stream:
+        if isinstance(line, bytes):
+            line = line.decode('utf-8')  # urlopen() yields bytes in py3
+        filename, attribute_line = line.split('\t')
+        pairs = (item.split('=') for item in attribute_line.split(';'))
+        file_index[base_url + filename] = {
+            name.strip(): value.strip() for name, value in pairs}
+    return file_index