"""Utilities for extracting information from the ENCODE DCC."""
import logging
import urllib.error
import urllib.parse
import urllib.request
LOGGER = logging.getLogger(__name__)

# Base URL of the UCSC ENCODE submission pipeline; the *_url helpers in this
# module join their per-submission fragments onto it.
UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"

# URL templates for a composite's download directory, listed in the order
# they are tried.  Both take ``{genome}`` and ``{composite}`` fields.
# Adjacent string literals concatenate inside the brackets, so no backslash
# continuation is required.
GOLDEN_PATHS = [
    "http://hgdownload-test.cse.ucsc.edu/goldenPath/"
    "{genome}/encodeDCC/{composite}/",
    "http://hgdownload.cse.ucsc.edu/goldenPath/"
    "{genome}/encodeDCC/{composite}/",
]
def ddf_download_url(submission_id):
    """Return url to download a DDF for a submission

    ``submission_id`` is interpolated with ``%s``, so an int or a string
    is accepted.  The result is ``UCSCEncodePipeline`` joined with
    ``download_ddf/<submission_id>``.

    >>> ddf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_ddf/1234'
    """
    fragment = 'download_ddf/%s' % (submission_id,)
    # The base URL ends with '/', so urljoin appends the fragment instead of
    # replacing the last path component.
    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
def daf_download_url(submission_id):
    """Return url to download a DAF for a submission

    ``submission_id`` is interpolated with ``%s``, so an int or a string
    is accepted.  The result is ``UCSCEncodePipeline`` joined with
    ``download_daf/<submission_id>``.

    >>> daf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_daf/1234'
    """
    fragment = 'download_daf/%s' % (submission_id,)
    # The base URL ends with '/', so urljoin appends the fragment instead of
    # replacing the last path component.
    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
def submission_view_url(submission_id):
    """Return url to view a submission

    ``submission_id`` is interpolated with ``%s``, so an int or a string
    is accepted.  The result is ``UCSCEncodePipeline`` joined with
    ``show/<submission_id>``.

    >>> submission_view_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/show/1234'
    """
    fragment = 'show/%s' % (submission_id,)
    # The base URL ends with '/', so urljoin appends the fragment instead of
    # replacing the last path component.
    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
def get_encodedcc_file_index(genome, composite):
    """Get index of files for a ENCODE collection

    Each URL template in ``GOLDEN_PATHS`` is filled in with ``genome`` and
    ``composite`` and tried in order.  The first ``files.txt`` that can be
    fetched is parsed with ``parse_ucsc_file_index`` and returned.

    Returns the parsed index, or None when every location raised an
    ``urllib.error.HTTPError``.  Other exceptions propagate to the caller.
    """
    params = {'genome': genome,
              'composite': composite}

    for path in GOLDEN_PATHS:
        base_url = path.format(**params)
        request_url = base_url + 'files.txt'

        try:
            # The context manager closes the HTTP response once the index
            # has been fully parsed, including when parsing raises.
            with urllib.request.urlopen(request_url) as request:
                return parse_ucsc_file_index(request, base_url)
        except urllib.error.HTTPError as e:
            # Logged inside the handler because Python 3 unbinds ``e`` as
            # soon as the except block ends.
            errmsg = "get_ucsc_file_index <{0}>: {1}"
            LOGGER.error(errmsg.format(request_url, str(e)))

    # NOTE(review): every location failed; None is assumed to be the
    # intended result here -- confirm against callers.
    return None
def parse_ucsc_file_index(stream, base_url):
    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs

    ``stream`` is any iterable of lines, either ``str`` or ``bytes`` (an
    ``urlopen`` response yields bytes).  Each non-blank line is a file
    name, a tab, then ``name=value`` assignments separated by ``;``.

    Returns a dict keyed by ``base_url + filename`` whose values are dicts
    of the stripped attribute names and values.

    Raises ValueError when a non-blank line does not consist of exactly
    two tab-separated fields, or when an assignment has no ``=``.
    """
    file_index = {}
    for line in stream:
        if isinstance(line, bytes):
            # assumes files.txt is ASCII/UTF-8 text -- TODO confirm
            line = line.decode('utf-8')
        if not line.strip():
            # Tolerate blank lines such as a trailing newline at EOF.
            continue

        filename, attribute_line = line.split('\t')
        filename = base_url + filename

        attributes = {}
        for assignment in attribute_line.split(';'):
            if not assignment.strip():
                # A trailing ';' leaves an empty final chunk.
                continue
            # Split on the first '=' only so values may contain '='.
            name, value = assignment.split('=', 1)
            attributes[name.strip()] = value.strip()

        file_index[filename] = attributes
    return file_index