Initial port to python3
[htsworkflow.git] / htsworkflow / submission / ucsc.py
1 """Utilities for extracting information from the ENCODE DCC
2 """
3 import logging
4 import urllib.parse
5 import urllib.request, urllib.error, urllib.parse
6
LOGGER = logging.getLogger(__name__)

# Base URL that the submission helper functions in this module join
# their relative paths onto.
UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"

# URL templates for an ENCODE composite's download directory.  The
# ``{genome}`` and ``{composite}`` fields are filled in with str.format();
# get_encodedcc_file_index() tries the entries in the order listed here.
GOLDEN_PATHS = [
    ("http://hgdownload-test.cse.ucsc.edu/goldenPath/"
     "{genome}/encodeDCC/{composite}/"),
    ("http://hgdownload.cse.ucsc.edu/goldenPath/"
     "{genome}/encodeDCC/{composite}/"),
]
15
16
def ddf_download_url(submission_id):
    """Build the pipeline URL from which a submission's DDF is downloaded

    The submission id is converted with str() and appended to the
    ``download_ddf/`` path below UCSCEncodePipeline.

    >>> ddf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_ddf/1234'
    """
    relative_path = 'download_ddf/' + str(submission_id)
    return urllib.parse.urljoin(UCSCEncodePipeline, relative_path)
25
26
def daf_download_url(submission_id):
    """Build the pipeline URL from which a submission's DAF is downloaded

    The submission id is converted with str() and appended to the
    ``download_daf/`` path below UCSCEncodePipeline.

    >>> daf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_daf/1234'
    """
    relative_path = 'download_daf/' + str(submission_id)
    return urllib.parse.urljoin(UCSCEncodePipeline, relative_path)
35
36
def submission_view_url(submission_id):
    """Return url of the page showing a submission

    Unlike the ``download_*`` helpers this points at the pipeline's
    ``show/`` path rather than at a DDF or DAF download.

    >>> submission_view_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/show/1234'
    """
    fragment = 'show/%s' % (submission_id,)
    return urllib.parse.urljoin(UCSCEncodePipeline, fragment)
45
46
def get_encodedcc_file_index(genome, composite):
    """Get index of files for a ENCODE collection

    Each URL template in GOLDEN_PATHS is filled in with ``genome`` and
    ``composite`` and tried in order; the index parsed from the first
    ``files.txt`` that can be fetched is returned.

    Returns the dictionary built by parse_ucsc_file_index, or None when
    every request failed with an HTTP error.  In that case the last URL
    tried and its error are logged.
    """
    last_error = None
    request_url = None
    params = {'genome': genome,
              'composite': composite}

    for path in GOLDEN_PATHS:
        base_url = path.format(**params)
        request_url = base_url + 'files.txt'

        try:
            # The context manager closes the HTTP response once the index
            # has been parsed (or if parsing raises).
            with urllib.request.urlopen(request_url) as response:
                # urlopen yields bytes lines while the parser splits on
                # str separators, so decode before handing the lines over.
                # NOTE(review): assumes files.txt is UTF-8/ASCII -- confirm.
                lines = (raw.decode('utf-8') for raw in response)
                return parse_ucsc_file_index(lines, base_url)
        except urllib.error.HTTPError as e:
            # The name bound by ``as`` is deleted when this clause ends,
            # so keep our own reference for the log message below.
            last_error = e

    if last_error is not None:
        errmsg = "get_ucsc_file_index <{0}>: {1}"
        LOGGER.error(errmsg.format(request_url, str(last_error)))

    return None
73
74
def parse_ucsc_file_index(stream, base_url):
    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs

    ``stream`` is an iterable of lines, each of the form
    ``<filename><TAB><name>=<value>; <name>=<value>; ...``.  Lines may be
    str or bytes; bytes are decoded as UTF-8.  Blank lines are skipped.

    Returns a dictionary mapping ``base_url + filename`` to a dictionary
    of that file's attributes, with names and values stripped of
    surrounding whitespace.

    Raises ValueError for a non-blank line that has no tab separator or
    for a non-empty attribute that has no ``=``.
    """
    file_index = {}
    for line in stream:
        if isinstance(line, bytes):
            # e.g. lines read straight from urllib.request.urlopen()
            line = line.decode('utf-8')
        if not line.strip():
            continue

        # Split only on the first tab so the unpacking cannot be broken
        # by further tabs inside the attribute text.
        filename, attribute_line = line.split('\t', 1)
        filename = base_url + filename

        attributes = {}
        for assignment in attribute_line.split(';'):
            if not assignment.strip():
                # produced by a trailing ';' or by the end-of-line newline
                continue
            # Split on the first '=' only; the value itself may contain '='
            name, value = assignment.split('=', 1)
            attributes[name.strip()] = value.strip()

        file_index[filename] = attributes
    return file_index
87         file_index[filename] = attributes
88     return file_index