use six.moves to work around urllib / urllib2 / urlparse to urllib 2to3 cleanup
[htsworkflow.git] / htsworkflow / submission / ucsc.py
1 """Utilities for extracting information from the ENCODE DCC
2 """
3 import logging
4 from six.moves import urllib
5
# Logger for this module, named after the module itself.
LOGGER = logging.getLogger(__name__)

# Base URL of the UCSC ENCODE submission pipeline; the *_url helpers in this
# module join their path fragments onto it.
UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"

# URL templates for a composite's download directory.  The "{genome}" and
# "{composite}" placeholders are filled in with str.format(); the templates
# are tried in list order by get_encodedcc_file_index.
GOLDEN_PATHS = [
    ("http://hgdownload-test.cse.ucsc.edu/goldenPath/"
     "{genome}/encodeDCC/{composite}/"),
    ("http://hgdownload.cse.ucsc.edu/goldenPath/"
     "{genome}/encodeDCC/{composite}/"),
]
14
15
def ddf_download_url(submission_id):
    """Build the pipeline URL from which a submission's DDF can be fetched

    The submission id is appended to the ``download_ddf/`` path, which is
    then resolved against UCSCEncodePipeline.

    >>> ddf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_ddf/1234'
    """
    return urllib.parse.urljoin(
        UCSCEncodePipeline,
        'download_ddf/%s' % (submission_id,))
24
25
def daf_download_url(submission_id):
    """Build the pipeline URL from which a submission's DAF can be fetched

    The submission id is appended to the ``download_daf/`` path, which is
    then resolved against UCSCEncodePipeline.

    >>> daf_download_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/download_daf/1234'
    """
    return urllib.parse.urljoin(
        UCSCEncodePipeline,
        'download_daf/%s' % (submission_id,))
34
35
def submission_view_url(submission_id):
    """Build the pipeline URL of the page that shows a submission

    The submission id is appended to the ``show/`` path, which is then
    resolved against UCSCEncodePipeline.

    >>> submission_view_url(1234)
    'http://encodesubmit.ucsc.edu/pipeline/show/1234'
    """
    return urllib.parse.urljoin(
        UCSCEncodePipeline,
        'show/%s' % (submission_id,))
44
45
def get_encodedcc_file_index(genome, composite):
    """Get index of files for a ENCODE collection

    Each URL template in GOLDEN_PATHS is tried in order; the ``files.txt``
    of the first one that can be opened is parsed and returned.

    :param genome: value substituted for ``{genome}`` in the URL templates
    :param composite: value substituted for ``{composite}`` in the URL
        templates
    :returns: the dictionary built by parse_ucsc_file_index, or None when
        every request failed with an HTTP error (the last failure is
        logged)

    returns None on error
    """
    err = None
    request_url = None
    params = {'genome': genome,
              'composite': composite}

    for path in GOLDEN_PATHS:
        base_url = path.format(**params)
        request_url = base_url + 'files.txt'

        try:
            response = urllib.request.urlopen(request_url)
        except urllib.error.HTTPError as e:
            # Keep our own reference: Python 3 unbinds ``e`` as soon as the
            # except block finishes, so it cannot be used after the loop.
            err = e
            continue

        # The parser consumes the whole stream before returning, so the
        # response can be closed as soon as it is done (or has failed).
        try:
            return parse_ucsc_file_index(response, base_url)
        finally:
            response.close()

    if err is not None:
        errmsg = "get_ucsc_file_index <{0}>: {1}"
        LOGGER.error(errmsg.format(request_url, str(err)))

    return None
72
73
def parse_ucsc_file_index(stream, base_url):
    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs

    Each non-blank line is expected to look like::

        <filename>\\t<name>=<value>; <name>=<value>; ...

    :param stream: iterable of lines (text, or bytes as produced by
        ``urlopen`` on Python 3, which are decoded as UTF-8)
    :param base_url: prefix prepended to every filename to form the key
    :returns: dict mapping ``base_url + filename`` to a dict of that file's
        attributes, with names and values stripped of surrounding whitespace
    :raises ValueError: if a non-blank line does not contain exactly one tab,
        or a non-empty attribute has no ``=``
    """
    file_index = {}
    for line in stream:
        # HTTP responses yield bytes on Python 3; on Python 2 bytes is str,
        # so the second test leaves native strings untouched there.
        if isinstance(line, bytes) and not isinstance(line, str):
            line = line.decode('utf-8')

        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue

        filename, attribute_line = line.split('\t')
        attributes = {}
        for assignment in attribute_line.split(';'):
            # A trailing ';' leaves an empty fragment behind; ignore it.
            if not assignment.strip():
                continue
            # Split on the first '=' only so values may themselves hold '='.
            name, value = assignment.split('=', 1)
            attributes[name.strip()] = value.strip()

        file_index[base_url + filename] = attributes
    return file_index