htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
   3 This allows retrieving blocks
   4 """
   5
   6
from __future__ import print_function
import collections
import logging
import json
import types

try:  # Python 3
    from collections.abc import Mapping, Sequence
except ImportError:  # Python 2
    from collections import Mapping, Sequence
try:  # Python 3
    from urllib.parse import urljoin, urlparse, urlunparse
except ImportError:  # Python 2
    from urlparse import urljoin, urlparse, urlunparse

import jsonschema
import requests
  15
  16 LOGGER = logging.getLogger(__name__)
  17
  18 ENCODED_CONTEXT = {
  19     # The None context will get added to the root of the tree and will
  20     # provide common defaults.
  21     None: {
  22         # terms in multiple encoded objects
  23         'award': { '@type': '@id' },
  24         'dataset': {'@type': '@id'},
  25         'description': 'rdf:description',
  26         'documents': { '@type': '@id' },
  27         'experiment': {'@type': '@id'},
  28         'href': { '@type': '@id' },
  29         'lab': { '@type': '@id' },
  30         'library': {'@type': '@id' },
  31         'pi': { '@type': '@id' },
  32         'platform': { '@type': '@id' },
  33         'replicates': { '@type': '@id' },
  34         'submitted_by': { '@type': '@id' },
  35         'url': { '@type': '@id' },
  36     },
  37     # Identify and markup contained classes.
  38     # e.g. in the tree there was a sub-dictionary named 'biosample'
  39     # That dictionary had a term 'biosample_term_id, which is the
  40     # term that should be used as the @id.
  41     'biosample': {
  42         'biosample_term_id': { '@type': '@id' },
  43     },
  44     'experiment': {
  45         "assay_term_id": { "@type": "@id" },
  46         "files": { "@type": "@id" },
  47         "original_files": { "@type": "@id"},
  48     },
  49     # I tried to use the JSON-LD mapping capabilities to convert the lab
  50     # contact information into a vcard record, but the encoded model
  51     # didn't lend itself well to the vcard schema
  52     #'lab': {
  53     #    "address1": "vcard:street-address",
  54     #    "address2": "vcard:street-address",
  55     #    "city": "vcard:locality",
  56     #    "state": "vcard:region",
  57     #    "country": "vcard:country"
  58     #},
  59     'library': {
  60         'nucleic_acid_term_id': { '@type': '@id' }
  61     }
  62 }
  63
  64 #FIXME: this needs to be initialized from rdfns
  65 ENCODED_NAMESPACES = {
  66     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  67     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  68     # rdfs:label)
  69     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  70     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  71     "owl": "http://www.w3.org/2002/07/owl#",
  72     "dc": "htp://purl.org/dc/elements/1.1/",
  73     "xsd": "http://www.w3.org/2001/XMLSchema#",
  74     "vcard": "http://www.w3.org/2006/vcard/ns#",
  75
  76     # for some namespaces I made a best guess for the ontology root.
  77     "EFO": "http://www.ebi.ac.uk/efo/", # EFO ontology
  78     "OBO": "http://purl.obolibrary.org/obo/", # OBO ontology
  79     "OBI": "http://purl.obolibrary.org/obo/OBI_", # Ontology for Biomedical Investigations
  80     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  81     'SO': 'http://purl.obolibrary.org/obo/SO_', # Sequence ontology
  82     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  83     # NTR: New Term Request space for DCC to implement new ontology terms
  84
  85 }
  86
  87 ENCODED_SCHEMA_ROOT='/profiles/'
  88
  89 class ENCODED:
  90     '''Programatic access encoded, the software powering ENCODE3's submit site.
  91     '''
  92     def __init__(self, server, contexts=None):
  93         self.server = server
  94         self.username = None
  95         self.password = None
  96         self.contexts = contexts if contexts else ENCODED_CONTEXT
  97         self.json_headers = {'Content-Type': 'application/json'}
  98         self.schemas = {}
  99
 100     def get_auth(self):
 101         return (self.username, self.password)
 102     auth = property(get_auth)
 103
 104     def load_netrc(self):
 105         import netrc
 106         session = netrc.netrc()
 107         authenticators = session.authenticators(self.server)
 108         if authenticators:
 109             self.username = authenticators[0]
 110             self.password = authenticators[2]
 111
 112     def add_jsonld_context(self, tree, default_base):
 113         """Add contexts to various objects in the tree.
 114
 115         tree is a json tree returned from the DCC's encoded database.
 116         contexts is a dictionary of dictionaries containing contexts
 117                 for the various  possible encoded classes.
 118         base, if supplied allows setting the base url that relative
 119             urls will be resolved against.
 120         """
 121         self.add_jsonld_child_context(tree, default_base)
 122         self.add_jsonld_namespaces(tree['@context'])
 123
 124     def add_jsonld_child_context(self, obj, default_base):
 125         '''Add JSON-LD context to the encoded JSON.
 126
 127         This is recursive because some of the IDs were relative URLs
 128         and I needed a way to properly compute a the correct base URL.
 129         '''
 130         # pretend strings aren't iterable
 131         if type(obj) in types.StringTypes:
 132             return
 133
 134         # recurse on container types
 135         if isinstance(obj, collections.Sequence):
 136             # how should I update lists?
 137             for v in obj:
 138                 self.add_jsonld_child_context(v, default_base)
 139             return
 140
 141         if isinstance(obj, collections.Mapping):
 142             for v in obj.values():
 143                 self.add_jsonld_child_context(v, default_base)
 144
 145         # we have an object. attach a context to it.
 146         if self._is_encoded_object(obj):
 147             context = self.create_jsonld_context(obj, default_base)
 148             if len(context) > 0:
 149                 obj.setdefault('@context', {}).update(context)
 150
 151     def add_jsonld_namespaces(self, context):
 152         '''Add shortcut namespaces to a context
 153
 154         Only needs to be run on the top-most context
 155         '''
 156         context.update(ENCODED_NAMESPACES)
 157
 158     def create_jsonld_context(self, obj, default_base):
 159         '''Synthesize the context for a encoded type
 160
 161         self.contexts[None] = default context attributes added to any type
 162         self.contexts[type] = context attributes for this type.
 163         '''
 164         context = {'@base': urljoin(default_base, obj['@id']),
 165                     '@vocab': self.get_schema_url(obj)}
 166         # add in defaults
 167         context.update(self.contexts[None])
 168         for t in obj['@type']:
 169             if t in self.contexts:
 170                 context.update(self.contexts[t])
 171         return context
 172
 173     def get_json(self, obj_id, **kwargs):
 174         '''GET an ENCODE object as JSON and return as dict
 175
 176         Uses prepare_url to allow url short-cuts
 177         if no keyword arguments are specified it will default to adding limit=all
 178         Alternative keyword arguments can be passed in and will be sent to the host.
 179
 180         Known keywords are:
 181           limit - (integer or 'all') how many records to return, all for all of them
 182           embed - (bool) if true expands linking ids into their associated object.
 183           format - text/html or application/json
 184         '''
 185         if len(kwargs) == 0:
 186             kwargs['limit'] = 'all'
 187
 188         url = self.prepare_url(obj_id)
 189         LOGGER.info('requesting url: {}'.format(url))
 190
 191         # do the request
 192
 193         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 194         response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
 195         if not response.status_code == requests.codes.ok:
 196             LOGGER.error("Error http status: {}".format(response.status_code))
 197             response.raise_for_status()
 198         return response.json()
 199
 200     def get_jsonld(self, obj_id, **kwargs):
 201         '''Get ENCODE object as JSONLD annotated with classses contexts
 202
 203         see get_json for documentation about what keywords can be passed.
 204         '''
 205         url = self.prepare_url(obj_id)
 206         json = self.get_json(obj_id, **kwargs)
 207         self.add_jsonld_context(json, url)
 208         return json
 209
 210     def get_object_type(self, obj):
 211         """Return type for a encoded object
 212         """
 213         obj_type = obj.get('@type')
 214         if obj_type and isinstance(obj_type, collections.Sequence):
 215             return obj_type[0]
 216
 217     def get_schema_url(self, obj):
 218         obj_type = self.get_object_type(obj)
 219         if obj_type:
 220             return self.prepare_url(ENCODED_SCHEMA_ROOT + obj_type + '.json') + '#'
 221
 222     def _is_encoded_object(self, obj):
 223         '''Test to see if an object is a JSON-LD object
 224
 225         Some of the nested dictionaries lack the @id or @type
 226         information necessary to convert them.
 227         '''
 228         if not isinstance(obj, collections.Iterable):
 229             return False
 230
 231         if '@id' in obj and '@type' in obj:
 232             return True
 233         return False
 234
 235
 236     def patch_json(self, obj_id, changes):
 237         """Given a dictionary of changes push them as a HTTP patch request
 238         """
 239         url = self.prepare_url(obj_id)
 240         payload = json.dumps(changes)
 241         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 242         if response.status_code != requests.codes.ok:
 243             LOGGER.error("Error http status: {}".format(response.status_code))
 244             LOGGER.error("Response: %s", response.text)
 245             response.raise_for_status()
 246         return response.json()
 247
 248     def put_json(self, obj_id, new_object):
 249         url = self.prepare_url(obj_id)
 250         payload = json.dumps(new_object)
 251         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 252         if response.status_code != requests.codes.created:
 253             LOGGER.error("Error http status: {}".format(response.status_code))
 254             response.raise_for_status()
 255         return response.json()
 256
 257     def post_json(self, collection_id, new_object):
 258         url = self.prepare_url(collection_id)
 259         payload = json.dumps(new_object)
 260         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 261         if response.status_code != requests.codes.created:
 262             LOGGER.error("Error http status: {}".format(response.status_code))
 263             response.raise_for_status()
 264         return response.json()
 265
 266     def prepare_url(self, request_url):
 267         '''This attempts to provide some convienence for accessing a URL
 268
 269         Given a url fragment it will default to :
 270         * requests over http
 271         * requests to self.server
 272
 273         This allows fairly flexible urls. e.g.
 274
 275         prepare_url('/experiments/ENCSR000AEG')
 276         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 277         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 278
 279         should all return the same url
 280         '''
 281         # clean up potentially messy urls
 282         url = urlparse(request_url)._asdict()
 283         if not url['scheme']:
 284             url['scheme'] = 'https'
 285         if not url['netloc']:
 286             url['netloc'] = self.server
 287         url = urlunparse(url.values())
 288         return url
 289
 290     def search_jsonld(self, term, **kwargs):
 291         '''Send search request to ENCODED
 292         '''
 293         url = self.prepare_url('/search/')
 294         result = self.get_json(url, searchTerm=term, **kwargs)
 295         self.convert_search_to_jsonld(result)
 296         return result
 297
 298     def convert_search_to_jsonld(self, result):
 299         '''Add the context to search result
 300
 301         Also remove hard to handle nested attributes
 302           e.g. remove object.term when we have no id
 303         '''
 304         graph = result['@graph']
 305         for i, obj in enumerate(graph):
 306             # suppress nested attributes
 307             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 308
 309         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 310         return result
 311
 312     def validate(self, obj):
 313         obj_type = self.get_object_type(obj)
 314         schema_url = self.get_schema_url(obj)
 315         if not schema_url:
 316             raise ValueError("Unable to construct schema url")
 317
 318         schema = self.schemas.setdefault(obj_type, self.get_json(schema_url))
 319         hidden = obj.copy()
 320         del hidden['@id']
 321         del hidden['@type']
 322         jsonschema.validate(hidden, schema)
 323
 324
 325 if __name__ == '__main__':
 326     # try it
 327     from htsworkflow.util.rdfhelp import get_model, dump_model
 328     from htsworkflow.util.rdfjsonld import load_into_model
 329     from pprint import pprint
 330     model = get_model()
 331     logging.basicConfig(level=logging.DEBUG)
 332     encoded = ENCODED('test.encodedcc.org')
 333     encoded.load_netrc()
 334     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 335     pprint(body)
 336     load_into_model(model, body)
 337     #dump_model(model)