htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
   3 This allows retrieving blocks
   4 """
   5
   6
   7 from __future__ import print_function
   8 import collections
   9 import logging
  10 import json
  11 import jsonschema
  12 import requests
  13 from requests.utils import urlparse, urlunparse
  14 import types
  15 from urlparse import urljoin
  16
# Module-level logger, named after this module via __name__.
LOGGER = logging.getLogger(__name__)
  18
  19 ENCODED_CONTEXT = {
  20     # The None context will get added to the root of the tree and will
  21     # provide common defaults.
  22     None: {
  23         # terms in multiple encoded objects
  24         'description': 'rdf:description',
  25         'experiment': {'@type': '@id'},
  26         'href': { '@type': '@id' },
  27         'lab': { '@type': '@id' },
  28         'library': {'@type': '@id' },
  29         'pi': { '@type': '@id' },
  30         'platform': { '@type': '@id' },
  31         'submitted_by': { '@type': '@id' },
  32         'url': { '@type': '@id' },
  33     },
  34     # Identify and markup contained classes.
  35     # e.g. in the tree there was a sub-dictionary named 'biosample'
  36     # That dictionary had a term 'biosample_term_id, which is the
  37     # term that should be used as the @id.
  38     'biosample': {
  39         'biosample_term_id': { '@type': '@id' },
  40     },
  41     'experiment': {
  42         "assay_term_id": { "@type": "@id" },
  43     },
  44     'file': {
  45         'dataset': {'@type': '@id'},
  46     },
  47     # I tried to use the JSON-LD mapping capabilities to convert the lab
  48     # contact information into a vcard record, but the encoded model
  49     # didn't lend itself well to the vcard schema
  50     #'lab': {
  51     #    "address1": "vcard:street-address",
  52     #    "address2": "vcard:street-address",
  53     #    "city": "vcard:locality",
  54     #    "state": "vcard:region",
  55     #    "country": "vcard:country"
  56     #},
  57     'human_donor': {
  58         'award': { '@type': '@id' },
  59     },
  60     'library': {
  61         'award': { '@type': '@id' },
  62         'nucleic_acid_term_id': { '@type': '@id' }
  63     }
  64 }
  65
  66 #FIXME: this needs to be initialized from rdfns
  67 ENCODED_NAMESPACES = {
  68     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  69     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  70     # rdfs:label)
  71     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  72     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  73     "owl": "http://www.w3.org/2002/07/owl#",
  74     "dc": "htp://purl.org/dc/elements/1.1/",
  75     "xsd": "http://www.w3.org/2001/XMLSchema#",
  76     "vcard": "http://www.w3.org/2006/vcard/ns#",
  77
  78     # for some namespaces I made a best guess for the ontology root.
  79     "EFO": "http://www.ebi.ac.uk/efo/", # EFO ontology
  80     "OBO": "http://purl.obolibrary.org/obo/", # OBO ontology
  81     "OBI": "http://purl.obolibrary.org/obo/OBI_", # Ontology for Biomedical Investigations
  82     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  83     'SO': 'http://purl.obolibrary.org/obo/SO_', # Sequence ontology
  84     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  85
  86 }
  87
  88 ENCODED_SCHEMA_ROOT='/profiles/'
  89
  90 class ENCODED:
  91     '''Programatic access encoded, the software powering ENCODE3's submit site.
  92     '''
  93     def __init__(self, server, contexts=None):
  94         self.server = server
  95         self.username = None
  96         self.password = None
  97         self.contexts = contexts if contexts else ENCODED_CONTEXT
  98         self.schemas = {}
  99
 100     def get_auth(self):
 101         return (self.username, self.password)
 102     auth = property(get_auth)
 103
 104     def load_netrc(self):
 105         import netrc
 106         session = netrc.netrc()
 107         authenticators = session.authenticators(self.server)
 108         if authenticators:
 109             self.username = authenticators[0]
 110             self.password = authenticators[2]
 111
 112     def add_jsonld_context(self, tree, contexts, base):
 113         """Add contexts to various objects in the tree.
 114
 115         tree is a json tree returned from the DCC's encoded database.
 116         contexts is a dictionary of dictionaries containing contexts
 117                 for the various  possible encoded classes.
 118         base, if supplied allows setting the base url that relative
 119             urls will be resolved against.
 120         """
 121         tree['@context'] = contexts[None]
 122         tree['@context']['@base'] = base
 123         self.add_jsonld_child_context(tree, contexts)
 124
 125     def add_jsonld_child_context(self, obj, contexts):
 126         '''Add JSON-LD context to the encoded JSON.
 127
 128         This is recursive because some of the IDs were relative URLs
 129         and I needed a way to properly compute a the correct base URL.
 130         '''
 131         # pretend strings aren't iterable
 132         if type(obj) in types.StringTypes:
 133             return
 134
 135         # recurse on container types
 136         if isinstance(obj, collections.Sequence):
 137             # how should I update lists?
 138             for v in obj:
 139                 self.add_jsonld_child_context(v, default_base)
 140             return
 141
 142         if isinstance(obj, collections.Mapping):
 143             for v in obj.values():
 144                 self.add_jsonld_child_context(v, default_base)
 145
 146         # we have an object. attach a context to it.
 147         if self._is_encoded_object(obj):
 148             context = self.create_jsonld_context(obj, default_base)
 149             if len(context) > 0:
 150                 obj.setdefault('@context', {}).update(context)
 151
 152     def add_jsonld_namespaces(self, context):
 153         '''Add shortcut namespaces to a context
 154
 155         Only needs to be run on the top-most context
 156         '''
 157         context.update(ENCODED_NAMESPACES)
 158
 159     def create_jsonld_context(self, obj, default_base):
 160         '''Synthesize the context for a encoded type
 161
 162         self.contexts[None] = default context attributes added to any type
 163         self.contexts[type] = context attributes for this type.
 164         '''
 165         context = {'@base': urljoin(default_base, obj['@id']),
 166                     '@vocab': self.get_schema_url(obj)}
 167         # add in defaults
 168         context.update(self.contexts[None])
 169         for t in obj['@type']:
 170             if t in self.contexts:
 171                 context.update(self.contexts[t])
 172         return context
 173
 174     def get_json(self, obj_id, **kwargs):
 175         '''GET an ENCODE object as JSON and return as dict
 176
 177         Uses prepare_url to allow url short-cuts
 178         if no keyword arguments are specified it will default to adding limit=all
 179         Alternative keyword arguments can be passed in and will be sent to the host.
 180
 181         Known keywords are:
 182           limit - (integer or 'all') how many records to return, all for all of them
 183           embed - (bool) if true expands linking ids into their associated object.
 184           format - text/html or application/json
 185         '''
 186         if len(kwargs) == 0:
 187             kwargs['limit'] = 'all'
 188
 189         url = self.prepare_url(obj_id)
 190         LOGGER.info('requesting url: {}'.format(url))
 191
 192         # do the request
 193         headers = {'content-type': 'application/json'}
 194         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 195         response = requests.get(url, auth=self.auth, headers=headers, params=kwargs)
 196         if not response.status_code == requests.codes.ok:
 197             LOGGER.error("Error http status: {}".format(response.status_code))
 198             response.raise_for_status()
 199         return response.json()
 200
 201     def get_jsonld(self, obj_id, **kwargs):
 202         '''Get ENCODE object as JSONLD annotated with classses contexts
 203
 204         see get_json for documentation about what keywords can be passed.
 205         '''
 206         url = self.prepare_url(obj_id)
 207         json = self.get_json(obj_id, **kwargs)
 208         self.add_jsonld_context(json, self.context, url)
 209         return json
 210
 211     def get_object_type(self, obj):
 212         """Return type for a encoded object
 213         """
 214         obj_type = obj.get('@type')
 215         if obj_type and isinstance(obj_type, collections.Sequence):
 216             return obj_type[0]
 217
 218     def get_schema_url(self, obj):
 219         obj_type = self.get_object_type(obj)
 220         if obj_type:
 221             return self.prepare_url(ENCODED_SCHEMA_ROOT + obj_type + '.json')
 222
 223     def _is_encoded_object(self, obj):
 224         '''Test to see if an object is a JSON-LD object
 225
 226         Some of the nested dictionaries lack the @id or @type
 227         information necessary to convert them.
 228         '''
 229         if not isinstance(obj, collections.Iterable):
 230             return False
 231
 232         if '@id' in obj and '@type' in obj:
 233             return True
 234         return False
 235
 236
 237     def patch_json(self, obj_id, changes):
 238         """Given a dictionary of changes push them as a HTTP patch request
 239         """
 240         url = self.prepare_url(obj_id)
 241         payload = json.dumps(changes)
 242         response = requests.patch(url, auth=self.auth, data=payload)
 243         if response.status_code != requests.codes.ok:
 244             LOGGER.error("Error http status: {}".format(response.status_code))
 245             response.raise_for_status()
 246         return response.json()
 247
 248     def put_json(self, obj_id, new_object):
 249         url = self.prepare_url(obj_id)
 250         payload = json.dumps(new_object)
 251         response = requests.put(url, auth=self.auth, data=payload)
 252         if response.status_code != requests.codes.created:
 253             LOGGER.error("Error http status: {}".format(response.status_code))
 254             response.raise_for_status()
 255         return response.json()
 256
 257     def prepare_url(self, request_url):
 258         '''This attempts to provide some convienence for accessing a URL
 259
 260         Given a url fragment it will default to :
 261         * requests over http
 262         * requests to self.server
 263
 264         This allows fairly flexible urls. e.g.
 265
 266         prepare_url('/experiments/ENCSR000AEG')
 267         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 268         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 269
 270         should all return the same url
 271         '''
 272         # clean up potentially messy urls
 273         url = urlparse(request_url)._asdict()
 274         if not url['scheme']:
 275             url['scheme'] = 'http'
 276         if not url['netloc']:
 277             url['netloc'] = self.server
 278         url = urlunparse(url.values())
 279         return url
 280
 281     def validate(self, obj):
 282         obj_type = self.get_object_type(obj)
 283         schema_url = self.get_schema_url(obj)
 284         if not schema_url:
 285             raise ValueError("Unable to construct schema url")
 286
 287         schema = self.schemas.setdefault(obj_type, self.get_json(schema_url))
 288         hidden = obj.copy()
 289         del hidden['@id']
 290         del hidden['@type']
 291         jsonschema.validate(hidden, schema)
 292
 293
 294 if __name__ == '__main__':
 295     # try it
 296     from htsworkflow.util.rdfhelp import get_model, dump_model
 297     from htsworkflow.util.rdfjsonld import load_into_model
 298     from pprint import pprint
 299     model = get_model()
 300     logging.basicConfig(level=logging.DEBUG)
 301     encoded = ENCODED('test.encodedcc.org')
 302     encoded.load_netrc()
 303     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 304     pprint(body)
 305     load_into_model(model, body)
 306     #dump_model(model)