htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
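   3 This module retrieves JSON objects from an encoded server, annotates them with JSON-LD contexts, and submits new or changed objects back to the server.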
   4 """
   5
   6
   7 from __future__ import print_function
   8 import collections
   9 import logging
  10 import json
  11 import jsonschema
  12 import requests
  13 import types
  14 from urlparse import urljoin, urlparse, urlunparse
  15
  16 LOGGER = logging.getLogger(__name__)
  17
  18 ENCODED_CONTEXT = {
  19     # The None context will get added to the root of the tree and will
  20     # provide common defaults.
  21     None: {
  22         # terms in multiple encoded objects
  23         'award': {'@type': '@id'},
  24         'dataset': {'@type': '@id'},
  25         'description': 'rdf:description',
  26         'documents': {'@type': '@id'},
  27         'experiment': {'@type': '@id'},
  28         'href': {'@type': '@id'},
  29         'lab': {'@type': '@id'},
  30         'library': {'@type': '@id'},
  31         'pi': {'@type': '@id'},
  32         'platform': {'@type': '@id'},
  33         'replicates': {'@type': '@id'},
  34         'submitted_by': {'@type': '@id'},
  35         'url': {'@type': '@id'},
  36     },
  37     # Identify and markup contained classes.
  38     # e.g. in the tree there was a sub-dictionary named 'biosample'
  39     # That dictionary had a term 'biosample_term_id, which is the
  40     # term that should be used as the @id.
  41     'biosample': {
  42         'biosample_term_id': {'@type': '@id'},
  43     },
  44     'experiment': {
  45         "assay_term_id": {"@type": "@id"},
  46         "files": {"@type": "@id"},
  47         "original_files": {"@type": "@id"},
  48     },
  49     # I tried to use the JSON-LD mapping capabilities to convert the lab
  50     # contact information into a vcard record, but the encoded model
  51     # didn't lend itself well to the vcard schema
  52     #'lab': {
  53     #    "address1": "vcard:street-address",
  54     #    "address2": "vcard:street-address",
  55     #    "city": "vcard:locality",
  56     #    "state": "vcard:region",
  57     #    "country": "vcard:country"
  58     #},
  59     'library': {
  60         'nucleic_acid_term_id': {'@type': '@id'}
  61     }
  62 }
  63
  64 #FIXME: this needs to be initialized from rdfns
  65 ENCODED_NAMESPACES = {
  66     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  67     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  68     # rdfs:label)
  69     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  70     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  71     "owl": "http://www.w3.org/2002/07/owl#",
  72     "dc": "htp://purl.org/dc/elements/1.1/",
  73     "xsd": "http://www.w3.org/2001/XMLSchema#",
  74     "vcard": "http://www.w3.org/2006/vcard/ns#",
  75
  76     # for some namespaces I made a best guess for the ontology root.
  77     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  78     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  79     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  80     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  81     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  82     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  83     # NTR: New Term Request space for DCC to implement new ontology terms
  84
  85 }
  86
  87 ENCODED_SCHEMA_ROOT = '/profiles/'
  88
  89
  90 class ENCODED:
  91     '''Programatic access encoded, the software powering ENCODE3's submit site.
  92     '''
  93     def __init__(self, server, contexts=None, namespaces=None):
  94         self.server = server
  95         self.scheme = 'https'
  96         self.username = None
  97         self.password = None
  98         self.contexts = contexts if contexts else ENCODED_CONTEXT
  99         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 100         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 101         self.schemas = {}
 102
 103     def get_auth(self):
 104         return (self.username, self.password)
 105     auth = property(get_auth)
 106
 107     def load_netrc(self):
 108         import netrc
 109         session = netrc.netrc()
 110         authenticators = session.authenticators(self.server)
 111         if authenticators:
 112             self.username = authenticators[0]
 113             self.password = authenticators[2]
 114
 115     def add_jsonld_context(self, tree, default_base):
 116         """Add contexts to various objects in the tree.
 117
 118         tree is a json tree returned from the DCC's encoded database.
 119         contexts is a dictionary of dictionaries containing contexts
 120                 for the various  possible encoded classes.
 121         base, if supplied allows setting the base url that relative
 122             urls will be resolved against.
 123         """
 124         self.add_jsonld_child_context(tree, default_base)
 125         self.add_jsonld_namespaces(tree['@context'])
 126
 127     def add_jsonld_child_context(self, obj, default_base):
 128         '''Add JSON-LD context to the encoded JSON.
 129
 130         This is recursive because some of the IDs were relative URLs
 131         and I needed a way to properly compute a the correct base URL.
 132         '''
 133         # pretend strings aren't iterable
 134         if type(obj) in types.StringTypes:
 135             return
 136
 137         # recurse on container types
 138         if isinstance(obj, collections.Sequence):
 139             # how should I update lists?
 140             for v in obj:
 141                 self.add_jsonld_child_context(v, default_base)
 142             return
 143
 144         if isinstance(obj, collections.Mapping):
 145             for v in obj.values():
 146                 self.add_jsonld_child_context(v, default_base)
 147
 148         # we have an object. attach a context to it.
 149         if self._is_encoded_object(obj):
 150             context = self.create_jsonld_context(obj, default_base)
 151             if len(context) > 0:
 152                 obj.setdefault('@context', {}).update(context)
 153
 154     def add_jsonld_namespaces(self, context):
 155         '''Add shortcut namespaces to a context
 156
 157         Only needs to be run on the top-most context
 158         '''
 159         context.update(self.namespaces)
 160
 161     def create_jsonld_context(self, obj, default_base):
 162         '''Synthesize the context for a encoded type
 163
 164         self.contexts[None] = default context attributes added to any type
 165         self.contexts[type] = context attributes for this type.
 166         '''
 167         context = {'@base': urljoin(default_base, obj['@id']),
 168                    '@vocab': self.get_schema_url(obj)}
 169         # add in defaults
 170         context.update(self.contexts[None])
 171         for t in obj['@type']:
 172             if t in self.contexts:
 173                 context.update(self.contexts[t])
 174         return context
 175
 176     def get_json(self, obj_id, **kwargs):
 177         '''GET an ENCODE object as JSON and return as dict
 178
 179         Uses prepare_url to allow url short-cuts
 180         if no keyword arguments are specified it will default to adding limit=all
 181         Alternative keyword arguments can be passed in and will be sent to the host.
 182
 183         Known keywords are:
 184           limit - (integer or 'all') how many records to return, all for all of them
 185           embed - (bool) if true expands linking ids into their associated object.
 186           format - text/html or application/json
 187         '''
 188         if len(kwargs) == 0:
 189             kwargs['limit'] = 'all'
 190
 191         url = self.prepare_url(obj_id)
 192         LOGGER.info('requesting url: {}'.format(url))
 193
 194         # do the request
 195
 196         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 197         response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
 198         if not response.status_code == requests.codes.ok:
 199             LOGGER.error("Error http status: {}".format(response.status_code))
 200             response.raise_for_status()
 201         return response.json()
 202
 203     def get_jsonld(self, obj_id, **kwargs):
 204         '''Get ENCODE object as JSONLD annotated with classses contexts
 205
 206         see get_json for documentation about what keywords can be passed.
 207         '''
 208         url = self.prepare_url(obj_id)
 209         json = self.get_json(obj_id, **kwargs)
 210         self.add_jsonld_context(json, url)
 211         return json
 212
 213     def get_object_type(self, obj):
 214         """Return type for a encoded object
 215         """
 216         obj_type = obj.get('@type')
 217         if not obj_type:
 218             raise ValueError('None type')
 219         if type(obj_type) in types.StringTypes:
 220             raise ValueError('@type should be a list, not a string')
 221         if not isinstance(obj_type, collections.Sequence):
 222             raise ValueError('@type is not a sequence')
 223         return obj_type[0]
 224
 225     def get_schema_url(self, obj):
 226         obj_type = self.get_object_type(obj)
 227         if obj_type:
 228             return self.prepare_url(ENCODED_SCHEMA_ROOT + obj_type + '.json') + '#'
 229
 230     def _is_encoded_object(self, obj):
 231         '''Test to see if an object is a JSON-LD object
 232
 233         Some of the nested dictionaries lack the @id or @type
 234         information necessary to convert them.
 235         '''
 236         if not isinstance(obj, collections.Iterable):
 237             return False
 238
 239         if '@id' in obj and '@type' in obj:
 240             return True
 241         return False
 242
 243     def patch_json(self, obj_id, changes):
 244         """Given a dictionary of changes push them as a HTTP patch request
 245         """
 246         url = self.prepare_url(obj_id)
 247         LOGGER.info('PATCHing to %s', url)
 248         payload = json.dumps(changes)
 249         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 250         if response.status_code != requests.codes.ok:
 251             LOGGER.error("Error http status: {}".format(response.status_code))
 252             LOGGER.error("Response: %s", response.text)
 253             response.raise_for_status()
 254         return response.json()
 255
 256     def put_json(self, obj_id, new_object):
 257         url = self.prepare_url(obj_id)
 258         LOGGER.info('PUTing to %s', url)
 259         payload = json.dumps(new_object)
 260         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 261         if response.status_code != requests.codes.created:
 262             LOGGER.error("Error http status: {}".format(response.status_code))
 263             response.raise_for_status()
 264         return response.json()
 265
 266     def post_json(self, collection_id, new_object):
 267         url = self.prepare_url(collection_id)
 268         LOGGER.info('POSTing to %s', url)
 269         payload = json.dumps(new_object)
 270
 271         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 272         if response.status_code != requests.codes.created:
 273             LOGGER.error("Error http status: {}".format(response.status_code))
 274             response.raise_for_status()
 275         return response.json()
 276
 277     def prepare_url(self, request_url):
 278         '''This attempts to provide some convienence for accessing a URL
 279
 280         Given a url fragment it will default to :
 281         * requests over http
 282         * requests to self.server
 283
 284         This allows fairly flexible urls. e.g.
 285
 286         prepare_url('/experiments/ENCSR000AEG')
 287         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 288         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 289
 290         should all return the same url
 291         '''
 292         # clean up potentially messy urls
 293         url = urlparse(request_url)._asdict()
 294         if not url['scheme']:
 295             url['scheme'] = self.scheme
 296         if not url['netloc']:
 297             url['netloc'] = self.server
 298         url = urlunparse(url.values())
 299         return url
 300
 301     def search_jsonld(self, term, **kwargs):
 302         '''Send search request to ENCODED
 303         '''
 304         url = self.prepare_url('/search/')
 305         result = self.get_json(url, searchTerm=term, **kwargs)
 306         self.convert_search_to_jsonld(result)
 307         return result
 308
 309     def convert_search_to_jsonld(self, result):
 310         '''Add the context to search result
 311
 312         Also remove hard to handle nested attributes
 313           e.g. remove object.term when we have no id
 314         '''
 315         graph = result['@graph']
 316         for i, obj in enumerate(graph):
 317             # suppress nested attributes
 318             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 319
 320         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 321         return result
 322
 323     def validate(self, obj):
 324         obj_type = self.get_object_type(obj)
 325         schema_url = self.get_schema_url(obj)
 326         if not schema_url:
 327             raise ValueError("Unable to construct schema url")
 328
 329         schema = self.schemas.setdefault(obj_type, self.get_json(schema_url))
 330         hidden = obj.copy()
 331         if '@id' in hidden: del hidden['@id']
 332         if '@type' in hidden: del hidden['@type']
 333         jsonschema.validate(hidden, schema)
 334
 335
 336 class Document(object):
 337     """Helper class for registering documents
 338
 339     Usage:
 340     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 341     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 342     lysis.create_if_needed(server, lysis_uuid)
 343     """
 344     award = 'U54HG006998'
 345     lab = '/labs/barbara-wold'
 346
 347     def __init__(self, url, document_type, description, aliases=None):
 348         self.url = url
 349         self.filename = os.path.basename(url)
 350         self.document_type = document_type
 351         self.description = description
 352
 353         self.references = []
 354         self.aliases = aliases if aliases is not None else []
 355         self.content_type = None
 356         self.document = None
 357         self.md5sum = None
 358         self.urls = None
 359         self.uuid = None
 360
 361         self.get_document()
 362
 363     def get_document(self):
 364         if os.path.exists(self.url):
 365             with open(self.url, 'r') as instream:
 366                 assert self.url.endswith('pdf')
 367                 self.content_type = 'application/pdf'
 368                 self.document = instream.read()
 369                 self.md5sum = hashlib.md5(self.document)
 370         else:
 371             req = requests.get(self.url)
 372             if req.status_code == 200:
 373                 self.content_type = req.headers['content-type']
 374                 self.document = req.content
 375                 self.md5sum = hashlib.md5(self.document)
 376                 self.urls = [self.url]
 377
 378     def create_payload(self):
 379         document_payload = {
 380             'attachment': {
 381               'download': self.filename,
 382               'type': self.content_type,
 383               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 384               'md5sum': self.md5sum.hexdigest()
 385             },
 386             'document_type': self.document_type,
 387             'description': self.description,
 388             'award': self.award,
 389             'lab': self.lab,
 390         }
 391         if self.aliases:
 392             document_payload['aliases'] = self.aliases
 393         if self.references:
 394             document_payload['references'] = self.references
 395         if self.urls:
 396             document_payload['urls'] = self.urls
 397
 398         return document_payload
 399
 400     def post(self, server):
 401         document_payload = self.create_payload()
 402         return server.post_json('/documents/', document_payload)
 403
 404     def save(self, filename):
 405         payload = self.create_payload()
 406         with open(filename, 'w') as outstream:
 407             outstream.write(pformat(payload))
 408
 409     def create_if_needed(self, server, uuid):
 410         self.uuid = uuid
 411         if uuid is None:
 412             return self.post(server)
 413         else:
 414             return server.get_json(uuid, embed=False)
 415
 416 if __name__ == '__main__':
 417     # try it
 418     from htsworkflow.util.rdfhelp import get_model, dump_model
 419     from htsworkflow.util.rdfjsonld import load_into_model
 420     from pprint import pprint
 421     model = get_model()
 422     logging.basicConfig(level=logging.DEBUG)
 423     encoded = ENCODED('test.encodedcc.org')
 424     encoded.load_netrc()
 425     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 426     pprint(body)
 427     load_into_model(model, body)
 428     #dump_model(model)