htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
   3 This allows retrieving blocks
   4 """
   5 from __future__ import print_function
   6 import base64
   7 import collections
   8 import hashlib
   9 import logging
  10 import json
  11 import jsonschema
  12 import os
  13 import requests
  14 import types
  15 from urlparse import urljoin, urlparse, urlunparse
  16
  17 LOGGER = logging.getLogger(__name__)
  18
  19 ENCODED_CONTEXT = {
  20     # The None context will get added to the root of the tree and will
  21     # provide common defaults.
  22     None: {
  23         # terms in multiple encoded objects
  24         'award': {'@type': '@id'},
  25         'dataset': {'@type': '@id'},
  26         'description': 'rdf:description',
  27         'documents': {'@type': '@id'},
  28         'experiment': {'@type': '@id'},
  29         'href': {'@type': '@id'},
  30         'lab': {'@type': '@id'},
  31         'library': {'@type': '@id'},
  32         'pi': {'@type': '@id'},
  33         'platform': {'@type': '@id'},
  34         'replicates': {'@type': '@id'},
  35         'submitted_by': {'@type': '@id'},
  36         'url': {'@type': '@id'},
  37     },
  38     # Identify and markup contained classes.
  39     # e.g. in the tree there was a sub-dictionary named 'biosample'
  40     # That dictionary had a term 'biosample_term_id, which is the
  41     # term that should be used as the @id.
  42     'biosample': {
  43         'biosample_term_id': {'@type': '@id'},
  44     },
  45     'experiment': {
  46         "assay_term_id": {"@type": "@id"},
  47         "files": {"@type": "@id"},
  48         "original_files": {"@type": "@id"},
  49     },
  50     # I tried to use the JSON-LD mapping capabilities to convert the lab
  51     # contact information into a vcard record, but the encoded model
  52     # didn't lend itself well to the vcard schema
  53     #'lab': {
  54     #    "address1": "vcard:street-address",
  55     #    "address2": "vcard:street-address",
  56     #    "city": "vcard:locality",
  57     #    "state": "vcard:region",
  58     #    "country": "vcard:country"
  59     #},
  60     'library': {
  61         'nucleic_acid_term_id': {'@type': '@id'}
  62     }
  63 }
  64
  65 #FIXME: this needs to be initialized from rdfns
  66 ENCODED_NAMESPACES = {
  67     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  68     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  69     # rdfs:label)
  70     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  71     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  72     "owl": "http://www.w3.org/2002/07/owl#",
  73     "dc": "htp://purl.org/dc/elements/1.1/",
  74     "xsd": "http://www.w3.org/2001/XMLSchema#",
  75     "vcard": "http://www.w3.org/2006/vcard/ns#",
  76
  77     # for some namespaces I made a best guess for the ontology root.
  78     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  79     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  80     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  81     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  82     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  83     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  84     # NTR: New Term Request space for DCC to implement new ontology terms
  85
  86 }
  87
# Server path prefix used to build schema urls: ENCODED.get_schema_url
# appends '<object type>.json' to it.
ENCODED_SCHEMA_ROOT = '/profiles/'
  89
  90
  91 class ENCODED:
  92     '''Programatic access encoded, the software powering ENCODE3's submit site.
  93     '''
  94     def __init__(self, server, contexts=None, namespaces=None):
  95         self.server = server
  96         self.scheme = 'https'
  97         self.username = None
  98         self.password = None
  99         self.contexts = contexts if contexts else ENCODED_CONTEXT
 100         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 101         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 102         self.schemas = {}
 103
 104     def get_auth(self):
 105         return (self.username, self.password)
 106     auth = property(get_auth)
 107
 108     def load_netrc(self):
 109         import netrc
 110         session = netrc.netrc()
 111         authenticators = session.authenticators(self.server)
 112         if authenticators:
 113             self.username = authenticators[0]
 114             self.password = authenticators[2]
 115
 116     def add_jsonld_context(self, tree, default_base):
 117         """Add contexts to various objects in the tree.
 118
 119         tree is a json tree returned from the DCC's encoded database.
 120         contexts is a dictionary of dictionaries containing contexts
 121                 for the various  possible encoded classes.
 122         base, if supplied allows setting the base url that relative
 123             urls will be resolved against.
 124         """
 125         self.add_jsonld_child_context(tree, default_base)
 126         self.add_jsonld_namespaces(tree['@context'])
 127
 128     def add_jsonld_child_context(self, obj, default_base):
 129         '''Add JSON-LD context to the encoded JSON.
 130
 131         This is recursive because some of the IDs were relative URLs
 132         and I needed a way to properly compute a the correct base URL.
 133         '''
 134         # pretend strings aren't iterable
 135         if type(obj) in types.StringTypes:
 136             return
 137
 138         # recurse on container types
 139         if isinstance(obj, collections.Sequence):
 140             # how should I update lists?
 141             for v in obj:
 142                 self.add_jsonld_child_context(v, default_base)
 143             return
 144
 145         if isinstance(obj, collections.Mapping):
 146             for v in obj.values():
 147                 self.add_jsonld_child_context(v, default_base)
 148
 149         # we have an object. attach a context to it.
 150         if self._is_encoded_object(obj):
 151             context = self.create_jsonld_context(obj, default_base)
 152             if len(context) > 0:
 153                 obj.setdefault('@context', {}).update(context)
 154
 155     def add_jsonld_namespaces(self, context):
 156         '''Add shortcut namespaces to a context
 157
 158         Only needs to be run on the top-most context
 159         '''
 160         context.update(self.namespaces)
 161
 162     def create_jsonld_context(self, obj, default_base):
 163         '''Synthesize the context for a encoded type
 164
 165         self.contexts[None] = default context attributes added to any type
 166         self.contexts[type] = context attributes for this type.
 167         '''
 168         obj_type = self.get_object_type(obj)
 169         context = {'@base': urljoin(default_base, obj['@id']),
 170                    '@vocab': self.get_schema_url(obj_type)}
 171         # add in defaults
 172         context.update(self.contexts[None])
 173         for t in obj['@type']:
 174             if t in self.contexts:
 175                 context.update(self.contexts[t])
 176         return context
 177
 178     def get_json(self, obj_id, **kwargs):
 179         '''GET an ENCODE object as JSON and return as dict
 180
 181         Uses prepare_url to allow url short-cuts
 182         if no keyword arguments are specified it will default to adding limit=all
 183         Alternative keyword arguments can be passed in and will be sent to the host.
 184
 185         Known keywords are:
 186           limit - (integer or 'all') how many records to return, all for all of them
 187           embed - (bool) if true expands linking ids into their associated object.
 188           format - text/html or application/json
 189         '''
 190         if len(kwargs) == 0:
 191             kwargs['limit'] = 'all'
 192
 193         url = self.prepare_url(obj_id)
 194         LOGGER.info('requesting url: {}'.format(url))
 195
 196         # do the request
 197
 198         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 199         response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
 200         if not response.status_code == requests.codes.ok:
 201             LOGGER.error("Error http status: {}".format(response.status_code))
 202             response.raise_for_status()
 203         return response.json()
 204
 205     def get_jsonld(self, obj_id, **kwargs):
 206         '''Get ENCODE object as JSONLD annotated with classses contexts
 207
 208         see get_json for documentation about what keywords can be passed.
 209         '''
 210         url = self.prepare_url(obj_id)
 211         json = self.get_json(obj_id, **kwargs)
 212         self.add_jsonld_context(json, url)
 213         return json
 214
 215     def get_object_type(self, obj):
 216         """Return type for a encoded object
 217         """
 218         obj_type = obj.get('@type')
 219         if not obj_type:
 220             raise ValueError('None type')
 221         if type(obj_type) in types.StringTypes:
 222             raise ValueError('@type should be a list, not a string')
 223         if not isinstance(obj_type, collections.Sequence):
 224             raise ValueError('@type is not a sequence')
 225         return obj_type[0]
 226
 227     def get_schema_url(self, object_type):
 228         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 229
 230     def _is_encoded_object(self, obj):
 231         '''Test to see if an object is a JSON-LD object
 232
 233         Some of the nested dictionaries lack the @id or @type
 234         information necessary to convert them.
 235         '''
 236         if not isinstance(obj, collections.Iterable):
 237             return False
 238
 239         if '@id' in obj and '@type' in obj:
 240             return True
 241         return False
 242
 243     def patch_json(self, obj_id, changes):
 244         """Given a dictionary of changes push them as a HTTP patch request
 245         """
 246         url = self.prepare_url(obj_id)
 247         LOGGER.info('PATCHing to %s', url)
 248         payload = json.dumps(changes)
 249         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 250         if response.status_code != requests.codes.ok:
 251             LOGGER.error("Error http status: {}".format(response.status_code))
 252             LOGGER.error("Response: %s", response.text)
 253             response.raise_for_status()
 254         return response.json()
 255
 256     def put_json(self, obj_id, new_object):
 257         url = self.prepare_url(obj_id)
 258         LOGGER.info('PUTing to %s', url)
 259         payload = json.dumps(new_object)
 260         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 261         if response.status_code != requests.codes.created:
 262             LOGGER.error("Error http status: {}".format(response.status_code))
 263             response.raise_for_status()
 264         return response.json()
 265
 266     def post_json(self, collection_id, new_object):
 267         url = self.prepare_url(collection_id)
 268         LOGGER.info('POSTing to %s', url)
 269         payload = json.dumps(new_object)
 270
 271         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 272         if response.status_code != requests.codes.created:
 273             LOGGER.error("Error http status: {}".format(response.status_code))
 274             response.raise_for_status()
 275         return response.json()
 276
 277     def prepare_url(self, request_url):
 278         '''This attempts to provide some convienence for accessing a URL
 279
 280         Given a url fragment it will default to :
 281         * requests over http
 282         * requests to self.server
 283
 284         This allows fairly flexible urls. e.g.
 285
 286         prepare_url('/experiments/ENCSR000AEG')
 287         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 288         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 289
 290         should all return the same url
 291         '''
 292         # clean up potentially messy urls
 293         url = urlparse(request_url)._asdict()
 294         if not url['scheme']:
 295             url['scheme'] = self.scheme
 296         if not url['netloc']:
 297             url['netloc'] = self.server
 298         url = urlunparse(url.values())
 299         return url
 300
 301     def search_jsonld(self, term, **kwargs):
 302         '''Send search request to ENCODED
 303         '''
 304         url = self.prepare_url('/search/')
 305         result = self.get_json(url, searchTerm=term, **kwargs)
 306         self.convert_search_to_jsonld(result)
 307         return result
 308
 309     def convert_search_to_jsonld(self, result):
 310         '''Add the context to search result
 311
 312         Also remove hard to handle nested attributes
 313           e.g. remove object.term when we have no id
 314         '''
 315         graph = result['@graph']
 316         for i, obj in enumerate(graph):
 317             # suppress nested attributes
 318             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 319
 320         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 321         return result
 322
 323     def validate(self, obj, object_type=None):
 324         object_type = object_type if object_type else self.get_object_type(obj)
 325         schema_url = self.get_schema_url(object_type)
 326         if not schema_url:
 327             raise ValueError("Unable to construct schema url")
 328
 329         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 330         hidden = obj.copy()
 331         if '@id' in hidden:
 332             del hidden['@id']
 333         if '@type' in hidden:
 334             del hidden['@type']
 335         jsonschema.validate(hidden, schema)
 336
 337 class TypedColumnParser(object):
 338     @staticmethod
 339     def parse_sheet_array_type(value):
 340         """Helper function to parse :array columns in sheet
 341         """
 342         return value.split(', ')
 343
 344     @staticmethod
 345     def parse_sheet_integer_type(value):
 346         """Helper function to parse :integer columns in sheet
 347         """
 348         return int(value)
 349
 350     @staticmethod
 351     def parse_sheet_boolean_type(value):
 352         """Helper function to parse :boolean columns in sheet
 353         """
 354         return bool(value)
 355
 356     @staticmethod
 357     def parse_sheet_timestamp_type(value):
 358         """Helper function to parse :date columns in sheet
 359         """
 360         return value.strftime('%Y-%m-%d')
 361
 362     @staticmethod
 363     def parse_sheet_string_type(value):
 364         """Helper function to parse :string columns in sheet (the default)
 365         """
 366         return unicode(value)
 367
 368     def __getitem__(self, name):
 369         parser = {
 370             'array': self.parse_sheet_array_type,
 371             'boolean': self.parse_sheet_boolean_type,
 372             'integer': self.parse_sheet_integer_type,
 373             'date': self.parse_sheet_timestamp_type,
 374             'string': self.parse_sheet_string_type
 375         }.get(name)
 376         if parser:
 377             return parser
 378         else:
 379             raise RuntimeError("unrecognized column type")
 380
 381     def __call__(self, header, value):
 382         header = header.split(':')
 383         column_type = 'string'
 384         if len(header) > 1:
 385             if header[1] == 'skip':
 386                 return None, None
 387             else:
 388                 column_type = header[1]
 389         return header[0], self[column_type](value)
 390
# Ready-made module-level parser instance; call it as
# typed_column_parser(header, value).
typed_column_parser = TypedColumnParser()
 392
 393 class Document(object):
 394     """Helper class for registering documents
 395
 396     Usage:
 397     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 398     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 399     lysis.create_if_needed(server, lysis_uuid)
 400     """
 401     award = 'U54HG006998'
 402     lab = '/labs/barbara-wold'
 403
 404     def __init__(self, url, document_type, description, aliases=None):
 405         self.url = url
 406         self.filename = os.path.basename(url)
 407         self.document_type = document_type
 408         self.description = description
 409
 410         self.references = []
 411         self.aliases = aliases if aliases is not None else []
 412         self.content_type = None
 413         self.document = None
 414         self.md5sum = None
 415         self.urls = None
 416         self.uuid = None
 417
 418         self.get_document()
 419
 420     def get_document(self):
 421         if os.path.exists(self.url):
 422             with open(self.url, 'r') as instream:
 423                 assert self.url.endswith('pdf')
 424                 self.content_type = 'application/pdf'
 425                 self.document = instream.read()
 426                 self.md5sum = hashlib.md5(self.document)
 427         else:
 428             req = requests.get(self.url)
 429             if req.status_code == 200:
 430                 self.content_type = req.headers['content-type']
 431                 self.document = req.content
 432                 self.md5sum = hashlib.md5(self.document)
 433                 self.urls = [self.url]
 434
 435     def create_payload(self):
 436         document_payload = {
 437             'attachment': {
 438               'download': self.filename,
 439               'type': self.content_type,
 440               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 441               'md5sum': self.md5sum.hexdigest()
 442             },
 443             'document_type': self.document_type,
 444             'description': self.description,
 445             'award': self.award,
 446             'lab': self.lab,
 447         }
 448         if self.aliases:
 449             document_payload['aliases'] = self.aliases
 450         if self.references:
 451             document_payload['references'] = self.references
 452         if self.urls:
 453             document_payload['urls'] = self.urls
 454
 455         return document_payload
 456
 457     def post(self, server):
 458         document_payload = self.create_payload()
 459         return server.post_json('/documents/', document_payload)
 460
 461     def save(self, filename):
 462         payload = self.create_payload()
 463         with open(filename, 'w') as outstream:
 464             outstream.write(pformat(payload))
 465
 466     def create_if_needed(self, server, uuid):
 467         self.uuid = uuid
 468         if uuid is None:
 469             return self.post(server)
 470         else:
 471             return server.get_json(uuid, embed=False)
 472
 473 if __name__ == '__main__':
 474     # try it
 475     from htsworkflow.util.rdfhelp import get_model, dump_model
 476     from htsworkflow.util.rdfjsonld import load_into_model
 477     from pprint import pprint
 478     model = get_model()
 479     logging.basicConfig(level=logging.DEBUG)
 480     encoded = ENCODED('test.encodedcc.org')
 481     encoded.load_netrc()
 482     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 483     pprint(body)
 484     load_into_model(model, body)
 485     #dump_model(model)