htsworkflow/submission/encoded.py

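"""Interface with the encoded software used for ENCODE3 data submission & warehouse

This module provides the ENCODED client class for retrieving objects as
JSON or JSON-LD, validating them against the server's schemas and
submitting changes, plus helpers for parsing typed spreadsheet columns
and for registering documents.
"""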
from __future__ import print_function
import base64
import collections
import hashlib
import logging
import json
import os

import jsonschema
import requests
import six
from six.moves.urllib.parse import urljoin, urlparse, urlunparse

try:
    from collections.abc import Iterable, Mapping, Sequence
except ImportError:  # Python 2: the ABCs live directly in ``collections``
    from collections import Iterable, Mapping, Sequence
  16
  17 LOGGER = logging.getLogger(__name__)
  18
  19 ENCODED_CONTEXT = {
  20     # The None context will get added to the root of the tree and will
  21     # provide common defaults.
  22     None: {
  23         # terms in multiple encoded objects
  24         'award': {'@type': '@id'},
  25         'dataset': {'@type': '@id'},
  26         'description': 'rdf:description',
  27         'documents': {'@type': '@id'},
  28         'experiment': {'@type': '@id'},
  29         'href': {'@type': '@id'},
  30         'lab': {'@type': '@id'},
  31         'library': {'@type': '@id'},
  32         'pi': {'@type': '@id'},
  33         'platform': {'@type': '@id'},
  34         'replicates': {'@type': '@id'},
  35         'submitted_by': {'@type': '@id'},
  36         'url': {'@type': '@id'},
  37     },
  38     # Identify and markup contained classes.
  39     # e.g. in the tree there was a sub-dictionary named 'biosample'
  40     # That dictionary had a term 'biosample_term_id, which is the
  41     # term that should be used as the @id.
  42     'biosample': {
  43         'biosample_term_id': {'@type': '@id'},
  44     },
  45     'experiment': {
  46         "assay_term_id": {"@type": "@id"},
  47         "files": {"@type": "@id"},
  48         "original_files": {"@type": "@id"},
  49     },
  50     # I tried to use the JSON-LD mapping capabilities to convert the lab
  51     # contact information into a vcard record, but the encoded model
  52     # didn't lend itself well to the vcard schema
  53     #'lab': {
  54     #    "address1": "vcard:street-address",
  55     #    "address2": "vcard:street-address",
  56     #    "city": "vcard:locality",
  57     #    "state": "vcard:region",
  58     #    "country": "vcard:country"
  59     #},
  60     'library': {
  61         'nucleic_acid_term_id': {'@type': '@id'}
  62     }
  63 }
  64
  65 #FIXME: this needs to be initialized from rdfns
  66 ENCODED_NAMESPACES = {
  67     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  68     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  69     # rdfs:label)
  70     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  71     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  72     "owl": "http://www.w3.org/2002/07/owl#",
  73     "dc": "htp://purl.org/dc/elements/1.1/",
  74     "xsd": "http://www.w3.org/2001/XMLSchema#",
  75     "vcard": "http://www.w3.org/2006/vcard/ns#",
  76
  77     # for some namespaces I made a best guess for the ontology root.
  78     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  79     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  80     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  81     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  82     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  83     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  84     # NTR: New Term Request space for DCC to implement new ontology terms
  85
  86 }
  87
# Path prefix that ENCODED.get_schema_url puts in front of
# '<object_type>.json' when it builds the URL of a JSON schema.
ENCODED_SCHEMA_ROOT = '/profiles/'
  89
  90
  91 class ENCODED:
  92     '''Programatic access encoded, the software powering ENCODE3's submit site.
  93     '''
  94     def __init__(self, server, contexts=None, namespaces=None):
  95         self.server = server
  96         self.scheme = 'https'
  97         self.username = None
  98         self.password = None
  99         self.contexts = contexts if contexts else ENCODED_CONTEXT
 100         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 101         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 102         self.schemas = {}
 103
 104     def get_auth(self):
 105         return (self.username, self.password)
 106     auth = property(get_auth)
 107
 108     def load_netrc(self):
 109         import netrc
 110         session = netrc.netrc()
 111         authenticators = session.authenticators(self.server)
 112         if authenticators:
 113             self.username = authenticators[0]
 114             self.password = authenticators[2]
 115
 116     def add_jsonld_context(self, tree, default_base):
 117         """Add contexts to various objects in the tree.
 118
 119         tree is a json tree returned from the DCC's encoded database.
 120         contexts is a dictionary of dictionaries containing contexts
 121                 for the various  possible encoded classes.
 122         base, if supplied allows setting the base url that relative
 123             urls will be resolved against.
 124         """
 125         self.add_jsonld_child_context(tree, default_base)
 126         self.add_jsonld_namespaces(tree['@context'])
 127
 128     def add_jsonld_child_context(self, obj, default_base):
 129         '''Add JSON-LD context to the encoded JSON.
 130
 131         This is recursive because some of the IDs were relative URLs
 132         and I needed a way to properly compute a the correct base URL.
 133         '''
 134         # pretend strings aren't iterable
 135         if isinstance(obj, six.string_types):
 136             return
 137
 138         # recurse on container types
 139         if isinstance(obj, collections.Sequence):
 140             # how should I update lists?
 141             for v in obj:
 142                 self.add_jsonld_child_context(v, default_base)
 143             return
 144
 145         if isinstance(obj, collections.Mapping):
 146             for v in obj.values():
 147                 self.add_jsonld_child_context(v, default_base)
 148
 149         # we have an object. attach a context to it.
 150         if self._is_encoded_object(obj):
 151             context = self.create_jsonld_context(obj, default_base)
 152             if len(context) > 0:
 153                 obj.setdefault('@context', {}).update(context)
 154
 155     def add_jsonld_namespaces(self, context):
 156         '''Add shortcut namespaces to a context
 157
 158         Only needs to be run on the top-most context
 159         '''
 160         context.update(self.namespaces)
 161
 162     def create_jsonld_context(self, obj, default_base):
 163         '''Synthesize the context for a encoded type
 164
 165         self.contexts[None] = default context attributes added to any type
 166         self.contexts[type] = context attributes for this type.
 167         '''
 168         obj_type = self.get_object_type(obj)
 169         context = {'@base': urljoin(default_base, obj['@id']),
 170                    '@vocab': self.get_schema_url(obj_type)}
 171         # add in defaults
 172         context.update(self.contexts[None])
 173         for t in obj['@type']:
 174             if t in self.contexts:
 175                 context.update(self.contexts[t])
 176         return context
 177
 178     def get_json(self, obj_id, **kwargs):
 179         '''GET an ENCODE object as JSON and return as dict
 180
 181         Uses prepare_url to allow url short-cuts
 182         if no keyword arguments are specified it will default to adding limit=all
 183         Alternative keyword arguments can be passed in and will be sent to the host.
 184
 185         Known keywords are:
 186           limit - (integer or 'all') how many records to return, all for all of them
 187           embed - (bool) if true expands linking ids into their associated object.
 188           format - text/html or application/json
 189         '''
 190         if len(kwargs) == 0:
 191             kwargs['limit'] = 'all'
 192
 193         url = self.prepare_url(obj_id)
 194         LOGGER.info('requesting url: {}'.format(url))
 195
 196         # do the request
 197
 198         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 199         arguments = {}
 200         if self.username and self.password:
 201             arguments['auth'] = self.auth
 202         response = requests.get(url, headers=self.json_headers,
 203                                 params=kwargs,
 204                                 **arguments)
 205         if not response.status_code == requests.codes.ok:
 206             LOGGER.error("Error http status: {}".format(response.status_code))
 207             response.raise_for_status()
 208         return response.json()
 209
 210     def get_jsonld(self, obj_id, **kwargs):
 211         '''Get ENCODE object as JSONLD annotated with classses contexts
 212
 213         see get_json for documentation about what keywords can be passed.
 214         '''
 215         url = self.prepare_url(obj_id)
 216         json = self.get_json(obj_id, **kwargs)
 217         self.add_jsonld_context(json, url)
 218         return json
 219
 220     def get_object_type(self, obj):
 221         """Return type for a encoded object
 222         """
 223         obj_type = obj.get('@type')
 224         if not obj_type:
 225             raise ValueError('None type')
 226         if isinstance(obj_type, six.string_types):
 227             raise ValueError('@type should be a list, not a string')
 228         if not isinstance(obj_type, collections.Sequence):
 229             raise ValueError('@type is not a sequence')
 230         return obj_type[0]
 231
 232     def get_schema_url(self, object_type):
 233         """Create the ENCODED jsonschema url.
 234
 235         Return the ENCODED object schema url be either
 236         object type name or the collection name one posts to.
 237
 238         For example
 239            server.get_schema_url('experiment') and
 240            server.get_schema_url('/experiments/') both resolve to
 241            SERVER/profiles/experiment.json
 242
 243         Arguments:
 244            object_type (str): either ENCODED object name or collection
 245
 246         Returns:
 247            Schema URL
 248         """
 249         collection_to_type = {
 250             '/biosamples/': 'biosample',
 251             '/datasets/': 'dataset',
 252             '/documents/': 'document',
 253             '/experiments/': 'experiment',
 254             '/libraries/': 'library',
 255             '/replicates/': 'replicate',
 256         }
 257         object_type = collection_to_type.get(object_type, object_type)
 258
 259         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 260
 261     def get_accession_name(self, collection):
 262         """Lookup common object accession name given a collection name.
 263         """
 264         collection_to_accession_name = {
 265             '/experiments/': 'experiment_accession',
 266             '/biosamples/': 'biosample_accession',
 267             '/libraries/': 'library_accession',
 268             '/replicates/': 'uuid',
 269         }
 270
 271         accession_name = collection_to_accession_name.get(collection, None)
 272         if accession_name is None:
 273             raise RuntimeError("Update list of collection to accession names for %s",
 274                                collection)
 275
 276         return accession_name
 277
 278     def _is_encoded_object(self, obj):
 279         '''Test to see if an object is a JSON-LD object
 280
 281         Some of the nested dictionaries lack the @id or @type
 282         information necessary to convert them.
 283         '''
 284         if not isinstance(obj, collections.Iterable):
 285             return False
 286
 287         if '@id' in obj and '@type' in obj:
 288             return True
 289         return False
 290
 291     def patch_json(self, obj_id, changes):
 292         """Given a dictionary of changes push them as a HTTP patch request
 293         """
 294         url = self.prepare_url(obj_id)
 295         LOGGER.info('PATCHing to %s', url)
 296         payload = json.dumps(changes)
 297         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 298         if response.status_code != requests.codes.ok:
 299             LOGGER.error("Error http status: {}".format(response.status_code))
 300             LOGGER.error("Response: %s", response.text)
 301             response.raise_for_status()
 302         return response.json()
 303
 304     def put_json(self, obj_id, new_object):
 305         url = self.prepare_url(obj_id)
 306         LOGGER.info('PUTing to %s', url)
 307         payload = json.dumps(new_object)
 308         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 309         if response.status_code != requests.codes.created:
 310             LOGGER.error("Error http status: {}".format(response.status_code))
 311             response.raise_for_status()
 312         return response.json()
 313
 314     def post_json(self, collection_id, new_object):
 315         url = self.prepare_url(collection_id)
 316         LOGGER.info('POSTing to %s', url)
 317         payload = json.dumps(new_object)
 318
 319         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 320         if response.status_code != requests.codes.created:
 321             LOGGER.error("Error http status: {}".format(response.status_code))
 322             response.raise_for_status()
 323         return response.json()
 324
 325     def prepare_url(self, request_url):
 326         '''This attempts to provide some convienence for accessing a URL
 327
 328         Given a url fragment it will default to :
 329         * requests over http
 330         * requests to self.server
 331
 332         This allows fairly flexible urls. e.g.
 333
 334         prepare_url('/experiments/ENCSR000AEG')
 335         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 336         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 337
 338         should all return the same url
 339         '''
 340         # clean up potentially messy urls
 341         url = urlparse(request_url)._asdict()
 342         if not url['scheme']:
 343             url['scheme'] = self.scheme
 344         if not url['netloc']:
 345             url['netloc'] = self.server
 346         url = urlunparse(url.values())
 347         return url
 348
 349     def search_jsonld(self, **kwargs):
 350         '''Send search request to ENCODED
 351
 352         to do a general search do
 353             searchTerm=term
 354         '''
 355         url = self.prepare_url('/search/')
 356         result = self.get_json(url, **kwargs)
 357         self.convert_search_to_jsonld(result)
 358         return result
 359
 360     def convert_search_to_jsonld(self, result):
 361         '''Add the context to search result
 362
 363         Also remove hard to handle nested attributes
 364           e.g. remove object.term when we have no id
 365         '''
 366         graph = result['@graph']
 367         for i, obj in enumerate(graph):
 368             # suppress nested attributes
 369             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 370
 371         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 372         return result
 373
 374     def validate(self, obj, object_type=None):
 375         """Validate an object against the ENCODED schema
 376
 377         Args:
 378             obj (dictionary): object attributes to be submitted to encoded
 379             object_type (string): ENCODED object name.
 380
 381         Raises:
 382             ValidationError: if the object does not conform to the schema.
 383         """
 384         object_type = object_type if object_type else self.get_object_type(obj)
 385         schema_url = self.get_schema_url(object_type)
 386         if not schema_url:
 387             raise ValueError("Unable to construct schema url")
 388
 389         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 390         hidden = obj.copy()
 391         if '@id' in hidden:
 392             del hidden['@id']
 393         if '@type' in hidden:
 394             del hidden['@type']
 395         jsonschema.validate(hidden, schema)
 396
 397 class TypedColumnParser(object):
 398     @staticmethod
 399     def parse_sheet_array_type(value):
 400         """Helper function to parse :array columns in sheet
 401         """
 402         return value.split(', ')
 403
 404     @staticmethod
 405     def parse_sheet_integer_type(value):
 406         """Helper function to parse :integer columns in sheet
 407         """
 408         return int(value)
 409
 410     @staticmethod
 411     def parse_sheet_boolean_type(value):
 412         """Helper function to parse :boolean columns in sheet
 413         """
 414         return bool(value)
 415
 416     @staticmethod
 417     def parse_sheet_timestamp_type(value):
 418         """Helper function to parse :date columns in sheet
 419         """
 420         return value.strftime('%Y-%m-%d')
 421
 422     @staticmethod
 423     def parse_sheet_string_type(value):
 424         """Helper function to parse :string columns in sheet (the default)
 425         """
 426         return unicode(value)
 427
 428     def __getitem__(self, name):
 429         parser = {
 430             'array': self.parse_sheet_array_type,
 431             'boolean': self.parse_sheet_boolean_type,
 432             'integer': self.parse_sheet_integer_type,
 433             'date': self.parse_sheet_timestamp_type,
 434             'string': self.parse_sheet_string_type
 435         }.get(name)
 436         if parser:
 437             return parser
 438         else:
 439             raise RuntimeError("unrecognized column type")
 440
 441     def __call__(self, header, value):
 442         header = header.split(':')
 443         column_type = 'string'
 444         if len(header) > 1:
 445             if header[1] == 'skip':
 446                 return None, None
 447             else:
 448                 column_type = header[1]
 449         return header[0], self[column_type](value)
 450
# Ready-made module-level TypedColumnParser instance.
typed_column_parser = TypedColumnParser()
 452
 453 class Document(object):
 454     """Helper class for registering documents
 455
 456     Usage:
 457     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 458     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 459     lysis.create_if_needed(server, lysis_uuid)
 460     """
 461     award = 'U54HG006998'
 462     lab = '/labs/barbara-wold'
 463
 464     def __init__(self, url, document_type, description, aliases=None):
 465         self.url = url
 466         self.filename = os.path.basename(url)
 467         self.document_type = document_type
 468         self.description = description
 469
 470         self.references = []
 471         self.aliases = None
 472         if aliases:
 473             if isinstance(aliases, list):
 474                 self.aliases = aliases
 475             else:
 476                 raise ValueError("Aliases needs to be a list")
 477         self.content_type = None
 478         self.document = None
 479         self.md5sum = None
 480         self.urls = None
 481         self.uuid = None
 482
 483         self.get_document()
 484
 485     def get_document(self):
 486         if os.path.exists(self.url):
 487             with open(self.url, 'r') as instream:
 488                 assert self.url.endswith('pdf')
 489                 self.content_type = 'application/pdf'
 490                 self.document = instream.read()
 491                 self.md5sum = hashlib.md5(self.document)
 492         else:
 493             req = requests.get(self.url)
 494             if req.status_code == 200:
 495                 self.content_type = req.headers['content-type']
 496                 self.document = req.content
 497                 self.md5sum = hashlib.md5(self.document)
 498                 self.urls = [self.url]
 499
 500     def create_payload(self):
 501         document_payload = {
 502             'attachment': {
 503               'download': self.filename,
 504               'type': self.content_type,
 505               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 506               'md5sum': self.md5sum.hexdigest()
 507             },
 508             'document_type': self.document_type,
 509             'description': self.description,
 510             'award': self.award,
 511             'lab': self.lab,
 512         }
 513         if self.aliases:
 514             document_payload['aliases'] = self.aliases
 515         if self.references:
 516             document_payload['references'] = self.references
 517         if self.urls:
 518             document_payload['urls'] = self.urls
 519
 520         return document_payload
 521
 522     def post(self, server):
 523         document_payload = self.create_payload()
 524         server.validate(document_payload, 'document')
 525         return server.post_json('/documents/', document_payload)
 526
 527     def save(self, filename):
 528         payload = self.create_payload()
 529         with open(filename, 'w') as outstream:
 530             outstream.write(pformat(payload))
 531
 532     def create_if_needed(self, server, uuid):
 533         self.uuid = uuid
 534         if uuid is None:
 535             return self.post(server)
 536         else:
 537             return server.get_json(uuid, embed=False)
 538
 539 if __name__ == '__main__':
 540     # try it
 541     from htsworkflow.util.rdfhelp import get_model, dump_model
 542     from htsworkflow.util.rdfjsonld import load_into_model
 543     from pprint import pprint
 544     model = get_model()
 545     logging.basicConfig(level=logging.DEBUG)
 546     encoded = ENCODED('test.encodedcc.org')
 547     encoded.load_netrc()
 548     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 549     pprint(body)
 550     load_into_model(model, body)
 551     #dump_model(model)