htsworkflow/submission/encoded.py

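"""Interface with the encoded software for ENCODE3 data submission & warehouse.

This module allows retrieving objects from an encoded server as JSON or
JSON-LD, validating new objects against the server's JSON schemas, and
submitting them (including spreadsheet rows and attached documents).
"""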
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
import re
from pprint import pformat

import jsonschema
import pandas
import requests
import six
from six.moves.urllib.parse import urljoin, urlparse, urlunparse

try:
    # Python 3: the container ABCs live in collections.abc; the aliases in
    # ``collections`` itself were removed in Python 3.10.
    import collections.abc as collections_abc
except ImportError:
    # Python 2 fallback: the ABCs are only available from ``collections``.
    collections_abc = collections
  18
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
  20
# Default JSON-LD term definitions, keyed by encoded object type.
# ENCODED.create_jsonld_context() merges the None entry into every object's
# context and then layers on the entries matching the object's '@type' values.
ENCODED_CONTEXT = {
    # The None context will get added to the root of the tree and will
    # provide common defaults.
    None: {
        # terms in multiple encoded objects
        'award': {'@type': '@id'},
        'dataset': {'@type': '@id'},
        'description': 'rdf:description',
        'documents': {'@type': '@id'},
        'experiment': {'@type': '@id'},
        'href': {'@type': '@id'},
        'lab': {'@type': '@id'},
        'library': {'@type': '@id'},
        'pi': {'@type': '@id'},
        'platform': {'@type': '@id'},
        'replicates': {'@type': '@id'},
        'submitted_by': {'@type': '@id'},
        'url': {'@type': '@id'},
    },
    # Identify and mark up contained classes.
    # e.g. in the tree there was a sub-dictionary named 'biosample'.
    # That dictionary had a term 'biosample_term_id', which is the
    # term that should be used as the @id.
    'Biosample': {
        'biosample_term_id': {'@type': '@id'},
    },
    'Experiment': {
        "assay_term_id": {"@type": "@id"},
        "files": {"@type": "@id"},
        "original_files": {"@type": "@id"},
    },
    # I tried to use the JSON-LD mapping capabilities to convert the lab
    # contact information into a vcard record, but the encoded model
    # didn't lend itself well to the vcard schema.
    #'lab': {
    #    "address1": "vcard:street-address",
    #    "address2": "vcard:street-address",
    #    "city": "vcard:locality",
    #    "state": "vcard:region",
    #    "country": "vcard:country"
    #},
    'Library': {
        'nucleic_acid_term_id': {'@type': '@id'}
    }
}
  66
  67 #FIXME: this needs to be initialized from rdfns
  68 ENCODED_NAMESPACES = {
  69     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  70     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  71     # rdfs:label)
  72     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  73     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  74     "owl": "http://www.w3.org/2002/07/owl#",
  75     "dc": "htp://purl.org/dc/elements/1.1/",
  76     "xsd": "http://www.w3.org/2001/XMLSchema#",
  77     "vcard": "http://www.w3.org/2006/vcard/ns#",
  78
  79     # for some namespaces I made a best guess for the ontology root.
  80     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  81     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  82     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  83     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  84     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  85     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  86     # NTR: New Term Request space for DCC to implement new ontology terms
  87
  88 }
  89
# URL path prefix for the server's JSON schemas; ENCODED.get_schema_url()
# appends '<object_type>.json' to it.
ENCODED_SCHEMA_ROOT = '/profiles/'
  91
  92
  93 class ENCODED:
  94     '''Programatic access encoded, the software powering ENCODE3's submit site.
  95     '''
  96     def __init__(self, server, contexts=None, namespaces=None):
  97         self.server = server
  98         self.scheme = 'https'
  99         self.username = None
 100         self.password = None
 101         self.contexts = contexts if contexts else ENCODED_CONTEXT
 102         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 103         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 104         self.schemas = {}
 105
 106     def get_auth(self):
 107         return (self.username, self.password)
 108     auth = property(get_auth)
 109
 110     def load_netrc(self):
 111         import netrc
 112         session = netrc.netrc()
 113         authenticators = session.authenticators(self.server)
 114         if authenticators:
 115             self.username = authenticators[0]
 116             self.password = authenticators[2]
 117
 118     def add_jsonld_context(self, tree, default_base):
 119         """Add contexts to various objects in the tree.
 120
 121         tree is a json tree returned from the DCC's encoded database.
 122         contexts is a dictionary of dictionaries containing contexts
 123                 for the various  possible encoded classes.
 124         base, if supplied allows setting the base url that relative
 125             urls will be resolved against.
 126         """
 127         self.add_jsonld_child_context(tree, default_base)
 128         self.add_jsonld_namespaces(tree['@context'])
 129
 130     def add_jsonld_child_context(self, obj, default_base):
 131         '''Add JSON-LD context to the encoded JSON.
 132
 133         This is recursive because some of the IDs were relative URLs
 134         and I needed a way to properly compute a the correct base URL.
 135         '''
 136         # pretend strings aren't iterable
 137         if isinstance(obj, six.string_types):
 138             return
 139
 140         # recurse on container types
 141         if isinstance(obj, collections.Sequence):
 142             # how should I update lists?
 143             for v in obj:
 144                 self.add_jsonld_child_context(v, default_base)
 145             return
 146
 147         if isinstance(obj, collections.Mapping):
 148             for v in obj.values():
 149                 self.add_jsonld_child_context(v, default_base)
 150
 151         # we have an object. attach a context to it.
 152         if self._is_encoded_object(obj):
 153             context = self.create_jsonld_context(obj, default_base)
 154             if len(context) > 0:
 155                 # this is a total hack for relese 33 of
 156                 # encoded. They changed their model and
 157                 # i'm not sure what to do about it.
 158                 if obj.get('@context') == '/terms/':
 159                     del obj['@context']
 160                 obj.setdefault('@context', {}).update(context)
 161
 162     def add_jsonld_namespaces(self, context):
 163         '''Add shortcut namespaces to a context
 164
 165         Only needs to be run on the top-most context
 166         '''
 167         context.update(self.namespaces)
 168
 169     def create_jsonld_context(self, obj, default_base):
 170         '''Synthesize the context for a encoded type
 171
 172         self.contexts[None] = default context attributes added to any type
 173         self.contexts[type] = context attributes for this type.
 174         '''
 175         obj_type = self.get_object_type(obj)
 176         context = {'@base': urljoin(default_base, obj['@id']),
 177                    '@vocab': self.get_schema_url(obj_type)}
 178         # add in defaults
 179         context.update(self.contexts[None])
 180         for t in obj['@type']:
 181             if t in self.contexts:
 182                 context.update(self.contexts[t])
 183         return context
 184
 185     def get_json(self, obj_id, **kwargs):
 186         '''GET an ENCODE object as JSON and return as dict
 187
 188         Uses prepare_url to allow url short-cuts
 189         if no keyword arguments are specified it will default to adding limit=all
 190         Alternative keyword arguments can be passed in and will be sent to the host.
 191
 192         Known keywords are:
 193           limit - (integer or 'all') how many records to return, all for all of them
 194           embed - (bool) if true expands linking ids into their associated object.
 195           format - text/html or application/json
 196         '''
 197         if len(kwargs) == 0:
 198             kwargs['limit'] = 'all'
 199
 200         url = self.prepare_url(obj_id)
 201         LOGGER.info('requesting url: {}'.format(url))
 202
 203         # do the request
 204
 205         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 206         arguments = {}
 207         if self.username and self.password:
 208             arguments['auth'] = self.auth
 209         response = requests.get(url, headers=self.json_headers,
 210                                 params=kwargs,
 211                                 **arguments)
 212         if not response.status_code == requests.codes.ok:
 213             LOGGER.error("Error http status: {}".format(response.status_code))
 214             response.raise_for_status()
 215         return response.json()
 216
 217     def get_jsonld(self, obj_id, **kwargs):
 218         '''Get ENCODE object as JSONLD annotated with classses contexts
 219
 220         see get_json for documentation about what keywords can be passed.
 221         '''
 222         url = self.prepare_url(obj_id)
 223         json = self.get_json(obj_id, **kwargs)
 224         self.add_jsonld_context(json, url)
 225         return json
 226
 227     def get_object_type(self, obj):
 228         """Return type for a encoded object
 229         """
 230         obj_type = obj.get('@type')
 231         if not obj_type:
 232             raise ValueError('None type')
 233         if isinstance(obj_type, six.string_types):
 234             raise ValueError('@type should be a list, not a string')
 235         if not isinstance(obj_type, collections.Sequence):
 236             raise ValueError('@type is not a sequence')
 237         return obj_type[0]
 238
 239     def get_schema_url(self, object_type):
 240         """Create the ENCODED jsonschema url.
 241
 242         Return the ENCODED object schema url be either
 243         object type name or the collection name one posts to.
 244
 245         For example
 246            server.get_schema_url('experiment') and
 247            server.get_schema_url('/experiments/') both resolve to
 248            SERVER/profiles/experiment.json
 249
 250         Arguments:
 251            object_type (str): either ENCODED object name or collection
 252
 253         Returns:
 254            Schema URL
 255         """
 256         collection_to_type = {
 257             '/biosamples/': 'biosample',
 258             '/datasets/': 'dataset',
 259             '/documents/': 'document',
 260             '/experiments/': 'experiment',
 261             '/libraries/': 'library',
 262             '/replicates/': 'replicate',
 263         }
 264         object_type = collection_to_type.get(object_type, object_type)
 265
 266         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 267
 268     def get_accession_name(self, collection):
 269         """Lookup common object accession name given a collection name.
 270         """
 271         collection_to_accession_name = {
 272             '/experiments/': 'experiment_accession',
 273             '/biosamples/': 'biosample_accession',
 274             '/libraries/': 'library_accession',
 275             '/replicates/': 'uuid',
 276         }
 277
 278         accession_name = collection_to_accession_name.get(collection, None)
 279         if accession_name is None:
 280             raise RuntimeError("Update list of collection to accession names for %s",
 281                                collection)
 282
 283         return accession_name
 284
 285     def _is_encoded_object(self, obj):
 286         '''Test to see if an object is a JSON-LD object
 287
 288         Some of the nested dictionaries lack the @id or @type
 289         information necessary to convert them.
 290         '''
 291         if not isinstance(obj, collections.Iterable):
 292             return False
 293
 294         if '@id' in obj and '@type' in obj:
 295             return True
 296         return False
 297
 298     def patch_json(self, obj_id, changes):
 299         """Given a dictionary of changes push them as a HTTP patch request
 300         """
 301         url = self.prepare_url(obj_id)
 302         LOGGER.info('PATCHing to %s', url)
 303         payload = json.dumps(changes)
 304         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 305         if response.status_code != requests.codes.ok:
 306             LOGGER.error("Error http status: {}".format(response.status_code))
 307             LOGGER.error("Response: %s", response.text)
 308             response.raise_for_status()
 309         return response.json()
 310
 311     def put_json(self, obj_id, new_object):
 312         url = self.prepare_url(obj_id)
 313         LOGGER.info('PUTing to %s', url)
 314         payload = json.dumps(new_object)
 315         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 316         if response.status_code != requests.codes.created:
 317             LOGGER.error("Error http status: {}".format(response.status_code))
 318             response.raise_for_status()
 319         return response.json()
 320
 321     def post_json(self, collection_id, new_object):
 322         url = self.prepare_url(collection_id)
 323         LOGGER.info('POSTing to %s', url)
 324         payload = json.dumps(new_object)
 325
 326         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 327         if response.status_code != requests.codes.created:
 328             LOGGER.error("http status: {}".format(response.status_code))
 329             LOGGER.error("message: {}".format(response.content))
 330             response.raise_for_status()
 331         return response.json()
 332
 333     def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
 334         """Create new ENCODED objects using metadata encoded in pandas DataFrame
 335
 336         The DataFrame column names need to encode the attribute names,
 337         and in some cases also include some additional type information.
 338         (see TypedColumnParser)
 339
 340         Arguments:
 341            collection (str): name of collection to create new objects in
 342            sheet (pandas.DataFrame): DataFrame with objects to create,
 343                assuming the appropriate accession number is empty.
 344                additional the accession number and uuid is updated if the object
 345                is created.
 346            dry_run (bool): whether or not to skip the code to post the objects
 347            verbose (bool): print the http responses.
 348
 349         Returns:
 350            list of created objects.
 351
 352         Raises:
 353            jsonschema.ValidationError if the object doesn't validate against
 354               the encoded jsonschema.
 355         """
 356         accession_name = self.get_accession_name(collection)
 357
 358         created = []
 359         columns = sheet.columns
 360         tosubmit = sheet[pandas.isnull(sheet[accession_name])]
 361
 362         for i in tosubmit.index:
 363             row = tosubmit.ix[i]
 364             new_object = {}
 365             for k in columns:
 366                 if pandas.notnull(row[k]):
 367                     name, value = typed_column_parser(k, row[k])
 368                     if name is None:
 369                         continue
 370                     new_object[name] = value
 371
 372             try:
 373                 self.validate(new_object, collection)
 374             except jsonschema.ValidationError as e:
 375                 LOGGER.error("Validation error row %s", i)
 376                 raise e
 377
 378             accession = row[accession_name]
 379             description = row.get('description', None)
 380
 381             if not dry_run:
 382                 response = self.post_json(collection, new_object)
 383                 if verbose:
 384                     print("Reponse {}".format(response))
 385
 386                 obj = response['@graph'][0]
 387                 created.append(obj)
 388                 accession = obj.get('accession')
 389                 uuid = obj.get('uuid')
 390
 391                 if accession:
 392                     sheet[accession_name][i] = accession
 393                 else:
 394                     accession = uuid
 395
 396                 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
 397                     sheet['uuid'][i] = uuid
 398
 399                 print("row {} created: {}".format(i, accession))
 400             else:
 401                 created.append(new_object)
 402             LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
 403
 404         return created
 405
 406     def prepare_url(self, request_url):
 407         '''This attempts to provide some convienence for accessing a URL
 408
 409         Given a url fragment it will default to :
 410         * requests over http
 411         * requests to self.server
 412
 413         This allows fairly flexible urls. e.g.
 414
 415         prepare_url('/experiments/ENCSR000AEG')
 416         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 417         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 418
 419         should all return the same url
 420         '''
 421         # clean up potentially messy urls
 422         url = urlparse(request_url)._asdict()
 423         if not url['scheme']:
 424             url['scheme'] = self.scheme
 425         if not url['netloc']:
 426             url['netloc'] = self.server
 427         url = urlunparse(url.values())
 428         return url
 429
 430     def search_jsonld(self, **kwargs):
 431         '''Send search request to ENCODED
 432
 433         to do a general search do
 434             searchTerm=term
 435         '''
 436         url = self.prepare_url('/search/')
 437         result = self.get_json(url, **kwargs)
 438         self.convert_search_to_jsonld(result)
 439         return result
 440
 441     def convert_search_to_jsonld(self, result):
 442         '''Add the context to search result
 443
 444         Also remove hard to handle nested attributes
 445           e.g. remove object.term when we have no id
 446         '''
 447         graph = result['@graph']
 448         for i, obj in enumerate(graph):
 449             # suppress nested attributes
 450             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 451
 452         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 453         return result
 454
 455     def validate(self, obj, object_type=None):
 456         """Validate an object against the ENCODED schema
 457
 458         Args:
 459             obj (dictionary): object attributes to be submitted to encoded
 460             object_type (string): ENCODED object name.
 461
 462         Raises:
 463             ValidationError: if the object does not conform to the schema.
 464         """
 465         object_type = object_type if object_type else self.get_object_type(obj)
 466         schema_url = self.get_schema_url(object_type)
 467         if not schema_url:
 468             raise ValueError("Unable to construct schema url")
 469
 470         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 471         hidden = obj.copy()
 472         if '@id' in hidden:
 473             del hidden['@id']
 474         if '@type' in hidden:
 475             del hidden['@type']
 476         jsonschema.validate(hidden, schema)
 477
 478 class TypedColumnParser(object):
 479     @staticmethod
 480     def parse_sheet_array_type(value):
 481         """Helper function to parse :array columns in sheet
 482         """
 483         return re.split(',\s*', value)
 484
 485     @staticmethod
 486     def parse_sheet_integer_type(value):
 487         """Helper function to parse :integer columns in sheet
 488         """
 489         return int(value)
 490
 491     @staticmethod
 492     def parse_sheet_boolean_type(value):
 493         """Helper function to parse :boolean columns in sheet
 494         """
 495         return bool(value)
 496
 497     @staticmethod
 498     def parse_sheet_timestamp_type(value):
 499         """Helper function to parse :date columns in sheet
 500         """
 501         return value.strftime('%Y-%m-%d')
 502
 503     @staticmethod
 504     def parse_sheet_string_type(value):
 505         """Helper function to parse :string columns in sheet (the default)
 506         """
 507         return str(value)
 508
 509     def __getitem__(self, name):
 510         parser = {
 511             'array': self.parse_sheet_array_type,
 512             'boolean': self.parse_sheet_boolean_type,
 513             'integer': self.parse_sheet_integer_type,
 514             'date': self.parse_sheet_timestamp_type,
 515             'string': self.parse_sheet_string_type
 516         }.get(name)
 517         if parser:
 518             return parser
 519         else:
 520             raise RuntimeError("unrecognized column type")
 521
 522     def __call__(self, header, value):
 523         header = header.split(':')
 524         column_type = 'string'
 525         if len(header) > 1:
 526             if header[1] == 'skip':
 527                 return None, None
 528             else:
 529                 column_type = header[1]
 530         return header[0], self[column_type](value)
 531
 532 typed_column_parser = TypedColumnParser()
 533
 534 class Document(object):
 535     """Helper class for registering documents
 536
 537     Usage:
 538     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 539     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 540     lysis.create_if_needed(server, lysis_uuid)
 541     """
 542     award = 'U54HG006998'
 543     lab = '/labs/barbara-wold'
 544
 545     def __init__(self, url, document_type, description, aliases=None):
 546         self.url = url
 547         self.filename = os.path.basename(url)
 548         self.document_type = document_type
 549         self.description = description
 550
 551         self.references = []
 552         self.aliases = None
 553         if aliases:
 554             if isinstance(aliases, list):
 555                 self.aliases = aliases
 556             else:
 557                 raise ValueError("Aliases needs to be a list")
 558         self.content_type = None
 559         self.document = None
 560         self.md5sum = None
 561         self.urls = None
 562         self.uuid = None
 563
 564         self.get_document()
 565
 566     def get_document(self):
 567         if os.path.exists(self.url):
 568             with open(self.url, 'rb') as instream:
 569                 assert self.url.endswith('pdf')
 570                 self.content_type = 'application/pdf'
 571                 self.document = instream.read()
 572                 self.md5sum = hashlib.md5(self.document)
 573         else:
 574             req = requests.get(self.url)
 575             if req.status_code == 200:
 576                 self.content_type = req.headers['content-type']
 577                 self.document = req.content
 578                 self.md5sum = hashlib.md5(self.document)
 579                 self.urls = [self.url]
 580
 581     def create_payload(self):
 582         document_payload = {
 583             'attachment': {
 584               'download': self.filename,
 585               'type': self.content_type,
 586               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
 587               'md5sum': self.md5sum.hexdigest()
 588             },
 589             'document_type': self.document_type,
 590             'description': self.description,
 591             'award': self.award,
 592             'lab': self.lab,
 593         }
 594         if self.aliases:
 595             document_payload['aliases'] = self.aliases
 596         if self.references:
 597             document_payload['references'] = self.references
 598         if self.urls:
 599             document_payload['urls'] = self.urls
 600
 601         return document_payload
 602
 603     def post(self, server):
 604         document_payload = self.create_payload()
 605         server.validate(document_payload, 'document')
 606         return server.post_json('/documents/', document_payload)
 607
 608     def save(self, filename):
 609         payload = self.create_payload()
 610         with open(filename, 'w') as outstream:
 611             outstream.write(pformat(payload))
 612
 613     def create_if_needed(self, server, uuid):
 614         self.uuid = uuid
 615         if uuid is None:
 616             return self.post(server)
 617         else:
 618             return server.get_json(uuid, embed=False)
 619
 620 if __name__ == '__main__':
 621     # try it
 622     from htsworkflow.util.rdfhelp import get_model, dump_model
 623     from htsworkflow.util.rdfjsonld import load_into_model
 624     from pprint import pprint
 625     model = get_model()
 626     logging.basicConfig(level=logging.DEBUG)
 627     encoded = ENCODED('test.encodedcc.org')
 628     encoded.load_netrc()
 629     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 630     pprint(body)
 631     load_into_model(model, body)
 632     #dump_model(model)