htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
   3 This allows retrieving blocks
   4 """
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
import re
from pprint import pformat

import jsonschema
import numpy
import pandas
import requests
import six
from six.moves.urllib.parse import urljoin, urlparse, urlunparse

try:
    from collections.abc import Iterable, Mapping, Sequence
except ImportError:  # Python 2 keeps the ABCs directly in ``collections``
    from collections import Iterable, Mapping, Sequence
  19
  20 LOGGER = logging.getLogger(__name__)
  21
  22 ENCODED_CONTEXT = {
  23     # The None context will get added to the root of the tree and will
  24     # provide common defaults.
  25     None: {
  26         # terms in multiple encoded objects
  27         'award': {'@type': '@id'},
  28         'dataset': {'@type': '@id'},
  29         'description': 'rdf:description',
  30         'documents': {'@type': '@id'},
  31         'experiment': {'@type': '@id'},
  32         'href': {'@type': '@id'},
  33         'lab': {'@type': '@id'},
  34         'library': {'@type': '@id'},
  35         'pi': {'@type': '@id'},
  36         'platform': {'@type': '@id'},
  37         'replicates': {'@type': '@id'},
  38         'submitted_by': {'@type': '@id'},
  39         'url': {'@type': '@id'},
  40     },
  41     # Identify and markup contained classes.
  42     # e.g. in the tree there was a sub-dictionary named 'biosample'
  43     # That dictionary had a term 'biosample_term_id, which is the
  44     # term that should be used as the @id.
  45     'Biosample': {
  46         'biosample_term_id': {'@type': '@id'},
  47     },
  48     'Experiment': {
  49         "assay_term_id": {"@type": "@id"},
  50         "files": {"@type": "@id"},
  51         "original_files": {"@type": "@id"},
  52     },
  53     # I tried to use the JSON-LD mapping capabilities to convert the lab
  54     # contact information into a vcard record, but the encoded model
  55     # didn't lend itself well to the vcard schema
  56     #'lab': {
  57     #    "address1": "vcard:street-address",
  58     #    "address2": "vcard:street-address",
  59     #    "city": "vcard:locality",
  60     #    "state": "vcard:region",
  61     #    "country": "vcard:country"
  62     #},
  63     'Library': {
  64         'nucleic_acid_term_id': {'@type': '@id'}
  65     }
  66 }
  67
  68 #FIXME: this needs to be initialized from rdfns
  69 ENCODED_NAMESPACES = {
  70     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  71     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  72     # rdfs:label)
  73     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  74     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  75     "owl": "http://www.w3.org/2002/07/owl#",
  76     "dc": "htp://purl.org/dc/elements/1.1/",
  77     "xsd": "http://www.w3.org/2001/XMLSchema#",
  78     "vcard": "http://www.w3.org/2006/vcard/ns#",
  79
  80     # for some namespaces I made a best guess for the ontology root.
  81     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  82     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  83     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  84     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  85     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  86     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  87     # NTR: New Term Request space for DCC to implement new ontology terms
  88
  89 }
  90
  91 ENCODED_SCHEMA_ROOT = '/profiles/'
  92
  93
  94 class ENCODED:
  95     '''Programatic access encoded, the software powering ENCODE3's submit site.
  96     '''
  97     def __init__(self, server, contexts=None, namespaces=None):
  98         self.server = server
  99         self.scheme = 'https'
 100         self.username = None
 101         self.password = None
 102         self.contexts = contexts if contexts else ENCODED_CONTEXT
 103         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 104         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 105         self.schemas = {}
 106
 107     def get_auth(self):
 108         return (self.username, self.password)
 109     auth = property(get_auth)
 110
 111     def load_netrc(self):
 112         import netrc
 113         session = netrc.netrc()
 114         authenticators = session.authenticators(self.server)
 115         if authenticators:
 116             self.username = authenticators[0]
 117             self.password = authenticators[2]
 118
 119     def add_jsonld_context(self, tree, default_base):
 120         """Add contexts to various objects in the tree.
 121
 122         tree is a json tree returned from the DCC's encoded database.
 123         contexts is a dictionary of dictionaries containing contexts
 124                 for the various  possible encoded classes.
 125         base, if supplied allows setting the base url that relative
 126             urls will be resolved against.
 127         """
 128         self.add_jsonld_child_context(tree, default_base)
 129         self.add_jsonld_namespaces(tree['@context'])
 130
 131     def add_jsonld_child_context(self, obj, default_base):
 132         '''Add JSON-LD context to the encoded JSON.
 133
 134         This is recursive because some of the IDs were relative URLs
 135         and I needed a way to properly compute a the correct base URL.
 136         '''
 137         # pretend strings aren't iterable
 138         if isinstance(obj, six.string_types):
 139             return
 140
 141         # recurse on container types
 142         if isinstance(obj, collections.Sequence):
 143             # how should I update lists?
 144             for v in obj:
 145                 self.add_jsonld_child_context(v, default_base)
 146             return
 147
 148         if isinstance(obj, collections.Mapping):
 149             for v in obj.values():
 150                 self.add_jsonld_child_context(v, default_base)
 151
 152         # we have an object. attach a context to it.
 153         if self._is_encoded_object(obj):
 154             context = self.create_jsonld_context(obj, default_base)
 155             if len(context) > 0:
 156                 # this is a total hack for relese 33 of
 157                 # encoded. They changed their model and
 158                 # i'm not sure what to do about it.
 159                 if obj.get('@context') == '/terms/':
 160                     del obj['@context']
 161                 obj.setdefault('@context', {}).update(context)
 162
 163     def add_jsonld_namespaces(self, context):
 164         '''Add shortcut namespaces to a context
 165
 166         Only needs to be run on the top-most context
 167         '''
 168         context.update(self.namespaces)
 169
 170     def create_jsonld_context(self, obj, default_base):
 171         '''Synthesize the context for a encoded type
 172
 173         self.contexts[None] = default context attributes added to any type
 174         self.contexts[type] = context attributes for this type.
 175         '''
 176         obj_type = self.get_object_type(obj)
 177         context = {'@base': urljoin(default_base, obj['@id']),
 178                    '@vocab': self.get_schema_url(obj_type)}
 179         # add in defaults
 180         context.update(self.contexts[None])
 181         for t in obj['@type']:
 182             if t in self.contexts:
 183                 context.update(self.contexts[t])
 184         return context
 185
 186     def get_json(self, obj_id, **kwargs):
 187         '''GET an ENCODE object as JSON and return as dict
 188
 189         Uses prepare_url to allow url short-cuts
 190         if no keyword arguments are specified it will default to adding limit=all
 191         Alternative keyword arguments can be passed in and will be sent to the host.
 192
 193         Known keywords are:
 194           limit - (integer or 'all') how many records to return, all for all of them
 195           embed - (bool) if true expands linking ids into their associated object.
 196           format - text/html or application/json
 197         '''
 198         if len(kwargs) == 0:
 199             kwargs['limit'] = 'all'
 200
 201         url = self.prepare_url(obj_id)
 202         LOGGER.info('requesting url: {}'.format(url))
 203
 204         # do the request
 205
 206         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 207         arguments = {}
 208         if self.username and self.password:
 209             arguments['auth'] = self.auth
 210         response = requests.get(url, headers=self.json_headers,
 211                                 params=kwargs,
 212                                 **arguments)
 213         if not response.status_code == requests.codes.ok:
 214             LOGGER.error("Error http status: {}".format(response.status_code))
 215             response.raise_for_status()
 216         return response.json()
 217
 218     def get_jsonld(self, obj_id, **kwargs):
 219         '''Get ENCODE object as JSONLD annotated with classses contexts
 220
 221         see get_json for documentation about what keywords can be passed.
 222         '''
 223         url = self.prepare_url(obj_id)
 224         json = self.get_json(obj_id, **kwargs)
 225         self.add_jsonld_context(json, url)
 226         return json
 227
 228     def get_object_type(self, obj):
 229         """Return type for a encoded object
 230         """
 231         obj_type = obj.get('@type')
 232         if not obj_type:
 233             raise ValueError('None type')
 234         if isinstance(obj_type, six.string_types):
 235             raise ValueError('@type should be a list, not a string')
 236         if not isinstance(obj_type, collections.Sequence):
 237             raise ValueError('@type is not a sequence')
 238         return obj_type[0]
 239
 240     def get_schema_url(self, object_type):
 241         """Create the ENCODED jsonschema url.
 242
 243         Return the ENCODED object schema url be either
 244         object type name or the collection name one posts to.
 245
 246         For example
 247            server.get_schema_url('experiment') and
 248            server.get_schema_url('/experiments/') both resolve to
 249            SERVER/profiles/experiment.json
 250
 251         Arguments:
 252            object_type (str): either ENCODED object name or collection
 253
 254         Returns:
 255            Schema URL
 256         """
 257         collection_to_type = {
 258             '/biosamples/': 'biosample',
 259             '/datasets/': 'dataset',
 260             '/documents/': 'document',
 261             '/experiments/': 'experiment',
 262             '/libraries/': 'library',
 263             '/replicates/': 'replicate',
 264             '/file/': 'file',
 265         }
 266         object_type = collection_to_type.get(object_type, object_type)
 267
 268         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 269
 270     def get_accession_name(self, collection):
 271         """Lookup common object accession name given a collection name.
 272         """
 273         collection_to_accession_name = {
 274             '/experiments/': 'experiment_accession',
 275             '/biosamples/': 'biosample_accession',
 276             '/libraries/': 'library_accession',
 277             '/replicates/': 'uuid',
 278         }
 279
 280         accession_name = collection_to_accession_name.get(collection, None)
 281         if accession_name is None:
 282             raise RuntimeError("Update list of collection to accession names for %s",
 283                                collection)
 284
 285         return accession_name
 286
 287     def _is_encoded_object(self, obj):
 288         '''Test to see if an object is a JSON-LD object
 289
 290         Some of the nested dictionaries lack the @id or @type
 291         information necessary to convert them.
 292         '''
 293         if not isinstance(obj, collections.Iterable):
 294             return False
 295
 296         if '@id' in obj and '@type' in obj:
 297             return True
 298         return False
 299
 300     def patch_json(self, obj_id, changes):
 301         """Given a dictionary of changes push them as a HTTP patch request
 302         """
 303         url = self.prepare_url(obj_id)
 304         LOGGER.info('PATCHing to %s', url)
 305         payload = json.dumps(changes)
 306         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 307         if response.status_code != requests.codes.ok:
 308             LOGGER.error("Error http status: {}".format(response.status_code))
 309             LOGGER.error("Response: %s", response.text)
 310             response.raise_for_status()
 311         return response.json()
 312
 313     def put_json(self, obj_id, new_object):
 314         url = self.prepare_url(obj_id)
 315         LOGGER.info('PUTing to %s', url)
 316         payload = json.dumps(new_object)
 317         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 318         if response.status_code != requests.codes.created:
 319             LOGGER.error("Error http status: {}".format(response.status_code))
 320             response.raise_for_status()
 321         return response.json()
 322
 323     def post_json(self, collection_id, new_object):
 324         url = self.prepare_url(collection_id)
 325         LOGGER.info('POSTing to %s', url)
 326         payload = json.dumps(new_object)
 327
 328         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 329         if response.status_code != requests.codes.created:
 330             LOGGER.error("http status: {}".format(response.status_code))
 331             LOGGER.error("message: {}".format(response.content))
 332             response.raise_for_status()
 333         return response.json()
 334
 335     def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
 336         """Create new ENCODED objects using metadata encoded in pandas DataFrame
 337
 338         The DataFrame column names need to encode the attribute names,
 339         and in some cases also include some additional type information.
 340         (see TypedColumnParser)
 341
 342         Arguments:
 343            collection (str): name of collection to create new objects in
 344            sheet (pandas.DataFrame): DataFrame with objects to create,
 345                assuming the appropriate accession number is empty.
 346                additional the accession number and uuid is updated if the object
 347                is created.
 348            dry_run (bool): whether or not to skip the code to post the objects
 349            verbose (bool): print the http responses.
 350
 351         Returns:
 352            list of created objects.
 353
 354         Raises:
 355            jsonschema.ValidationError if the object doesn't validate against
 356               the encoded jsonschema.
 357         """
 358         accession_name = self.get_accession_name(collection)
 359
 360         to_create = self.prepare_objects_from_sheet(collection, sheet)
 361
 362         created = []
 363         accessions = []
 364         uuids = []
 365         for i, new_object in to_create:
 366             if new_object:
 367                 accession = new_object.get('accession')
 368                 uuid = new_object.get('uuid')
 369                 description = new_object.get('description')
 370
 371                 posted_object = self.post_object_from_row(
 372                     collection, i, new_object, dry_run, verbose
 373                 )
 374                 created.append(posted_object)
 375
 376                 if posted_object:
 377                     accession = posted_object.get('accession')
 378                     uuid = posted_object.get('uuid')
 379                     description = posted_object.get('description')
 380
 381                 accessions.append(accession)
 382                 uuids.append(uuid)
 383
 384                 LOGGER.info('row {} ({}) -> {}'.format(
 385                     (i+2), description, accession))
 386                 # +2 comes from python row index + 1 to convert to
 387                 # one based indexing + 1 to account for
 388                 # row removed by header parsing
 389             else:
 390                 accessions.append(numpy.nan)
 391                 uuids.append(numpy.nan)
 392
 393         if accession_name in sheet.columns:
 394             sheet[accession_name] = accessions
 395         if 'uuid' in sheet.columns:
 396             sheet['uuid'] = uuids
 397
 398         return created
 399
 400     def prepare_objects_from_sheet(self, collection, sheet):
 401         accession_name = self.get_accession_name(collection)
 402         to_create = []
 403         for i, row in sheet.iterrows():
 404             new_object = {}
 405             for name, value in row.items():
 406                 if pandas.notnull(value):
 407                     name, value = typed_column_parser(name, value)
 408                     if name is None:
 409                         continue
 410                     new_object[name] = value
 411
 412             if new_object and new_object.get(accession_name) is None:
 413                 try:
 414                     self.validate(new_object, collection)
 415                 except jsonschema.ValidationError as e:
 416                     LOGGER.error("Validation error row %s", i)
 417                     raise e
 418                 to_create.append((i, new_object))
 419
 420             else:
 421                 to_create.append((i, None))
 422
 423         return to_create
 424
 425     def post_object_from_row(self, collection, i, new_object,
 426                              dry_run=True, verbose=True):
 427         accession_name = self.get_accession_name(collection)
 428
 429         if not dry_run:
 430             response = self.post_json(collection, new_object)
 431             if verbose:
 432                 print("Reponse {}".format(response))
 433
 434             obj = response['@graph'][0]
 435
 436             accession = obj.get(accession_name)
 437             if not accession:
 438                 accession = obj.get('uuid')
 439
 440             print("row {} created: {}".format(i, accession))
 441             return obj
 442         else:
 443             new_object[accession_name] = 'would create'
 444             return new_object
 445
 446     def prepare_url(self, request_url):
 447         '''This attempts to provide some convienence for accessing a URL
 448
 449         Given a url fragment it will default to :
 450         * requests over http
 451         * requests to self.server
 452
 453         This allows fairly flexible urls. e.g.
 454
 455         prepare_url('/experiments/ENCSR000AEG')
 456         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 457         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 458
 459         should all return the same url
 460         '''
 461         # clean up potentially messy urls
 462         url = urlparse(request_url)._asdict()
 463         if not url['scheme']:
 464             url['scheme'] = self.scheme
 465         if not url['netloc']:
 466             url['netloc'] = self.server
 467         url = urlunparse(url.values())
 468         return url
 469
 470     def search_jsonld(self, **kwargs):
 471         '''Send search request to ENCODED
 472
 473         to do a general search do
 474             searchTerm=term
 475         '''
 476         url = self.prepare_url('/search/')
 477         result = self.get_json(url, **kwargs)
 478         self.convert_search_to_jsonld(result)
 479         return result
 480
 481     def convert_search_to_jsonld(self, result):
 482         '''Add the context to search result
 483
 484         Also remove hard to handle nested attributes
 485           e.g. remove object.term when we have no id
 486         '''
 487         graph = result['@graph']
 488         for i, obj in enumerate(graph):
 489             # suppress nested attributes
 490             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 491
 492         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 493         return result
 494
 495     def validate(self, obj, object_type=None):
 496         """Validate an object against the ENCODED schema
 497
 498         Args:
 499             obj (dictionary): object attributes to be submitted to encoded
 500             object_type (string): ENCODED object name.
 501
 502         Raises:
 503             ValidationError: if the object does not conform to the schema.
 504         """
 505         object_type = object_type if object_type else self.get_object_type(obj)
 506         schema_url = self.get_schema_url(object_type)
 507         if not schema_url:
 508             raise ValueError("Unable to construct schema url")
 509
 510         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 511         hidden = obj.copy()
 512         if '@id' in hidden:
 513             del hidden['@id']
 514         if '@type' in hidden:
 515             del hidden['@type']
 516         jsonschema.validate(hidden, schema)
 517
 518 class TypedColumnParser(object):
 519     @staticmethod
 520     def parse_sheet_array_type(value):
 521         """Helper function to parse :array columns in sheet
 522         """
 523         return re.split(',\s*', value)
 524
 525     @staticmethod
 526     def parse_sheet_integer_type(value):
 527         """Helper function to parse :integer columns in sheet
 528         """
 529         return int(value)
 530
 531     @staticmethod
 532     def parse_sheet_boolean_type(value):
 533         """Helper function to parse :boolean columns in sheet
 534         """
 535         return bool(value)
 536
 537     @staticmethod
 538     def parse_sheet_timestamp_type(value):
 539         """Helper function to parse :date columns in sheet
 540         """
 541         return value.strftime('%Y-%m-%d')
 542
 543     @staticmethod
 544     def parse_sheet_string_type(value):
 545         """Helper function to parse :string columns in sheet (the default)
 546         """
 547         return str(value)
 548
 549     def __getitem__(self, name):
 550         parser = {
 551             'array': self.parse_sheet_array_type,
 552             'boolean': self.parse_sheet_boolean_type,
 553             'integer': self.parse_sheet_integer_type,
 554             'date': self.parse_sheet_timestamp_type,
 555             'string': self.parse_sheet_string_type
 556         }.get(name)
 557         if parser:
 558             return parser
 559         else:
 560             raise RuntimeError("unrecognized column type")
 561
 562     def __call__(self, header, value):
 563         header = header.split(':')
 564         column_type = 'string'
 565         if len(header) > 1:
 566             if header[1] == 'skip':
 567                 return None, None
 568             else:
 569                 column_type = header[1]
 570         return header[0], self[column_type](value)
 571
 572 typed_column_parser = TypedColumnParser()
 573
 574 class Document(object):
 575     """Helper class for registering documents
 576
 577     Usage:
 578     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 579     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 580     lysis.create_if_needed(server, lysis_uuid)
 581     """
 582     award = 'U54HG006998'
 583     lab = '/labs/barbara-wold'
 584
 585     def __init__(self, url, document_type, description, aliases=None):
 586         self.url = url
 587         self.filename = os.path.basename(url)
 588         self.document_type = document_type
 589         self.description = description
 590
 591         self.references = []
 592         self.aliases = None
 593         if aliases:
 594             if isinstance(aliases, list):
 595                 self.aliases = aliases
 596             else:
 597                 raise ValueError("Aliases needs to be a list")
 598         self.content_type = None
 599         self.document = None
 600         self.md5sum = None
 601         self.urls = None
 602         self.uuid = None
 603
 604         self.get_document()
 605
 606     def get_document(self):
 607         if os.path.exists(self.url):
 608             with open(self.url, 'rb') as instream:
 609                 assert self.url.endswith('pdf')
 610                 self.content_type = 'application/pdf'
 611                 self.document = instream.read()
 612                 self.md5sum = hashlib.md5(self.document)
 613         else:
 614             req = requests.get(self.url)
 615             if req.status_code == 200:
 616                 self.content_type = req.headers['content-type']
 617                 self.document = req.content
 618                 self.md5sum = hashlib.md5(self.document)
 619                 self.urls = [self.url]
 620
 621     def create_payload(self):
 622         document_payload = {
 623             'attachment': {
 624               'download': self.filename,
 625               'type': self.content_type,
 626               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
 627               'md5sum': self.md5sum.hexdigest()
 628             },
 629             'document_type': self.document_type,
 630             'description': self.description,
 631             'award': self.award,
 632             'lab': self.lab,
 633         }
 634         if self.aliases:
 635             document_payload['aliases'] = self.aliases
 636         if self.references:
 637             document_payload['references'] = self.references
 638         if self.urls:
 639             document_payload['urls'] = self.urls
 640
 641         return document_payload
 642
 643     def post(self, server):
 644         document_payload = self.create_payload()
 645         server.validate(document_payload, 'document')
 646         return server.post_json('/documents/', document_payload)
 647
 648     def save(self, filename):
 649         payload = self.create_payload()
 650         with open(filename, 'w') as outstream:
 651             outstream.write(pformat(payload))
 652
 653     def create_if_needed(self, server, uuid):
 654         self.uuid = uuid
 655         if uuid is None:
 656             return self.post(server)
 657         else:
 658             return server.get_json(uuid, embed=False)
 659
 660 if __name__ == '__main__':
 661     # try it
 662     from htsworkflow.util.rdfhelp import get_model, dump_model
 663     from htsworkflow.util.rdfjsonld import load_into_model
 664     from pprint import pprint
 665     model = get_model()
 666     logging.basicConfig(level=logging.DEBUG)
 667     encoded = ENCODED('test.encodedcc.org')
 668     encoded.load_netrc()
 669     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 670     pprint(body)
 671     load_into_model(model, body)
 672     #dump_model(model)