htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
   3 This allows retrieving blocks
   4 """
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
import re

import jsonschema
import pandas
import requests
import six
from six.moves.urllib.parse import urljoin, urlparse, urlunparse

try:
    # Python 3.3+: the container ABCs live in collections.abc and were
    # removed from the top-level collections module in Python 3.10.
    from collections import abc as collections_abc
except ImportError:
    # Python 2: the ABCs are attributes of collections itself.
    collections_abc = collections
  18
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
  20
# Default JSON-LD context fragments, keyed by encoded object type.
# ENCODED.create_jsonld_context() merges the None entry into every
# object's context and then the entries matching the object's @type list.
ENCODED_CONTEXT = {
    # The None context will get added to the root of the tree and will
    # provide common defaults.
    None: {
        # terms in multiple encoded objects
        'award': {'@type': '@id'},
        'dataset': {'@type': '@id'},
        'description': 'rdf:description',
        'documents': {'@type': '@id'},
        'experiment': {'@type': '@id'},
        'href': {'@type': '@id'},
        'lab': {'@type': '@id'},
        'library': {'@type': '@id'},
        'pi': {'@type': '@id'},
        'platform': {'@type': '@id'},
        'replicates': {'@type': '@id'},
        'submitted_by': {'@type': '@id'},
        'url': {'@type': '@id'},
    },
    # Identify and mark up contained classes.
    # e.g. in the tree there was a sub-dictionary named 'biosample'.
    # That dictionary had a term 'biosample_term_id', which is the
    # term that should be used as the @id.
    'Biosample': {
        'biosample_term_id': {'@type': '@id'},
    },
    'Experiment': {
        "assay_term_id": {"@type": "@id"},
        "files": {"@type": "@id"},
        "original_files": {"@type": "@id"},
    },
    # I tried to use the JSON-LD mapping capabilities to convert the lab
    # contact information into a vcard record, but the encoded model
    # didn't lend itself well to the vcard schema
    #'lab': {
    #    "address1": "vcard:street-address",
    #    "address2": "vcard:street-address",
    #    "city": "vcard:locality",
    #    "state": "vcard:region",
    #    "country": "vcard:country"
    #},
    'Library': {
        'nucleic_acid_term_id': {'@type': '@id'}
    }
}
  66
  67 #FIXME: this needs to be initialized from rdfns
  68 ENCODED_NAMESPACES = {
  69     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  70     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  71     # rdfs:label)
  72     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  73     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  74     "owl": "http://www.w3.org/2002/07/owl#",
  75     "dc": "htp://purl.org/dc/elements/1.1/",
  76     "xsd": "http://www.w3.org/2001/XMLSchema#",
  77     "vcard": "http://www.w3.org/2006/vcard/ns#",
  78
  79     # for some namespaces I made a best guess for the ontology root.
  80     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  81     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  82     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  83     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  84     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  85     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  86     # NTR: New Term Request space for DCC to implement new ontology terms
  87
  88 }
  89
# Server path under which the per-type JSON schemas are published;
# ENCODED.get_schema_url() appends '<object type>.json' to it.
ENCODED_SCHEMA_ROOT = '/profiles/'
  91
  92
  93 class ENCODED:
  94     '''Programatic access encoded, the software powering ENCODE3's submit site.
  95     '''
  96     def __init__(self, server, contexts=None, namespaces=None):
  97         self.server = server
  98         self.scheme = 'https'
  99         self.username = None
 100         self.password = None
 101         self.contexts = contexts if contexts else ENCODED_CONTEXT
 102         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 103         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 104         self.schemas = {}
 105
 106     def get_auth(self):
 107         return (self.username, self.password)
 108     auth = property(get_auth)
 109
 110     def load_netrc(self):
 111         import netrc
 112         session = netrc.netrc()
 113         authenticators = session.authenticators(self.server)
 114         if authenticators:
 115             self.username = authenticators[0]
 116             self.password = authenticators[2]
 117
 118     def add_jsonld_context(self, tree, default_base):
 119         """Add contexts to various objects in the tree.
 120
 121         tree is a json tree returned from the DCC's encoded database.
 122         contexts is a dictionary of dictionaries containing contexts
 123                 for the various  possible encoded classes.
 124         base, if supplied allows setting the base url that relative
 125             urls will be resolved against.
 126         """
 127         self.add_jsonld_child_context(tree, default_base)
 128         self.add_jsonld_namespaces(tree['@context'])
 129
 130     def add_jsonld_child_context(self, obj, default_base):
 131         '''Add JSON-LD context to the encoded JSON.
 132
 133         This is recursive because some of the IDs were relative URLs
 134         and I needed a way to properly compute a the correct base URL.
 135         '''
 136         # pretend strings aren't iterable
 137         if isinstance(obj, six.string_types):
 138             return
 139
 140         # recurse on container types
 141         if isinstance(obj, collections.Sequence):
 142             # how should I update lists?
 143             for v in obj:
 144                 self.add_jsonld_child_context(v, default_base)
 145             return
 146
 147         if isinstance(obj, collections.Mapping):
 148             for v in obj.values():
 149                 self.add_jsonld_child_context(v, default_base)
 150
 151         # we have an object. attach a context to it.
 152         if self._is_encoded_object(obj):
 153             context = self.create_jsonld_context(obj, default_base)
 154             if len(context) > 0:
 155                 # this is a total hack for relese 33 of
 156                 # encoded. They changed their model and
 157                 # i'm not sure what to do about it.
 158                 if obj.get('@context') == '/terms/':
 159                     del obj['@context']
 160                 obj.setdefault('@context', {}).update(context)
 161
 162     def add_jsonld_namespaces(self, context):
 163         '''Add shortcut namespaces to a context
 164
 165         Only needs to be run on the top-most context
 166         '''
 167         context.update(self.namespaces)
 168
 169     def create_jsonld_context(self, obj, default_base):
 170         '''Synthesize the context for a encoded type
 171
 172         self.contexts[None] = default context attributes added to any type
 173         self.contexts[type] = context attributes for this type.
 174         '''
 175         obj_type = self.get_object_type(obj)
 176         context = {'@base': urljoin(default_base, obj['@id']),
 177                    '@vocab': self.get_schema_url(obj_type)}
 178         # add in defaults
 179         context.update(self.contexts[None])
 180         for t in obj['@type']:
 181             if t in self.contexts:
 182                 context.update(self.contexts[t])
 183         return context
 184
 185     def get_json(self, obj_id, **kwargs):
 186         '''GET an ENCODE object as JSON and return as dict
 187
 188         Uses prepare_url to allow url short-cuts
 189         if no keyword arguments are specified it will default to adding limit=all
 190         Alternative keyword arguments can be passed in and will be sent to the host.
 191
 192         Known keywords are:
 193           limit - (integer or 'all') how many records to return, all for all of them
 194           embed - (bool) if true expands linking ids into their associated object.
 195           format - text/html or application/json
 196         '''
 197         if len(kwargs) == 0:
 198             kwargs['limit'] = 'all'
 199
 200         url = self.prepare_url(obj_id)
 201         LOGGER.info('requesting url: {}'.format(url))
 202
 203         # do the request
 204
 205         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 206         arguments = {}
 207         if self.username and self.password:
 208             arguments['auth'] = self.auth
 209         response = requests.get(url, headers=self.json_headers,
 210                                 params=kwargs,
 211                                 **arguments)
 212         if not response.status_code == requests.codes.ok:
 213             LOGGER.error("Error http status: {}".format(response.status_code))
 214             response.raise_for_status()
 215         return response.json()
 216
 217     def get_jsonld(self, obj_id, **kwargs):
 218         '''Get ENCODE object as JSONLD annotated with classses contexts
 219
 220         see get_json for documentation about what keywords can be passed.
 221         '''
 222         url = self.prepare_url(obj_id)
 223         json = self.get_json(obj_id, **kwargs)
 224         self.add_jsonld_context(json, url)
 225         return json
 226
 227     def get_object_type(self, obj):
 228         """Return type for a encoded object
 229         """
 230         obj_type = obj.get('@type')
 231         if not obj_type:
 232             raise ValueError('None type')
 233         if isinstance(obj_type, six.string_types):
 234             raise ValueError('@type should be a list, not a string')
 235         if not isinstance(obj_type, collections.Sequence):
 236             raise ValueError('@type is not a sequence')
 237         return obj_type[0]
 238
 239     def get_schema_url(self, object_type):
 240         """Create the ENCODED jsonschema url.
 241
 242         Return the ENCODED object schema url be either
 243         object type name or the collection name one posts to.
 244
 245         For example
 246            server.get_schema_url('experiment') and
 247            server.get_schema_url('/experiments/') both resolve to
 248            SERVER/profiles/experiment.json
 249
 250         Arguments:
 251            object_type (str): either ENCODED object name or collection
 252
 253         Returns:
 254            Schema URL
 255         """
 256         collection_to_type = {
 257             '/biosamples/': 'biosample',
 258             '/datasets/': 'dataset',
 259             '/documents/': 'document',
 260             '/experiments/': 'experiment',
 261             '/libraries/': 'library',
 262             '/replicates/': 'replicate',
 263             '/file/': 'file',
 264         }
 265         object_type = collection_to_type.get(object_type, object_type)
 266
 267         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 268
 269     def get_accession_name(self, collection):
 270         """Lookup common object accession name given a collection name.
 271         """
 272         collection_to_accession_name = {
 273             '/experiments/': 'experiment_accession',
 274             '/biosamples/': 'biosample_accession',
 275             '/libraries/': 'library_accession',
 276             '/replicates/': 'uuid',
 277         }
 278
 279         accession_name = collection_to_accession_name.get(collection, None)
 280         if accession_name is None:
 281             raise RuntimeError("Update list of collection to accession names for %s",
 282                                collection)
 283
 284         return accession_name
 285
 286     def _is_encoded_object(self, obj):
 287         '''Test to see if an object is a JSON-LD object
 288
 289         Some of the nested dictionaries lack the @id or @type
 290         information necessary to convert them.
 291         '''
 292         if not isinstance(obj, collections.Iterable):
 293             return False
 294
 295         if '@id' in obj and '@type' in obj:
 296             return True
 297         return False
 298
 299     def patch_json(self, obj_id, changes):
 300         """Given a dictionary of changes push them as a HTTP patch request
 301         """
 302         url = self.prepare_url(obj_id)
 303         LOGGER.info('PATCHing to %s', url)
 304         payload = json.dumps(changes)
 305         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 306         if response.status_code != requests.codes.ok:
 307             LOGGER.error("Error http status: {}".format(response.status_code))
 308             LOGGER.error("Response: %s", response.text)
 309             response.raise_for_status()
 310         return response.json()
 311
 312     def put_json(self, obj_id, new_object):
 313         url = self.prepare_url(obj_id)
 314         LOGGER.info('PUTing to %s', url)
 315         payload = json.dumps(new_object)
 316         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 317         if response.status_code != requests.codes.created:
 318             LOGGER.error("Error http status: {}".format(response.status_code))
 319             response.raise_for_status()
 320         return response.json()
 321
 322     def post_json(self, collection_id, new_object):
 323         url = self.prepare_url(collection_id)
 324         LOGGER.info('POSTing to %s', url)
 325         payload = json.dumps(new_object)
 326
 327         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 328         if response.status_code != requests.codes.created:
 329             LOGGER.error("http status: {}".format(response.status_code))
 330             LOGGER.error("message: {}".format(response.content))
 331             response.raise_for_status()
 332         return response.json()
 333
 334     def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
 335         """Create new ENCODED objects using metadata encoded in pandas DataFrame
 336
 337         The DataFrame column names need to encode the attribute names,
 338         and in some cases also include some additional type information.
 339         (see TypedColumnParser)
 340
 341         Arguments:
 342            collection (str): name of collection to create new objects in
 343            sheet (pandas.DataFrame): DataFrame with objects to create,
 344                assuming the appropriate accession number is empty.
 345                additional the accession number and uuid is updated if the object
 346                is created.
 347            dry_run (bool): whether or not to skip the code to post the objects
 348            verbose (bool): print the http responses.
 349
 350         Returns:
 351            list of created objects.
 352
 353         Raises:
 354            jsonschema.ValidationError if the object doesn't validate against
 355               the encoded jsonschema.
 356         """
 357         accession_name = self.get_accession_name(collection)
 358
 359         created = []
 360         columns = sheet.columns
 361         tosubmit = sheet[pandas.isnull(sheet[accession_name])]
 362
 363         for i in tosubmit.index:
 364             row = tosubmit.ix[i]
 365             new_object = {}
 366             for k in columns:
 367                 if pandas.notnull(row[k]):
 368                     name, value = typed_column_parser(k, row[k])
 369                     if name is None:
 370                         continue
 371                     new_object[name] = value
 372
 373             try:
 374                 self.validate(new_object, collection)
 375             except jsonschema.ValidationError as e:
 376                 LOGGER.error("Validation error row %s", i)
 377                 raise e
 378
 379             accession = row[accession_name]
 380             description = row.get('description', None)
 381
 382             if not dry_run:
 383                 response = self.post_json(collection, new_object)
 384                 if verbose:
 385                     print("Reponse {}".format(response))
 386
 387                 obj = response['@graph'][0]
 388                 created.append(obj)
 389                 accession = obj.get('accession')
 390                 uuid = obj.get('uuid')
 391
 392                 if accession:
 393                     sheet[accession_name][i] = accession
 394                 else:
 395                     accession = uuid
 396
 397                 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
 398                     sheet['uuid'][i] = uuid
 399
 400                 print("row {} created: {}".format(i, accession))
 401             else:
 402                 created.append(new_object)
 403             LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
 404
 405         return created
 406
 407     def prepare_url(self, request_url):
 408         '''This attempts to provide some convienence for accessing a URL
 409
 410         Given a url fragment it will default to :
 411         * requests over http
 412         * requests to self.server
 413
 414         This allows fairly flexible urls. e.g.
 415
 416         prepare_url('/experiments/ENCSR000AEG')
 417         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 418         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 419
 420         should all return the same url
 421         '''
 422         # clean up potentially messy urls
 423         url = urlparse(request_url)._asdict()
 424         if not url['scheme']:
 425             url['scheme'] = self.scheme
 426         if not url['netloc']:
 427             url['netloc'] = self.server
 428         url = urlunparse(url.values())
 429         return url
 430
 431     def search_jsonld(self, **kwargs):
 432         '''Send search request to ENCODED
 433
 434         to do a general search do
 435             searchTerm=term
 436         '''
 437         url = self.prepare_url('/search/')
 438         result = self.get_json(url, **kwargs)
 439         self.convert_search_to_jsonld(result)
 440         return result
 441
 442     def convert_search_to_jsonld(self, result):
 443         '''Add the context to search result
 444
 445         Also remove hard to handle nested attributes
 446           e.g. remove object.term when we have no id
 447         '''
 448         graph = result['@graph']
 449         for i, obj in enumerate(graph):
 450             # suppress nested attributes
 451             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 452
 453         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 454         return result
 455
 456     def validate(self, obj, object_type=None):
 457         """Validate an object against the ENCODED schema
 458
 459         Args:
 460             obj (dictionary): object attributes to be submitted to encoded
 461             object_type (string): ENCODED object name.
 462
 463         Raises:
 464             ValidationError: if the object does not conform to the schema.
 465         """
 466         object_type = object_type if object_type else self.get_object_type(obj)
 467         schema_url = self.get_schema_url(object_type)
 468         if not schema_url:
 469             raise ValueError("Unable to construct schema url")
 470
 471         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 472         hidden = obj.copy()
 473         if '@id' in hidden:
 474             del hidden['@id']
 475         if '@type' in hidden:
 476             del hidden['@type']
 477         jsonschema.validate(hidden, schema)
 478
 479 class TypedColumnParser(object):
 480     @staticmethod
 481     def parse_sheet_array_type(value):
 482         """Helper function to parse :array columns in sheet
 483         """
 484         return re.split(',\s*', value)
 485
 486     @staticmethod
 487     def parse_sheet_integer_type(value):
 488         """Helper function to parse :integer columns in sheet
 489         """
 490         return int(value)
 491
 492     @staticmethod
 493     def parse_sheet_boolean_type(value):
 494         """Helper function to parse :boolean columns in sheet
 495         """
 496         return bool(value)
 497
 498     @staticmethod
 499     def parse_sheet_timestamp_type(value):
 500         """Helper function to parse :date columns in sheet
 501         """
 502         return value.strftime('%Y-%m-%d')
 503
 504     @staticmethod
 505     def parse_sheet_string_type(value):
 506         """Helper function to parse :string columns in sheet (the default)
 507         """
 508         return str(value)
 509
 510     def __getitem__(self, name):
 511         parser = {
 512             'array': self.parse_sheet_array_type,
 513             'boolean': self.parse_sheet_boolean_type,
 514             'integer': self.parse_sheet_integer_type,
 515             'date': self.parse_sheet_timestamp_type,
 516             'string': self.parse_sheet_string_type
 517         }.get(name)
 518         if parser:
 519             return parser
 520         else:
 521             raise RuntimeError("unrecognized column type")
 522
 523     def __call__(self, header, value):
 524         header = header.split(':')
 525         column_type = 'string'
 526         if len(header) > 1:
 527             if header[1] == 'skip':
 528                 return None, None
 529             else:
 530                 column_type = header[1]
 531         return header[0], self[column_type](value)
 532
# Shared parser instance; ENCODED.post_sheet calls it as
# typed_column_parser(column_header, cell_value).
typed_column_parser = TypedColumnParser()
 534
 535 class Document(object):
 536     """Helper class for registering documents
 537
 538     Usage:
 539     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 540     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 541     lysis.create_if_needed(server, lysis_uuid)
 542     """
 543     award = 'U54HG006998'
 544     lab = '/labs/barbara-wold'
 545
 546     def __init__(self, url, document_type, description, aliases=None):
 547         self.url = url
 548         self.filename = os.path.basename(url)
 549         self.document_type = document_type
 550         self.description = description
 551
 552         self.references = []
 553         self.aliases = None
 554         if aliases:
 555             if isinstance(aliases, list):
 556                 self.aliases = aliases
 557             else:
 558                 raise ValueError("Aliases needs to be a list")
 559         self.content_type = None
 560         self.document = None
 561         self.md5sum = None
 562         self.urls = None
 563         self.uuid = None
 564
 565         self.get_document()
 566
 567     def get_document(self):
 568         if os.path.exists(self.url):
 569             with open(self.url, 'rb') as instream:
 570                 assert self.url.endswith('pdf')
 571                 self.content_type = 'application/pdf'
 572                 self.document = instream.read()
 573                 self.md5sum = hashlib.md5(self.document)
 574         else:
 575             req = requests.get(self.url)
 576             if req.status_code == 200:
 577                 self.content_type = req.headers['content-type']
 578                 self.document = req.content
 579                 self.md5sum = hashlib.md5(self.document)
 580                 self.urls = [self.url]
 581
 582     def create_payload(self):
 583         document_payload = {
 584             'attachment': {
 585               'download': self.filename,
 586               'type': self.content_type,
 587               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
 588               'md5sum': self.md5sum.hexdigest()
 589             },
 590             'document_type': self.document_type,
 591             'description': self.description,
 592             'award': self.award,
 593             'lab': self.lab,
 594         }
 595         if self.aliases:
 596             document_payload['aliases'] = self.aliases
 597         if self.references:
 598             document_payload['references'] = self.references
 599         if self.urls:
 600             document_payload['urls'] = self.urls
 601
 602         return document_payload
 603
 604     def post(self, server):
 605         document_payload = self.create_payload()
 606         server.validate(document_payload, 'document')
 607         return server.post_json('/documents/', document_payload)
 608
 609     def save(self, filename):
 610         payload = self.create_payload()
 611         with open(filename, 'w') as outstream:
 612             outstream.write(pformat(payload))
 613
 614     def create_if_needed(self, server, uuid):
 615         self.uuid = uuid
 616         if uuid is None:
 617             return self.post(server)
 618         else:
 619             return server.get_json(uuid, embed=False)
 620
 621 if __name__ == '__main__':
 622     # try it
 623     from htsworkflow.util.rdfhelp import get_model, dump_model
 624     from htsworkflow.util.rdfjsonld import load_into_model
 625     from pprint import pprint
 626     model = get_model()
 627     logging.basicConfig(level=logging.DEBUG)
 628     encoded = ENCODED('test.encodedcc.org')
 629     encoded.load_netrc()
 630     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 631     pprint(body)
 632     load_into_model(model, body)
 633     #dump_model(model)