htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
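This allows retrieving ENCODE objects as JSON or JSON-LD, validating them
against the server's JSON schemas, and submitting new or updated objects.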
   4 """
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
from pprint import pformat

import jsonschema
import pandas
import requests
import six
from six.moves.urllib.parse import urljoin, urlparse, urlunparse

try:
    from collections.abc import Iterable, Mapping, Sequence
except ImportError:  # Python 2
    from collections import Iterable, Mapping, Sequence
  17
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
  19
  20 ENCODED_CONTEXT = {
  21     # The None context will get added to the root of the tree and will
  22     # provide common defaults.
  23     None: {
  24         # terms in multiple encoded objects
  25         'award': {'@type': '@id'},
  26         'dataset': {'@type': '@id'},
  27         'description': 'rdf:description',
  28         'documents': {'@type': '@id'},
  29         'experiment': {'@type': '@id'},
  30         'href': {'@type': '@id'},
  31         'lab': {'@type': '@id'},
  32         'library': {'@type': '@id'},
  33         'pi': {'@type': '@id'},
  34         'platform': {'@type': '@id'},
  35         'replicates': {'@type': '@id'},
  36         'submitted_by': {'@type': '@id'},
  37         'url': {'@type': '@id'},
  38     },
  39     # Identify and markup contained classes.
  40     # e.g. in the tree there was a sub-dictionary named 'biosample'
  41     # That dictionary had a term 'biosample_term_id, which is the
  42     # term that should be used as the @id.
  43     'biosample': {
  44         'biosample_term_id': {'@type': '@id'},
  45     },
  46     'experiment': {
  47         "assay_term_id": {"@type": "@id"},
  48         "files": {"@type": "@id"},
  49         "original_files": {"@type": "@id"},
  50     },
  51     # I tried to use the JSON-LD mapping capabilities to convert the lab
  52     # contact information into a vcard record, but the encoded model
  53     # didn't lend itself well to the vcard schema
  54     #'lab': {
  55     #    "address1": "vcard:street-address",
  56     #    "address2": "vcard:street-address",
  57     #    "city": "vcard:locality",
  58     #    "state": "vcard:region",
  59     #    "country": "vcard:country"
  60     #},
  61     'library': {
  62         'nucleic_acid_term_id': {'@type': '@id'}
  63     }
  64 }
  65
  66 #FIXME: this needs to be initialized from rdfns
  67 ENCODED_NAMESPACES = {
  68     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  69     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  70     # rdfs:label)
  71     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  72     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  73     "owl": "http://www.w3.org/2002/07/owl#",
  74     "dc": "htp://purl.org/dc/elements/1.1/",
  75     "xsd": "http://www.w3.org/2001/XMLSchema#",
  76     "vcard": "http://www.w3.org/2006/vcard/ns#",
  77
  78     # for some namespaces I made a best guess for the ontology root.
  79     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  80     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  81     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  82     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  83     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  84     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  85     # NTR: New Term Request space for DCC to implement new ontology terms
  86
  87 }
  88
# Server path prefix for schema documents; get_schema_url() appends
# '<object type>.json' to it.
ENCODED_SCHEMA_ROOT = '/profiles/'
  90
  91
  92 class ENCODED:
  93     '''Programatic access encoded, the software powering ENCODE3's submit site.
  94     '''
  95     def __init__(self, server, contexts=None, namespaces=None):
  96         self.server = server
  97         self.scheme = 'https'
  98         self.username = None
  99         self.password = None
 100         self.contexts = contexts if contexts else ENCODED_CONTEXT
 101         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 102         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 103         self.schemas = {}
 104
 105     def get_auth(self):
 106         return (self.username, self.password)
 107     auth = property(get_auth)
 108
 109     def load_netrc(self):
 110         import netrc
 111         session = netrc.netrc()
 112         authenticators = session.authenticators(self.server)
 113         if authenticators:
 114             self.username = authenticators[0]
 115             self.password = authenticators[2]
 116
 117     def add_jsonld_context(self, tree, default_base):
 118         """Add contexts to various objects in the tree.
 119
 120         tree is a json tree returned from the DCC's encoded database.
 121         contexts is a dictionary of dictionaries containing contexts
 122                 for the various  possible encoded classes.
 123         base, if supplied allows setting the base url that relative
 124             urls will be resolved against.
 125         """
 126         self.add_jsonld_child_context(tree, default_base)
 127         self.add_jsonld_namespaces(tree['@context'])
 128
 129     def add_jsonld_child_context(self, obj, default_base):
 130         '''Add JSON-LD context to the encoded JSON.
 131
 132         This is recursive because some of the IDs were relative URLs
 133         and I needed a way to properly compute a the correct base URL.
 134         '''
 135         # pretend strings aren't iterable
 136         if isinstance(obj, six.string_types):
 137             return
 138
 139         # recurse on container types
 140         if isinstance(obj, collections.Sequence):
 141             # how should I update lists?
 142             for v in obj:
 143                 self.add_jsonld_child_context(v, default_base)
 144             return
 145
 146         if isinstance(obj, collections.Mapping):
 147             for v in obj.values():
 148                 self.add_jsonld_child_context(v, default_base)
 149
 150         # we have an object. attach a context to it.
 151         if self._is_encoded_object(obj):
 152             context = self.create_jsonld_context(obj, default_base)
 153             if len(context) > 0:
 154                 obj.setdefault('@context', {}).update(context)
 155
 156     def add_jsonld_namespaces(self, context):
 157         '''Add shortcut namespaces to a context
 158
 159         Only needs to be run on the top-most context
 160         '''
 161         context.update(self.namespaces)
 162
 163     def create_jsonld_context(self, obj, default_base):
 164         '''Synthesize the context for a encoded type
 165
 166         self.contexts[None] = default context attributes added to any type
 167         self.contexts[type] = context attributes for this type.
 168         '''
 169         obj_type = self.get_object_type(obj)
 170         context = {'@base': urljoin(default_base, obj['@id']),
 171                    '@vocab': self.get_schema_url(obj_type)}
 172         # add in defaults
 173         context.update(self.contexts[None])
 174         for t in obj['@type']:
 175             if t in self.contexts:
 176                 context.update(self.contexts[t])
 177         return context
 178
 179     def get_json(self, obj_id, **kwargs):
 180         '''GET an ENCODE object as JSON and return as dict
 181
 182         Uses prepare_url to allow url short-cuts
 183         if no keyword arguments are specified it will default to adding limit=all
 184         Alternative keyword arguments can be passed in and will be sent to the host.
 185
 186         Known keywords are:
 187           limit - (integer or 'all') how many records to return, all for all of them
 188           embed - (bool) if true expands linking ids into their associated object.
 189           format - text/html or application/json
 190         '''
 191         if len(kwargs) == 0:
 192             kwargs['limit'] = 'all'
 193
 194         url = self.prepare_url(obj_id)
 195         LOGGER.info('requesting url: {}'.format(url))
 196
 197         # do the request
 198
 199         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 200         arguments = {}
 201         if self.username and self.password:
 202             arguments['auth'] = self.auth
 203         response = requests.get(url, headers=self.json_headers,
 204                                 params=kwargs,
 205                                 **arguments)
 206         if not response.status_code == requests.codes.ok:
 207             LOGGER.error("Error http status: {}".format(response.status_code))
 208             response.raise_for_status()
 209         return response.json()
 210
 211     def get_jsonld(self, obj_id, **kwargs):
 212         '''Get ENCODE object as JSONLD annotated with classses contexts
 213
 214         see get_json for documentation about what keywords can be passed.
 215         '''
 216         url = self.prepare_url(obj_id)
 217         json = self.get_json(obj_id, **kwargs)
 218         self.add_jsonld_context(json, url)
 219         return json
 220
 221     def get_object_type(self, obj):
 222         """Return type for a encoded object
 223         """
 224         obj_type = obj.get('@type')
 225         if not obj_type:
 226             raise ValueError('None type')
 227         if isinstance(obj_type, six.string_types):
 228             raise ValueError('@type should be a list, not a string')
 229         if not isinstance(obj_type, collections.Sequence):
 230             raise ValueError('@type is not a sequence')
 231         return obj_type[0]
 232
 233     def get_schema_url(self, object_type):
 234         """Create the ENCODED jsonschema url.
 235
 236         Return the ENCODED object schema url be either
 237         object type name or the collection name one posts to.
 238
 239         For example
 240            server.get_schema_url('experiment') and
 241            server.get_schema_url('/experiments/') both resolve to
 242            SERVER/profiles/experiment.json
 243
 244         Arguments:
 245            object_type (str): either ENCODED object name or collection
 246
 247         Returns:
 248            Schema URL
 249         """
 250         collection_to_type = {
 251             '/biosamples/': 'biosample',
 252             '/datasets/': 'dataset',
 253             '/documents/': 'document',
 254             '/experiments/': 'experiment',
 255             '/libraries/': 'library',
 256             '/replicates/': 'replicate',
 257         }
 258         object_type = collection_to_type.get(object_type, object_type)
 259
 260         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 261
 262     def get_accession_name(self, collection):
 263         """Lookup common object accession name given a collection name.
 264         """
 265         collection_to_accession_name = {
 266             '/experiments/': 'experiment_accession',
 267             '/biosamples/': 'biosample_accession',
 268             '/libraries/': 'library_accession',
 269             '/replicates/': 'uuid',
 270         }
 271
 272         accession_name = collection_to_accession_name.get(collection, None)
 273         if accession_name is None:
 274             raise RuntimeError("Update list of collection to accession names for %s",
 275                                collection)
 276
 277         return accession_name
 278
 279     def _is_encoded_object(self, obj):
 280         '''Test to see if an object is a JSON-LD object
 281
 282         Some of the nested dictionaries lack the @id or @type
 283         information necessary to convert them.
 284         '''
 285         if not isinstance(obj, collections.Iterable):
 286             return False
 287
 288         if '@id' in obj and '@type' in obj:
 289             return True
 290         return False
 291
 292     def patch_json(self, obj_id, changes):
 293         """Given a dictionary of changes push them as a HTTP patch request
 294         """
 295         url = self.prepare_url(obj_id)
 296         LOGGER.info('PATCHing to %s', url)
 297         payload = json.dumps(changes)
 298         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 299         if response.status_code != requests.codes.ok:
 300             LOGGER.error("Error http status: {}".format(response.status_code))
 301             LOGGER.error("Response: %s", response.text)
 302             response.raise_for_status()
 303         return response.json()
 304
 305     def put_json(self, obj_id, new_object):
 306         url = self.prepare_url(obj_id)
 307         LOGGER.info('PUTing to %s', url)
 308         payload = json.dumps(new_object)
 309         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 310         if response.status_code != requests.codes.created:
 311             LOGGER.error("Error http status: {}".format(response.status_code))
 312             response.raise_for_status()
 313         return response.json()
 314
 315     def post_json(self, collection_id, new_object):
 316         url = self.prepare_url(collection_id)
 317         LOGGER.info('POSTing to %s', url)
 318         payload = json.dumps(new_object)
 319
 320         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 321         if response.status_code != requests.codes.created:
 322             LOGGER.error("http status: {}".format(response.status_code))
 323             LOGGER.error("message: {}".format(response.content))
 324             response.raise_for_status()
 325         return response.json()
 326
 327     def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
 328         """Create new ENCODED objects using metadata encoded in pandas DataFrame
 329
 330         The DataFrame column names need to encode the attribute names,
 331         and in some cases also include some additional type information.
 332         (see TypedColumnParser)
 333
 334         Arguments:
 335            collection (str): name of collection to create new objects in
 336            sheet (pandas.DataFrame): DataFrame with objects to create,
 337                assuming the appropriate accession number is empty.
 338                additional the accession number and uuid is updated if the object
 339                is created.
 340            dry_run (bool): whether or not to skip the code to post the objects
 341            verbose (bool): print the http responses.
 342
 343         Returns:
 344            list of created objects.
 345
 346         Raises:
 347            jsonschema.ValidationError if the object doesn't validate against
 348               the encoded jsonschema.
 349         """
 350         accession_name = self.get_accession_name(collection)
 351
 352         created = []
 353         columns = sheet.columns
 354         tosubmit = sheet[pandas.isnull(sheet[accession_name])]
 355
 356         for i in tosubmit.index:
 357             row = tosubmit.ix[i]
 358             new_object = {}
 359             for k in columns:
 360                 if pandas.notnull(row[k]):
 361                     name, value = typed_column_parser(k, row[k])
 362                     if name is None:
 363                         continue
 364                     new_object[name] = value
 365
 366             try:
 367                 self.validate(new_object, collection)
 368             except jsonschema.ValidationError as e:
 369                 LOGGER.error("Validation error row %s", i)
 370                 raise e
 371
 372             accession = row[accession_name]
 373             description = row.get('description', None)
 374
 375             if not dry_run:
 376                 response = self.post_json(collection, new_object)
 377                 if verbose:
 378                     print("Reponse {}".format(response))
 379
 380                 obj = response['@graph'][0]
 381                 created.append(obj)
 382                 accession = obj.get('accession')
 383                 uuid = obj.get('uuid')
 384
 385                 if accession:
 386                     sheet[accession_name][i] = accession
 387                 else:
 388                     accession = uuid
 389
 390                 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
 391                     sheet['uuid'][i] = uuid
 392
 393                 print("row {} created: {}".format(i, accession))
 394             else:
 395                 created.append(new_object)
 396             LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
 397
 398         return created
 399
 400     def prepare_url(self, request_url):
 401         '''This attempts to provide some convienence for accessing a URL
 402
 403         Given a url fragment it will default to :
 404         * requests over http
 405         * requests to self.server
 406
 407         This allows fairly flexible urls. e.g.
 408
 409         prepare_url('/experiments/ENCSR000AEG')
 410         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 411         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 412
 413         should all return the same url
 414         '''
 415         # clean up potentially messy urls
 416         url = urlparse(request_url)._asdict()
 417         if not url['scheme']:
 418             url['scheme'] = self.scheme
 419         if not url['netloc']:
 420             url['netloc'] = self.server
 421         url = urlunparse(url.values())
 422         return url
 423
 424     def search_jsonld(self, **kwargs):
 425         '''Send search request to ENCODED
 426
 427         to do a general search do
 428             searchTerm=term
 429         '''
 430         url = self.prepare_url('/search/')
 431         result = self.get_json(url, **kwargs)
 432         self.convert_search_to_jsonld(result)
 433         return result
 434
 435     def convert_search_to_jsonld(self, result):
 436         '''Add the context to search result
 437
 438         Also remove hard to handle nested attributes
 439           e.g. remove object.term when we have no id
 440         '''
 441         graph = result['@graph']
 442         for i, obj in enumerate(graph):
 443             # suppress nested attributes
 444             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 445
 446         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 447         return result
 448
 449     def validate(self, obj, object_type=None):
 450         """Validate an object against the ENCODED schema
 451
 452         Args:
 453             obj (dictionary): object attributes to be submitted to encoded
 454             object_type (string): ENCODED object name.
 455
 456         Raises:
 457             ValidationError: if the object does not conform to the schema.
 458         """
 459         object_type = object_type if object_type else self.get_object_type(obj)
 460         schema_url = self.get_schema_url(object_type)
 461         if not schema_url:
 462             raise ValueError("Unable to construct schema url")
 463
 464         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 465         hidden = obj.copy()
 466         if '@id' in hidden:
 467             del hidden['@id']
 468         if '@type' in hidden:
 469             del hidden['@type']
 470         jsonschema.validate(hidden, schema)
 471
 472 class TypedColumnParser(object):
 473     @staticmethod
 474     def parse_sheet_array_type(value):
 475         """Helper function to parse :array columns in sheet
 476         """
 477         return value.split(', ')
 478
 479     @staticmethod
 480     def parse_sheet_integer_type(value):
 481         """Helper function to parse :integer columns in sheet
 482         """
 483         return int(value)
 484
 485     @staticmethod
 486     def parse_sheet_boolean_type(value):
 487         """Helper function to parse :boolean columns in sheet
 488         """
 489         return bool(value)
 490
 491     @staticmethod
 492     def parse_sheet_timestamp_type(value):
 493         """Helper function to parse :date columns in sheet
 494         """
 495         return value.strftime('%Y-%m-%d')
 496
 497     @staticmethod
 498     def parse_sheet_string_type(value):
 499         """Helper function to parse :string columns in sheet (the default)
 500         """
 501         return unicode(value)
 502
 503     def __getitem__(self, name):
 504         parser = {
 505             'array': self.parse_sheet_array_type,
 506             'boolean': self.parse_sheet_boolean_type,
 507             'integer': self.parse_sheet_integer_type,
 508             'date': self.parse_sheet_timestamp_type,
 509             'string': self.parse_sheet_string_type
 510         }.get(name)
 511         if parser:
 512             return parser
 513         else:
 514             raise RuntimeError("unrecognized column type")
 515
 516     def __call__(self, header, value):
 517         header = header.split(':')
 518         column_type = 'string'
 519         if len(header) > 1:
 520             if header[1] == 'skip':
 521                 return None, None
 522             else:
 523                 column_type = header[1]
 524         return header[0], self[column_type](value)
 525
 526 typed_column_parser = TypedColumnParser()
 527
 528 class Document(object):
 529     """Helper class for registering documents
 530
 531     Usage:
 532     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 533     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 534     lysis.create_if_needed(server, lysis_uuid)
 535     """
 536     award = 'U54HG006998'
 537     lab = '/labs/barbara-wold'
 538
 539     def __init__(self, url, document_type, description, aliases=None):
 540         self.url = url
 541         self.filename = os.path.basename(url)
 542         self.document_type = document_type
 543         self.description = description
 544
 545         self.references = []
 546         self.aliases = None
 547         if aliases:
 548             if isinstance(aliases, list):
 549                 self.aliases = aliases
 550             else:
 551                 raise ValueError("Aliases needs to be a list")
 552         self.content_type = None
 553         self.document = None
 554         self.md5sum = None
 555         self.urls = None
 556         self.uuid = None
 557
 558         self.get_document()
 559
 560     def get_document(self):
 561         if os.path.exists(self.url):
 562             with open(self.url, 'r') as instream:
 563                 assert self.url.endswith('pdf')
 564                 self.content_type = 'application/pdf'
 565                 self.document = instream.read()
 566                 self.md5sum = hashlib.md5(self.document)
 567         else:
 568             req = requests.get(self.url)
 569             if req.status_code == 200:
 570                 self.content_type = req.headers['content-type']
 571                 self.document = req.content
 572                 self.md5sum = hashlib.md5(self.document)
 573                 self.urls = [self.url]
 574
 575     def create_payload(self):
 576         document_payload = {
 577             'attachment': {
 578               'download': self.filename,
 579               'type': self.content_type,
 580               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 581               'md5sum': self.md5sum.hexdigest()
 582             },
 583             'document_type': self.document_type,
 584             'description': self.description,
 585             'award': self.award,
 586             'lab': self.lab,
 587         }
 588         if self.aliases:
 589             document_payload['aliases'] = self.aliases
 590         if self.references:
 591             document_payload['references'] = self.references
 592         if self.urls:
 593             document_payload['urls'] = self.urls
 594
 595         return document_payload
 596
 597     def post(self, server):
 598         document_payload = self.create_payload()
 599         server.validate(document_payload, 'document')
 600         return server.post_json('/documents/', document_payload)
 601
 602     def save(self, filename):
 603         payload = self.create_payload()
 604         with open(filename, 'w') as outstream:
 605             outstream.write(pformat(payload))
 606
 607     def create_if_needed(self, server, uuid):
 608         self.uuid = uuid
 609         if uuid is None:
 610             return self.post(server)
 611         else:
 612             return server.get_json(uuid, embed=False)
 613
 614 if __name__ == '__main__':
 615     # try it
 616     from htsworkflow.util.rdfhelp import get_model, dump_model
 617     from htsworkflow.util.rdfjsonld import load_into_model
 618     from pprint import pprint
 619     model = get_model()
 620     logging.basicConfig(level=logging.DEBUG)
 621     encoded = ENCODED('test.encodedcc.org')
 622     encoded.load_netrc()
 623     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 624     pprint(body)
 625     load_into_model(model, body)
 626     #dump_model(model)