htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
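This module allows retrieving, creating and updating objects stored in
encoded as JSON, annotating them as JSON-LD, and validating them against
the schemas published by the server.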
   4 """
   5
   6
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
import types
from pprint import pformat
from urlparse import urljoin, urlparse, urlunparse

import jsonschema
import requests
  15
  16 LOGGER = logging.getLogger(__name__)
  17
  18 ENCODED_CONTEXT = {
  19     # The None context will get added to the root of the tree and will
  20     # provide common defaults.
  21     None: {
  22         # terms in multiple encoded objects
  23         'award': {'@type': '@id'},
  24         'dataset': {'@type': '@id'},
  25         'description': 'rdf:description',
  26         'documents': {'@type': '@id'},
  27         'experiment': {'@type': '@id'},
  28         'href': {'@type': '@id'},
  29         'lab': {'@type': '@id'},
  30         'library': {'@type': '@id'},
  31         'pi': {'@type': '@id'},
  32         'platform': {'@type': '@id'},
  33         'replicates': {'@type': '@id'},
  34         'submitted_by': {'@type': '@id'},
  35         'url': {'@type': '@id'},
  36     },
  37     # Identify and markup contained classes.
  38     # e.g. in the tree there was a sub-dictionary named 'biosample'
  39     # That dictionary had a term 'biosample_term_id, which is the
  40     # term that should be used as the @id.
  41     'biosample': {
  42         'biosample_term_id': {'@type': '@id'},
  43     },
  44     'experiment': {
  45         "assay_term_id": {"@type": "@id"},
  46         "files": {"@type": "@id"},
  47         "original_files": {"@type": "@id"},
  48     },
  49     # I tried to use the JSON-LD mapping capabilities to convert the lab
  50     # contact information into a vcard record, but the encoded model
  51     # didn't lend itself well to the vcard schema
  52     #'lab': {
  53     #    "address1": "vcard:street-address",
  54     #    "address2": "vcard:street-address",
  55     #    "city": "vcard:locality",
  56     #    "state": "vcard:region",
  57     #    "country": "vcard:country"
  58     #},
  59     'library': {
  60         'nucleic_acid_term_id': {'@type': '@id'}
  61     }
  62 }
  63
  64 #FIXME: this needs to be initialized from rdfns
  65 ENCODED_NAMESPACES = {
  66     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  67     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  68     # rdfs:label)
  69     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  70     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  71     "owl": "http://www.w3.org/2002/07/owl#",
  72     "dc": "htp://purl.org/dc/elements/1.1/",
  73     "xsd": "http://www.w3.org/2001/XMLSchema#",
  74     "vcard": "http://www.w3.org/2006/vcard/ns#",
  75
  76     # for some namespaces I made a best guess for the ontology root.
  77     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  78     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  79     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  80     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  81     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  82     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  83     # NTR: New Term Request space for DCC to implement new ontology terms
  84
  85 }
  86
  87 ENCODED_SCHEMA_ROOT = '/profiles/'
  88
  89
  90 class ENCODED:
  91     '''Programatic access encoded, the software powering ENCODE3's submit site.
  92     '''
  93     def __init__(self, server, contexts=None, namespaces=None):
  94         self.server = server
  95         self.scheme = 'https'
  96         self.username = None
  97         self.password = None
  98         self.contexts = contexts if contexts else ENCODED_CONTEXT
  99         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 100         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 101         self.schemas = {}
 102
 103     def get_auth(self):
 104         return (self.username, self.password)
 105     auth = property(get_auth)
 106
 107     def load_netrc(self):
 108         import netrc
 109         session = netrc.netrc()
 110         authenticators = session.authenticators(self.server)
 111         if authenticators:
 112             self.username = authenticators[0]
 113             self.password = authenticators[2]
 114
 115     def add_jsonld_context(self, tree, default_base):
 116         """Add contexts to various objects in the tree.
 117
 118         tree is a json tree returned from the DCC's encoded database.
 119         contexts is a dictionary of dictionaries containing contexts
 120                 for the various  possible encoded classes.
 121         base, if supplied allows setting the base url that relative
 122             urls will be resolved against.
 123         """
 124         self.add_jsonld_child_context(tree, default_base)
 125         self.add_jsonld_namespaces(tree['@context'])
 126
 127     def add_jsonld_child_context(self, obj, default_base):
 128         '''Add JSON-LD context to the encoded JSON.
 129
 130         This is recursive because some of the IDs were relative URLs
 131         and I needed a way to properly compute a the correct base URL.
 132         '''
 133         # pretend strings aren't iterable
 134         if type(obj) in types.StringTypes:
 135             return
 136
 137         # recurse on container types
 138         if isinstance(obj, collections.Sequence):
 139             # how should I update lists?
 140             for v in obj:
 141                 self.add_jsonld_child_context(v, default_base)
 142             return
 143
 144         if isinstance(obj, collections.Mapping):
 145             for v in obj.values():
 146                 self.add_jsonld_child_context(v, default_base)
 147
 148         # we have an object. attach a context to it.
 149         if self._is_encoded_object(obj):
 150             context = self.create_jsonld_context(obj, default_base)
 151             if len(context) > 0:
 152                 obj.setdefault('@context', {}).update(context)
 153
 154     def add_jsonld_namespaces(self, context):
 155         '''Add shortcut namespaces to a context
 156
 157         Only needs to be run on the top-most context
 158         '''
 159         context.update(self.namespaces)
 160
 161     def create_jsonld_context(self, obj, default_base):
 162         '''Synthesize the context for a encoded type
 163
 164         self.contexts[None] = default context attributes added to any type
 165         self.contexts[type] = context attributes for this type.
 166         '''
 167         context = {'@base': urljoin(default_base, obj['@id']),
 168                    '@vocab': self.get_schema_url(obj)}
 169         # add in defaults
 170         context.update(self.contexts[None])
 171         for t in obj['@type']:
 172             if t in self.contexts:
 173                 context.update(self.contexts[t])
 174         return context
 175
 176     def get_json(self, obj_id, **kwargs):
 177         '''GET an ENCODE object as JSON and return as dict
 178
 179         Uses prepare_url to allow url short-cuts
 180         if no keyword arguments are specified it will default to adding limit=all
 181         Alternative keyword arguments can be passed in and will be sent to the host.
 182
 183         Known keywords are:
 184           limit - (integer or 'all') how many records to return, all for all of them
 185           embed - (bool) if true expands linking ids into their associated object.
 186           format - text/html or application/json
 187         '''
 188         if len(kwargs) == 0:
 189             kwargs['limit'] = 'all'
 190
 191         url = self.prepare_url(obj_id)
 192         LOGGER.info('requesting url: {}'.format(url))
 193
 194         # do the request
 195
 196         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 197         response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
 198         if not response.status_code == requests.codes.ok:
 199             LOGGER.error("Error http status: {}".format(response.status_code))
 200             response.raise_for_status()
 201         return response.json()
 202
 203     def get_jsonld(self, obj_id, **kwargs):
 204         '''Get ENCODE object as JSONLD annotated with classses contexts
 205
 206         see get_json for documentation about what keywords can be passed.
 207         '''
 208         url = self.prepare_url(obj_id)
 209         json = self.get_json(obj_id, **kwargs)
 210         self.add_jsonld_context(json, url)
 211         return json
 212
 213     def get_object_type(self, obj):
 214         """Return type for a encoded object
 215         """
 216         obj_type = obj.get('@type')
 217         if not obj_type:
 218             raise ValueError('None type')
 219         if type(obj_type) in types.StringTypes:
 220             raise ValueError('@type should be a list, not a string')
 221         if not isinstance(obj_type, collections.Sequence):
 222             raise ValueError('@type is not a sequence')
 223         return obj_type[0]
 224
 225     def get_schema_url(self, object_type):
 226         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 227
 228     def _is_encoded_object(self, obj):
 229         '''Test to see if an object is a JSON-LD object
 230
 231         Some of the nested dictionaries lack the @id or @type
 232         information necessary to convert them.
 233         '''
 234         if not isinstance(obj, collections.Iterable):
 235             return False
 236
 237         if '@id' in obj and '@type' in obj:
 238             return True
 239         return False
 240
 241     def patch_json(self, obj_id, changes):
 242         """Given a dictionary of changes push them as a HTTP patch request
 243         """
 244         url = self.prepare_url(obj_id)
 245         LOGGER.info('PATCHing to %s', url)
 246         payload = json.dumps(changes)
 247         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 248         if response.status_code != requests.codes.ok:
 249             LOGGER.error("Error http status: {}".format(response.status_code))
 250             LOGGER.error("Response: %s", response.text)
 251             response.raise_for_status()
 252         return response.json()
 253
 254     def put_json(self, obj_id, new_object):
 255         url = self.prepare_url(obj_id)
 256         LOGGER.info('PUTing to %s', url)
 257         payload = json.dumps(new_object)
 258         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 259         if response.status_code != requests.codes.created:
 260             LOGGER.error("Error http status: {}".format(response.status_code))
 261             response.raise_for_status()
 262         return response.json()
 263
 264     def post_json(self, collection_id, new_object):
 265         url = self.prepare_url(collection_id)
 266         LOGGER.info('POSTing to %s', url)
 267         payload = json.dumps(new_object)
 268
 269         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 270         if response.status_code != requests.codes.created:
 271             LOGGER.error("Error http status: {}".format(response.status_code))
 272             response.raise_for_status()
 273         return response.json()
 274
 275     def prepare_url(self, request_url):
 276         '''This attempts to provide some convienence for accessing a URL
 277
 278         Given a url fragment it will default to :
 279         * requests over http
 280         * requests to self.server
 281
 282         This allows fairly flexible urls. e.g.
 283
 284         prepare_url('/experiments/ENCSR000AEG')
 285         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 286         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 287
 288         should all return the same url
 289         '''
 290         # clean up potentially messy urls
 291         url = urlparse(request_url)._asdict()
 292         if not url['scheme']:
 293             url['scheme'] = self.scheme
 294         if not url['netloc']:
 295             url['netloc'] = self.server
 296         url = urlunparse(url.values())
 297         return url
 298
 299     def search_jsonld(self, term, **kwargs):
 300         '''Send search request to ENCODED
 301         '''
 302         url = self.prepare_url('/search/')
 303         result = self.get_json(url, searchTerm=term, **kwargs)
 304         self.convert_search_to_jsonld(result)
 305         return result
 306
 307     def convert_search_to_jsonld(self, result):
 308         '''Add the context to search result
 309
 310         Also remove hard to handle nested attributes
 311           e.g. remove object.term when we have no id
 312         '''
 313         graph = result['@graph']
 314         for i, obj in enumerate(graph):
 315             # suppress nested attributes
 316             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 317
 318         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 319         return result
 320
 321     def validate(self, obj, object_type=None):
 322         object_type = object_type if object_type else self.get_object_type(obj)
 323         schema_url = self.get_schema_url(object_type)
 324         if not schema_url:
 325             raise ValueError("Unable to construct schema url")
 326
 327         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 328         hidden = obj.copy()
 329         if '@id' in hidden:
 330             del hidden['@id']
 331         if '@type' in hidden:
 332             del hidden['@type']
 333         jsonschema.validate(hidden, schema)
 334
 335 class TypedColumnParser(object):
 336     @staticmethod
 337     def parse_sheet_array_type(value):
 338         """Helper function to parse :array columns in sheet
 339         """
 340         return value.split(', ')
 341
 342     @staticmethod
 343     def parse_sheet_integer_type(value):
 344         """Helper function to parse :integer columns in sheet
 345         """
 346         return int(value)
 347
 348     @staticmethod
 349     def parse_sheet_boolean_type(value):
 350         """Helper function to parse :boolean columns in sheet
 351         """
 352         return bool(value)
 353
 354     @staticmethod
 355     def parse_sheet_timestamp_type(value):
 356         """Helper function to parse :date columns in sheet
 357         """
 358         return value.strftime('%Y-%m-%d')
 359
 360     @staticmethod
 361     def parse_sheet_string_type(value):
 362         """Helper function to parse :string columns in sheet (the default)
 363         """
 364         return unicode(value)
 365
 366     def __getitem__(self, name):
 367         parser = {
 368             'array': self.parse_sheet_array_type,
 369             'boolean': self.parse_sheet_boolean_type,
 370             'integer': self.parse_sheet_integer_type,
 371             'date': self.parse_sheet_timestamp_type,
 372             'string': self.parse_sheet_string_type
 373         }.get(name)
 374         if parser:
 375             return parser
 376         else:
 377             raise RuntimeError("unrecognized column type")
 378
 379     def __call__(self, header, value):
 380         header = header.split(':')
 381         column_type = 'string'
 382         if len(header) > 1:
 383             if header[1] == 'skip':
 384                 return None, None
 385             else:
 386                 column_type = header[1]
 387         return header[0], self[column_type](value)
 388
 389 typed_column_parser = TypedColumnParser()
 390
 391 class Document(object):
 392     """Helper class for registering documents
 393
 394     Usage:
 395     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 396     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 397     lysis.create_if_needed(server, lysis_uuid)
 398     """
 399     award = 'U54HG006998'
 400     lab = '/labs/barbara-wold'
 401
 402     def __init__(self, url, document_type, description, aliases=None):
 403         self.url = url
 404         self.filename = os.path.basename(url)
 405         self.document_type = document_type
 406         self.description = description
 407
 408         self.references = []
 409         self.aliases = aliases if aliases is not None else []
 410         self.content_type = None
 411         self.document = None
 412         self.md5sum = None
 413         self.urls = None
 414         self.uuid = None
 415
 416         self.get_document()
 417
 418     def get_document(self):
 419         if os.path.exists(self.url):
 420             with open(self.url, 'r') as instream:
 421                 assert self.url.endswith('pdf')
 422                 self.content_type = 'application/pdf'
 423                 self.document = instream.read()
 424                 self.md5sum = hashlib.md5(self.document)
 425         else:
 426             req = requests.get(self.url)
 427             if req.status_code == 200:
 428                 self.content_type = req.headers['content-type']
 429                 self.document = req.content
 430                 self.md5sum = hashlib.md5(self.document)
 431                 self.urls = [self.url]
 432
 433     def create_payload(self):
 434         document_payload = {
 435             'attachment': {
 436               'download': self.filename,
 437               'type': self.content_type,
 438               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 439               'md5sum': self.md5sum.hexdigest()
 440             },
 441             'document_type': self.document_type,
 442             'description': self.description,
 443             'award': self.award,
 444             'lab': self.lab,
 445         }
 446         if self.aliases:
 447             document_payload['aliases'] = self.aliases
 448         if self.references:
 449             document_payload['references'] = self.references
 450         if self.urls:
 451             document_payload['urls'] = self.urls
 452
 453         return document_payload
 454
 455     def post(self, server):
 456         document_payload = self.create_payload()
 457         return server.post_json('/documents/', document_payload)
 458
 459     def save(self, filename):
 460         payload = self.create_payload()
 461         with open(filename, 'w') as outstream:
 462             outstream.write(pformat(payload))
 463
 464     def create_if_needed(self, server, uuid):
 465         self.uuid = uuid
 466         if uuid is None:
 467             return self.post(server)
 468         else:
 469             return server.get_json(uuid, embed=False)
 470
 471 if __name__ == '__main__':
 472     # try it
 473     from htsworkflow.util.rdfhelp import get_model, dump_model
 474     from htsworkflow.util.rdfjsonld import load_into_model
 475     from pprint import pprint
 476     model = get_model()
 477     logging.basicConfig(level=logging.DEBUG)
 478     encoded = ENCODED('test.encodedcc.org')
 479     encoded.load_netrc()
 480     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 481     pprint(body)
 482     load_into_model(model, body)
 483     #dump_model(model)