htsworkflow/submission/encoded.py

   1 """Interface with encoded software for ENCODE3 data submission & warehouse
   2
This allows retrieving encoded objects as JSON or JSON-LD, submitting
changes back to the server, and registering documents.
   4 """
from __future__ import print_function

import base64
import collections
import hashlib
import json
import logging
import os
import types

try:  # Python 3
    from collections.abc import Mapping, Sequence
    from urllib.parse import urljoin, urlparse, urlunparse
except ImportError:  # Python 2
    from collections import Mapping, Sequence
    from urlparse import urljoin, urlparse, urlunparse

import jsonschema
import requests
  16
  17 LOGGER = logging.getLogger(__name__)
  18
  19 ENCODED_CONTEXT = {
  20     # The None context will get added to the root of the tree and will
  21     # provide common defaults.
  22     None: {
  23         # terms in multiple encoded objects
  24         'award': {'@type': '@id'},
  25         'dataset': {'@type': '@id'},
  26         'description': 'rdf:description',
  27         'documents': {'@type': '@id'},
  28         'experiment': {'@type': '@id'},
  29         'href': {'@type': '@id'},
  30         'lab': {'@type': '@id'},
  31         'library': {'@type': '@id'},
  32         'pi': {'@type': '@id'},
  33         'platform': {'@type': '@id'},
  34         'replicates': {'@type': '@id'},
  35         'submitted_by': {'@type': '@id'},
  36         'url': {'@type': '@id'},
  37     },
  38     # Identify and markup contained classes.
  39     # e.g. in the tree there was a sub-dictionary named 'biosample'
  40     # That dictionary had a term 'biosample_term_id, which is the
  41     # term that should be used as the @id.
  42     'biosample': {
  43         'biosample_term_id': {'@type': '@id'},
  44     },
  45     'experiment': {
  46         "assay_term_id": {"@type": "@id"},
  47         "files": {"@type": "@id"},
  48         "original_files": {"@type": "@id"},
  49     },
  50     # I tried to use the JSON-LD mapping capabilities to convert the lab
  51     # contact information into a vcard record, but the encoded model
  52     # didn't lend itself well to the vcard schema
  53     #'lab': {
  54     #    "address1": "vcard:street-address",
  55     #    "address2": "vcard:street-address",
  56     #    "city": "vcard:locality",
  57     #    "state": "vcard:region",
  58     #    "country": "vcard:country"
  59     #},
  60     'library': {
  61         'nucleic_acid_term_id': {'@type': '@id'}
  62     }
  63 }
  64
  65 #FIXME: this needs to be initialized from rdfns
  66 ENCODED_NAMESPACES = {
  67     # JSON-LD lets you define namespaces so you can used the shorted url syntax.
  68     # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
  69     # rdfs:label)
  70     "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  71     "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
  72     "owl": "http://www.w3.org/2002/07/owl#",
  73     "dc": "htp://purl.org/dc/elements/1.1/",
  74     "xsd": "http://www.w3.org/2001/XMLSchema#",
  75     "vcard": "http://www.w3.org/2006/vcard/ns#",
  76
  77     # for some namespaces I made a best guess for the ontology root.
  78     "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
  79     "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
  80     "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
  81     # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
  82     "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
  83     # SO: available from http://www.berkeleybop.org/ontologies/so.owl
  84     # NTR: New Term Request space for DCC to implement new ontology terms
  85
  86 }
  87
  88 ENCODED_SCHEMA_ROOT = '/profiles/'
  89
  90
  91 class ENCODED:
  92     '''Programatic access encoded, the software powering ENCODE3's submit site.
  93     '''
  94     def __init__(self, server, contexts=None, namespaces=None):
  95         self.server = server
  96         self.scheme = 'https'
  97         self.username = None
  98         self.password = None
  99         self.contexts = contexts if contexts else ENCODED_CONTEXT
 100         self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
 101         self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
 102         self.schemas = {}
 103
 104     def get_auth(self):
 105         return (self.username, self.password)
 106     auth = property(get_auth)
 107
 108     def load_netrc(self):
 109         import netrc
 110         session = netrc.netrc()
 111         authenticators = session.authenticators(self.server)
 112         if authenticators:
 113             self.username = authenticators[0]
 114             self.password = authenticators[2]
 115
 116     def add_jsonld_context(self, tree, default_base):
 117         """Add contexts to various objects in the tree.
 118
 119         tree is a json tree returned from the DCC's encoded database.
 120         contexts is a dictionary of dictionaries containing contexts
 121                 for the various  possible encoded classes.
 122         base, if supplied allows setting the base url that relative
 123             urls will be resolved against.
 124         """
 125         self.add_jsonld_child_context(tree, default_base)
 126         self.add_jsonld_namespaces(tree['@context'])
 127
 128     def add_jsonld_child_context(self, obj, default_base):
 129         '''Add JSON-LD context to the encoded JSON.
 130
 131         This is recursive because some of the IDs were relative URLs
 132         and I needed a way to properly compute a the correct base URL.
 133         '''
 134         # pretend strings aren't iterable
 135         if type(obj) in types.StringTypes:
 136             return
 137
 138         # recurse on container types
 139         if isinstance(obj, collections.Sequence):
 140             # how should I update lists?
 141             for v in obj:
 142                 self.add_jsonld_child_context(v, default_base)
 143             return
 144
 145         if isinstance(obj, collections.Mapping):
 146             for v in obj.values():
 147                 self.add_jsonld_child_context(v, default_base)
 148
 149         # we have an object. attach a context to it.
 150         if self._is_encoded_object(obj):
 151             context = self.create_jsonld_context(obj, default_base)
 152             if len(context) > 0:
 153                 obj.setdefault('@context', {}).update(context)
 154
 155     def add_jsonld_namespaces(self, context):
 156         '''Add shortcut namespaces to a context
 157
 158         Only needs to be run on the top-most context
 159         '''
 160         context.update(self.namespaces)
 161
 162     def create_jsonld_context(self, obj, default_base):
 163         '''Synthesize the context for a encoded type
 164
 165         self.contexts[None] = default context attributes added to any type
 166         self.contexts[type] = context attributes for this type.
 167         '''
 168         context = {'@base': urljoin(default_base, obj['@id']),
 169                    '@vocab': self.get_schema_url(obj)}
 170         # add in defaults
 171         context.update(self.contexts[None])
 172         for t in obj['@type']:
 173             if t in self.contexts:
 174                 context.update(self.contexts[t])
 175         return context
 176
 177     def get_json(self, obj_id, **kwargs):
 178         '''GET an ENCODE object as JSON and return as dict
 179
 180         Uses prepare_url to allow url short-cuts
 181         if no keyword arguments are specified it will default to adding limit=all
 182         Alternative keyword arguments can be passed in and will be sent to the host.
 183
 184         Known keywords are:
 185           limit - (integer or 'all') how many records to return, all for all of them
 186           embed - (bool) if true expands linking ids into their associated object.
 187           format - text/html or application/json
 188         '''
 189         if len(kwargs) == 0:
 190             kwargs['limit'] = 'all'
 191
 192         url = self.prepare_url(obj_id)
 193         LOGGER.info('requesting url: {}'.format(url))
 194
 195         # do the request
 196
 197         LOGGER.debug('username: %s, password: %s', self.username, self.password)
 198         response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
 199         if not response.status_code == requests.codes.ok:
 200             LOGGER.error("Error http status: {}".format(response.status_code))
 201             response.raise_for_status()
 202         return response.json()
 203
 204     def get_jsonld(self, obj_id, **kwargs):
 205         '''Get ENCODE object as JSONLD annotated with classses contexts
 206
 207         see get_json for documentation about what keywords can be passed.
 208         '''
 209         url = self.prepare_url(obj_id)
 210         json = self.get_json(obj_id, **kwargs)
 211         self.add_jsonld_context(json, url)
 212         return json
 213
 214     def get_object_type(self, obj):
 215         """Return type for a encoded object
 216         """
 217         obj_type = obj.get('@type')
 218         if not obj_type:
 219             raise ValueError('None type')
 220         if type(obj_type) in types.StringTypes:
 221             raise ValueError('@type should be a list, not a string')
 222         if not isinstance(obj_type, collections.Sequence):
 223             raise ValueError('@type is not a sequence')
 224         return obj_type[0]
 225
 226     def get_schema_url(self, object_type):
 227         return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
 228
 229     def _is_encoded_object(self, obj):
 230         '''Test to see if an object is a JSON-LD object
 231
 232         Some of the nested dictionaries lack the @id or @type
 233         information necessary to convert them.
 234         '''
 235         if not isinstance(obj, collections.Iterable):
 236             return False
 237
 238         if '@id' in obj and '@type' in obj:
 239             return True
 240         return False
 241
 242     def patch_json(self, obj_id, changes):
 243         """Given a dictionary of changes push them as a HTTP patch request
 244         """
 245         url = self.prepare_url(obj_id)
 246         LOGGER.info('PATCHing to %s', url)
 247         payload = json.dumps(changes)
 248         response = requests.patch(url, auth=self.auth, headers=self.json_headers, data=payload)
 249         if response.status_code != requests.codes.ok:
 250             LOGGER.error("Error http status: {}".format(response.status_code))
 251             LOGGER.error("Response: %s", response.text)
 252             response.raise_for_status()
 253         return response.json()
 254
 255     def put_json(self, obj_id, new_object):
 256         url = self.prepare_url(obj_id)
 257         LOGGER.info('PUTing to %s', url)
 258         payload = json.dumps(new_object)
 259         response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
 260         if response.status_code != requests.codes.created:
 261             LOGGER.error("Error http status: {}".format(response.status_code))
 262             response.raise_for_status()
 263         return response.json()
 264
 265     def post_json(self, collection_id, new_object):
 266         url = self.prepare_url(collection_id)
 267         LOGGER.info('POSTing to %s', url)
 268         payload = json.dumps(new_object)
 269
 270         response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
 271         if response.status_code != requests.codes.created:
 272             LOGGER.error("Error http status: {}".format(response.status_code))
 273             response.raise_for_status()
 274         return response.json()
 275
 276     def prepare_url(self, request_url):
 277         '''This attempts to provide some convienence for accessing a URL
 278
 279         Given a url fragment it will default to :
 280         * requests over http
 281         * requests to self.server
 282
 283         This allows fairly flexible urls. e.g.
 284
 285         prepare_url('/experiments/ENCSR000AEG')
 286         prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
 287         prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
 288
 289         should all return the same url
 290         '''
 291         # clean up potentially messy urls
 292         url = urlparse(request_url)._asdict()
 293         if not url['scheme']:
 294             url['scheme'] = self.scheme
 295         if not url['netloc']:
 296             url['netloc'] = self.server
 297         url = urlunparse(url.values())
 298         return url
 299
 300     def search_jsonld(self, term, **kwargs):
 301         '''Send search request to ENCODED
 302         '''
 303         url = self.prepare_url('/search/')
 304         result = self.get_json(url, searchTerm=term, **kwargs)
 305         self.convert_search_to_jsonld(result)
 306         return result
 307
 308     def convert_search_to_jsonld(self, result):
 309         '''Add the context to search result
 310
 311         Also remove hard to handle nested attributes
 312           e.g. remove object.term when we have no id
 313         '''
 314         graph = result['@graph']
 315         for i, obj in enumerate(graph):
 316             # suppress nested attributes
 317             graph[i] = {k: v for k, v in obj.items() if '.' not in k}
 318
 319         self.add_jsonld_context(result, self.prepare_url(result['@id']))
 320         return result
 321
 322     def validate(self, obj, object_type=None):
 323         object_type = object_type if object_type else self.get_object_type(obj)
 324         schema_url = self.get_schema_url(object_type)
 325         if not schema_url:
 326             raise ValueError("Unable to construct schema url")
 327
 328         schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
 329         hidden = obj.copy()
 330         if '@id' in hidden:
 331             del hidden['@id']
 332         if '@type' in hidden:
 333             del hidden['@type']
 334         jsonschema.validate(hidden, schema)
 335
 336 class TypedColumnParser(object):
 337     @staticmethod
 338     def parse_sheet_array_type(value):
 339         """Helper function to parse :array columns in sheet
 340         """
 341         return value.split(', ')
 342
 343     @staticmethod
 344     def parse_sheet_integer_type(value):
 345         """Helper function to parse :integer columns in sheet
 346         """
 347         return int(value)
 348
 349     @staticmethod
 350     def parse_sheet_boolean_type(value):
 351         """Helper function to parse :boolean columns in sheet
 352         """
 353         return bool(value)
 354
 355     @staticmethod
 356     def parse_sheet_timestamp_type(value):
 357         """Helper function to parse :date columns in sheet
 358         """
 359         return value.strftime('%Y-%m-%d')
 360
 361     @staticmethod
 362     def parse_sheet_string_type(value):
 363         """Helper function to parse :string columns in sheet (the default)
 364         """
 365         return unicode(value)
 366
 367     def __getitem__(self, name):
 368         parser = {
 369             'array': self.parse_sheet_array_type,
 370             'boolean': self.parse_sheet_boolean_type,
 371             'integer': self.parse_sheet_integer_type,
 372             'date': self.parse_sheet_timestamp_type,
 373             'string': self.parse_sheet_string_type
 374         }.get(name)
 375         if parser:
 376             return parser
 377         else:
 378             raise RuntimeError("unrecognized column type")
 379
 380     def __call__(self, header, value):
 381         header = header.split(':')
 382         column_type = 'string'
 383         if len(header) > 1:
 384             if header[1] == 'skip':
 385                 return None, None
 386             else:
 387                 column_type = header[1]
 388         return header[0], self[column_type](value)
 389
 390 typed_column_parser = TypedColumnParser()
 391
 392 class Document(object):
 393     """Helper class for registering documents
 394
 395     Usage:
 396     lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
 397     lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
 398     lysis.create_if_needed(server, lysis_uuid)
 399     """
 400     award = 'U54HG006998'
 401     lab = '/labs/barbara-wold'
 402
 403     def __init__(self, url, document_type, description, aliases=None):
 404         self.url = url
 405         self.filename = os.path.basename(url)
 406         self.document_type = document_type
 407         self.description = description
 408
 409         self.references = []
 410         self.aliases = aliases if aliases is not None else []
 411         self.content_type = None
 412         self.document = None
 413         self.md5sum = None
 414         self.urls = None
 415         self.uuid = None
 416
 417         self.get_document()
 418
 419     def get_document(self):
 420         if os.path.exists(self.url):
 421             with open(self.url, 'r') as instream:
 422                 assert self.url.endswith('pdf')
 423                 self.content_type = 'application/pdf'
 424                 self.document = instream.read()
 425                 self.md5sum = hashlib.md5(self.document)
 426         else:
 427             req = requests.get(self.url)
 428             if req.status_code == 200:
 429                 self.content_type = req.headers['content-type']
 430                 self.document = req.content
 431                 self.md5sum = hashlib.md5(self.document)
 432                 self.urls = [self.url]
 433
 434     def create_payload(self):
 435         document_payload = {
 436             'attachment': {
 437               'download': self.filename,
 438               'type': self.content_type,
 439               'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
 440               'md5sum': self.md5sum.hexdigest()
 441             },
 442             'document_type': self.document_type,
 443             'description': self.description,
 444             'award': self.award,
 445             'lab': self.lab,
 446         }
 447         if self.aliases:
 448             document_payload['aliases'] = self.aliases
 449         if self.references:
 450             document_payload['references'] = self.references
 451         if self.urls:
 452             document_payload['urls'] = self.urls
 453
 454         return document_payload
 455
 456     def post(self, server):
 457         document_payload = self.create_payload()
 458         return server.post_json('/documents/', document_payload)
 459
 460     def save(self, filename):
 461         payload = self.create_payload()
 462         with open(filename, 'w') as outstream:
 463             outstream.write(pformat(payload))
 464
 465     def create_if_needed(self, server, uuid):
 466         self.uuid = uuid
 467         if uuid is None:
 468             return self.post(server)
 469         else:
 470             return server.get_json(uuid, embed=False)
 471
 472 if __name__ == '__main__':
 473     # try it
 474     from htsworkflow.util.rdfhelp import get_model, dump_model
 475     from htsworkflow.util.rdfjsonld import load_into_model
 476     from pprint import pprint
 477     model = get_model()
 478     logging.basicConfig(level=logging.DEBUG)
 479     encoded = ENCODED('test.encodedcc.org')
 480     encoded.load_netrc()
 481     body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
 482     pprint(body)
 483     load_into_model(model, body)
 484     #dump_model(model)