1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
15 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
17 LOGGER = logging.getLogger(__name__)
# NOTE(review): the opening of this mapping and the keys of its nested
# sections are not visible in this view of the file; the comments below only
# describe the entries that are shown.
20 # The None context will get added to the root of the tree and will
21 # provide common defaults.
23 # terms in multiple encoded objects
# Each term below is typed '@id', which tells a JSON-LD processor to treat the
# value as a link to another resource instead of a plain string.
24 'award': {'@type': '@id'},
25 'dataset': {'@type': '@id'},
26 'description': 'rdf:description',
27 'documents': {'@type': '@id'},
28 'experiment': {'@type': '@id'},
29 'href': {'@type': '@id'},
30 'lab': {'@type': '@id'},
31 'library': {'@type': '@id'},
32 'pi': {'@type': '@id'},
33 'platform': {'@type': '@id'},
34 'replicates': {'@type': '@id'},
35 'submitted_by': {'@type': '@id'},
36 'url': {'@type': '@id'},
38 # Identify and markup contained classes.
39 # e.g. in the tree there was a sub-dictionary named 'biosample'
40 # That dictionary had a term 'biosample_term_id', which is the
41 # term that should be used as the @id.
43 'biosample_term_id': {'@type': '@id'},
46 "assay_term_id": {"@type": "@id"},
47 "files": {"@type": "@id"},
48 "original_files": {"@type": "@id"},
50 # I tried to use the JSON-LD mapping capabilities to convert the lab
51 # contact information into a vcard record, but the encoded model
52 # didn't lend itself well to the vcard schema
54 # "address1": "vcard:street-address",
55 # "address2": "vcard:street-address",
56 # "city": "vcard:locality",
57 # "state": "vcard:region",
58 # "country": "vcard:country"
61 'nucleic_acid_term_id': {'@type': '@id'}
# FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened URL
    # syntax (e.g. write rdfs:label instead of
    # http://www.w3.org/2000/01/rdf-schema#label).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # Fixed: the scheme was misspelled "htp://", which produced IRIs that do
    # not match the Dublin Core element namespace.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
    # NOTE(review): no NTR prefix mapping is visible in this view of the
    # file -- confirm whether an entry belongs here.
}
88 ENCODED_SCHEMA_ROOT = '/profiles/'
92 '''Programmatic access to encoded, the software powering ENCODE3's submit site.
# Set up a client for one encoded server.
# NOTE(review): other methods read self.server, self.scheme, self.username,
# self.password and self.schemas; their initialisation is not visible in this
# view of the file and presumably sits between the lines shown here.
94 def __init__(self, server, contexts=None, namespaces=None):
# Fall back to the module-level defaults when the caller passes nothing. The
# test is truthiness, so an empty mapping also selects the default. The
# default object is stored as-is (not copied).
99 self.contexts = contexts if contexts else ENCODED_CONTEXT
100 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
# These headers are passed to every requests call made by this class.
101 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
# NOTE(review): the def line of this accessor is not visible in this view; the
# property() call below shows it is named get_auth.
# Returns the (username, password) pair that the request methods pass as
# `auth=`.
105 return (self.username, self.password)
# Expose the pair as the read-only attribute `self.auth`.
106 auth = property(get_auth)
def load_netrc(self):
    """Load login credentials for ``self.server`` from the user's netrc file.

    Sets ``self.username`` and ``self.password`` when the netrc file has an
    entry for the server (or a default entry); leaves both untouched when
    there is no matching entry.

    Raises:
      Whatever ``netrc.netrc()`` raises when the file is missing or
      malformed; those errors are deliberately not swallowed.
    """
    # Imported locally so this block does not depend on a module-level import.
    import netrc

    session = netrc.netrc()
    authenticators = session.authenticators(self.server)
    # authenticators() returns None when neither the host nor a default
    # entry is present, so guard before indexing.
    if authenticators is None:
        return
    # The tuple layout is (login, account, password).
    self.username = authenticators[0]
    self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Add JSON-LD contexts to the objects in the tree.

    Args:
      tree: a JSON tree (dict) returned from the DCC's encoded database.
        It is modified in place.
      default_base: base URL that relative ``@id`` values are resolved
        against.

    The per-class contexts come from ``self.contexts`` and the namespace
    prefixes from ``self.namespaces``; the prefixes are only added to the
    top-most context.
    """
    self.add_jsonld_child_context(tree, default_base)
    # The root only receives a '@context' from the call above when it has
    # both '@id' and '@type'; create an empty one otherwise so adding the
    # namespaces cannot fail with KeyError.
    self.add_jsonld_namespaces(tree.setdefault('@context', {}))
def add_jsonld_child_context(self, obj, default_base):
    '''Add JSON-LD context to the encoded JSON.

    This is recursive because some of the IDs were relative URLs
    and I needed a way to properly compute the correct base URL.

    Args:
      obj: any node of the JSON tree; containers are walked recursively
        and encoded objects are updated in place.
      default_base: base URL that relative ``@id`` values are resolved
        against.
    '''
    # The ABC aliases in the top-level ``collections`` module were removed
    # in Python 3.10; fall back to them only on Python 2.
    try:
        from collections.abc import Mapping, Sequence
    except ImportError:  # Python 2
        from collections import Mapping, Sequence

    # pretend strings aren't iterable
    if isinstance(obj, six.string_types):
        return

    # recurse on container types
    if isinstance(obj, Sequence):
        # how should I update lists?
        for v in obj:
            self.add_jsonld_child_context(v, default_base)
        return

    if isinstance(obj, Mapping):
        # Children are annotated before the mapping itself.
        for v in obj.values():
            self.add_jsonld_child_context(v, default_base)

    # we have an object. attach a context to it.
    if self._is_encoded_object(obj):
        context = self.create_jsonld_context(obj, default_base)
        # Merge into any context that is already present rather than
        # replacing it.
        obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace prefix shortcuts into a context.

    Only the top-most context of a tree needs them. ``context`` is updated
    in place; existing keys with the same prefix are overwritten.
    '''
    prefixes = self.namespaces
    context.update(prefixes)
def create_jsonld_context(self, obj, default_base):
    '''Build the JSON-LD context for one encoded object.

    The result starts with '@base' (the object's '@id' resolved against
    default_base) and '@vocab' (the schema URL of the object's type), then
    layers on:

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for each entry of the
                          object's '@type' that has one.
    '''
    primary_type = self.get_object_type(obj)
    base_url = urljoin(default_base, obj['@id'])
    vocab_url = self.get_schema_url(primary_type)
    context = {'@base': base_url, '@vocab': vocab_url}

    # Defaults first, so the type-specific entries can override them.
    context.update(self.contexts[None])
    for type_name in obj['@type']:
        if type_name not in self.contexts:
            continue
        context.update(self.contexts[type_name])

    return context
def get_json(self, obj_id, **kwargs):
    '''GET an ENCODE object as JSON and return as dict

    Uses prepare_url to allow url short-cuts
    if no keyword arguments are specified it will default to adding limit=all
    Alternative keyword arguments can be passed in and will be sent to the host.

    Known keywords are:
      limit - (integer or 'all') how many records to return, all for all of them
      embed - (bool) if true expands linking ids into their associated object.
      format - text/html or application/json

    Raises:
      requests.HTTPError: when the server answers with a 4xx/5xx status.
    '''
    if not kwargs:
        kwargs['limit'] = 'all'

    url = self.prepare_url(obj_id)
    LOGGER.info('requesting url: %s', url)

    # Never write the password itself to the log; only record whether one
    # is configured.
    LOGGER.debug('username: %s, password set: %s',
                 self.username, bool(self.password))

    # Only authenticate when both halves of the credentials are available.
    arguments = {}
    if self.username and self.password:
        arguments['auth'] = self.auth
    response = requests.get(url, headers=self.json_headers,
                            params=kwargs,
                            **arguments)
    if response.status_code != requests.codes.ok:
        LOGGER.error("Error http status: %s", response.status_code)
        # Raises only for 4xx/5xx; any other non-200 status falls through
        # and its body is still decoded below.
        response.raise_for_status()
    return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Fetch an ENCODE object and annotate it with JSON-LD contexts.

    Keyword arguments are forwarded unchanged to get_json; see that
    method for the ones the server understands.
    '''
    base_url = self.prepare_url(obj_id)
    # Named `tree` rather than `json` so the json module is not shadowed.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, base_url)
    return tree
def get_object_type(self, obj):
    """Return the type name for an encoded object.

    The '@type' attribute is expected to be a non-empty list of type
    names; the first entry is returned.
    NOTE(review): returning the first entry is reconstructed from how
    callers use the result (it is concatenated into a schema URL) --
    confirm against the full file.

    Raises:
      ValueError: if '@type' is missing, empty, a bare string, or not a
        sequence.
    """
    # ``collections.Sequence`` was removed in Python 3.10.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence

    obj_type = obj.get('@type')
    # Also rejects an empty list, which would otherwise surface as an
    # IndexError below.
    if not obj_type:
        raise ValueError('None type')
    if isinstance(obj_type, six.string_types):
        raise ValueError('@type should be a list, not a string')
    if not isinstance(obj_type, Sequence):
        raise ValueError('@type is not a sequence')
    return obj_type[0]
def get_schema_url(self, object_type):
    """Create the ENCODED jsonschema url.

    The schema can be named either by the object type or by the
    collection one posts to, so

      server.get_schema_url('experiment') and
      server.get_schema_url('/experiments/') both resolve to
      SERVER/profiles/experiment.json

    Args:
      object_type (str): either ENCODED object name or collection

    Returns:
      Schema URL with a trailing '#'.
    """
    type_by_collection = {
        '/biosamples/': 'biosample',
        '/datasets/': 'dataset',
        '/documents/': 'document',
        '/experiments/': 'experiment',
        '/libraries/': 'library',
        '/replicates/': 'replicate',
    }
    # Names that are not a known collection are used as given.
    if object_type in type_by_collection:
        object_type = type_by_collection[object_type]

    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    return self.prepare_url(schema_path) + '#'
def get_accession_name(self, collection):
    """Lookup common object accession name given a collection name.

    Args:
      collection (str): collection path such as '/experiments/'.

    Returns:
      Name of the attribute that holds the accession for that collection.

    Raises:
      RuntimeError: if the collection is not in the lookup table.
    """
    collection_to_accession_name = {
        '/experiments/': 'experiment_accession',
        '/biosamples/': 'biosample_accession',
        '/libraries/': 'library_accession',
        '/replicates/': 'uuid',
    }

    accession_name = collection_to_accession_name.get(collection, None)
    if accession_name is None:
        # Exceptions do not apply %-formatting to extra arguments the way
        # logging does, so build the message explicitly.
        raise RuntimeError(
            "Update list of collection to accession names for %s"
            % (collection,))

    return accession_name
278 def _is_encoded_object(self, obj):
279 '''Test to see if an object is a JSON-LD object
281 Some of the nested dictionaries lack the @id or @type
282 information necessary to convert them.
284 if not isinstance(obj, collections.Iterable):
287 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to the server as an HTTP PATCH.

    The changes are JSON-encoded and sent to the URL that prepare_url
    builds from obj_id. Returns the decoded JSON body of the response.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    response = requests.patch(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=json.dumps(changes),
    )
    succeeded = response.status_code == requests.codes.ok
    if not succeeded:
        LOGGER.error("Error http status: {}".format(response.status_code))
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def put_json(self, obj_id, new_object):
    """Send a replacement object to the server as an HTTP PUT.

    Args:
      obj_id: object id or URL, expanded with prepare_url.
      new_object: JSON-serialisable object to send.

    Returns:
      The decoded JSON body of the response.

    Raises:
      requests.HTTPError: when the server answers with a 4xx/5xx status.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    payload = json.dumps(new_object)
    response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
    # Anything other than 201 Created is logged as an error;
    # raise_for_status() itself only raises for 4xx/5xx.
    if response.status_code != requests.codes.created:
        LOGGER.error("Error http status: {}".format(response.status_code))
        # Log the body as patch_json does, so a rejected request can be
        # diagnosed from the log.
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Create a new object by sending it to a collection as an HTTP POST.

    Args:
      collection_id: collection path or URL, expanded with prepare_url.
      new_object: JSON-serialisable object to send.

    Returns:
      The decoded JSON body of the response.

    Raises:
      requests.HTTPError: when the server answers with a 4xx/5xx status.
    """
    url = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', url)
    payload = json.dumps(new_object)

    response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
    # Anything other than 201 Created is logged as an error;
    # raise_for_status() itself only raises for 4xx/5xx.
    if response.status_code != requests.codes.created:
        LOGGER.error("Error http status: {}".format(response.status_code))
        # Log the body as patch_json does, so a rejected request can be
        # diagnosed from the log.
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def prepare_url(self, request_url):
    '''Fill in the missing pieces of a partial URL.

    Whatever urlparse does not find in request_url is defaulted:

      * a missing scheme becomes self.scheme
      * a missing host becomes self.server

    so a bare path such as

      prepare_url('/experiments/ENCSR000AEG')

    is sent to self.server, while a complete URL such as

      prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    is returned unchanged.

    NOTE(review): urlparse treats a scheme-less 'host/path' string as a
    path only, so that form gets self.server prepended -- confirm callers
    do not rely on it.
    '''
    parts = urlparse(request_url)
    # Only replace components that are empty in the parsed result.
    if not parts.scheme:
        parts = parts._replace(scheme=self.scheme)
    if not parts.netloc:
        parts = parts._replace(netloc=self.server)
    return urlunparse(parts)
def search_jsonld(self, **kwargs):
    '''Run a search against ENCODED and return it as JSON-LD.

    Keyword arguments are forwarded to get_json as the query. The
    result is annotated in place by convert_search_to_jsonld before it
    is returned.
    '''
    search_url = self.prepare_url('/search/')
    hits = self.get_json(search_url, **kwargs)
    self.convert_search_to_jsonld(hits)
    return hits
def convert_search_to_jsonld(self, result):
    '''Annotate a search result with JSON-LD context, in place.

    Attributes whose name contains a '.' (nested attributes such as
    object.term, for which there is no id) are dropped from every entry
    of the '@graph' list before the context is added.
    '''
    entries = result['@graph']
    for index in range(len(entries)):
        # Replace each entry with a copy that lacks the dotted keys.
        entries[index] = dict(
            (key, value)
            for key, value in entries[index].items()
            if '.' not in key
        )

    base_url = self.prepare_url(result['@id'])
    self.add_jsonld_context(result, base_url)
    return result
374 def validate(self, obj, object_type=None):
375 """Validate an object against the ENCODED schema
378 obj (dictionary): object attributes to be submitted to encoded
379 object_type (string): ENCODED object name.
382 ValidationError: if the object does not conform to the schema.
384 object_type = object_type if object_type else self.get_object_type(obj)
# An explicit object_type wins; otherwise the type is read from the object.
385 schema_url = self.get_schema_url(object_type)
387 raise ValueError("Unable to construct schema url")
# Schemas are kept in self.schemas, keyed by object type.
# NOTE(review): the default passed to setdefault() is evaluated before the
# call, so get_json() runs on every validate() even when the schema is already
# stored -- consider testing `object_type in self.schemas` first.
389 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
# NOTE(review): the lines that create `hidden` are not visible in this view;
# presumably it is a copy of obj with the JSON-LD keys removed -- confirm.
393 if '@type' in hidden:
395 jsonschema.validate(hidden, schema)
class TypedColumnParser(object):
    """Convert spreadsheet cells according to a type named in the header.

    A header is written as ``name`` or ``name:type``. Calling the parser
    with a header and a cell value returns ``(name, converted_value)``;
    indexing it with a type name returns the conversion function.
    """

    @staticmethod
    def parse_sheet_array_type(value):
        """Helper function to parse :array columns in sheet
        """
        return value.split(', ')

    @staticmethod
    def parse_sheet_integer_type(value):
        """Helper function to parse :integer columns in sheet
        """
        return int(value)

    @staticmethod
    def parse_sheet_boolean_type(value):
        """Helper function to parse :boolean columns in sheet
        """
        # NOTE(review): plain truthiness, so the text 'False' converts to
        # True -- confirm that cells arrive as real booleans.
        return bool(value)

    @staticmethod
    def parse_sheet_timestamp_type(value):
        """Helper function to parse :date columns in sheet
        """
        return value.strftime('%Y-%m-%d')

    @staticmethod
    def parse_sheet_string_type(value):
        """Helper function to parse :string columns in sheet (the default)
        """
        # ``unicode`` only exists on Python 2; use ``str`` on Python 3
        # instead of failing with NameError.
        try:
            text_type = unicode  # noqa: F821
        except NameError:
            text_type = str
        return text_type(value)

    def __getitem__(self, name):
        """Return the conversion function for a column type name.

        Raises:
          RuntimeError: if the type name is not recognized.
        """
        parser = {
            'array': self.parse_sheet_array_type,
            'boolean': self.parse_sheet_boolean_type,
            'integer': self.parse_sheet_integer_type,
            'date': self.parse_sheet_timestamp_type,
            'string': self.parse_sheet_string_type
        }.get(name)
        if parser is None:
            raise RuntimeError("unrecognized column type")
        return parser

    def __call__(self, header, value):
        """Split ``header`` into name and type and convert ``value``.

        Returns:
          ``(name, converted_value)``; columns typed ``skip`` are not
          converted.
        """
        header = header.split(':')
        column_type = 'string'
        if len(header) > 1:
            if header[1] == 'skip':
                # NOTE(review): the value returned for skipped columns is
                # not visible in this view of the file; (None, None) keeps
                # the two-item shape of the normal return -- confirm.
                return None, None
            column_type = header[1]
        return header[0], self[column_type](value)


typed_column_parser = TypedColumnParser()
453 class Document(object):
454 """Helper class for registering documents
457 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
458 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
459 lysis.create_if_needed(server, lysis_uuid)
461 award = 'U54HG006998'
462 lab = '/labs/barbara-wold'
# Record what the document is and where it comes from. `aliases`, when given,
# must be a list; anything else raises ValueError.
# NOTE(review): several lines of this method are not visible in this view,
# among them the assignment of self.url, which get_document() reads.
464 def __init__(self, url, document_type, description, aliases=None):
466 self.filename = os.path.basename(url)
467 self.document_type = document_type
468 self.description = description
473 if isinstance(aliases, list):
474 self.aliases = aliases
476 raise ValueError("Aliases needs to be a list")
477 self.content_type = None
# Load the document body, either from a local file (when self.url is an
# existing path) or with an HTTP GET, and record its content type and an MD5
# hash object.
485 def get_document(self):
486 if os.path.exists(self.url):
# NOTE(review): the file is opened in text mode although the content is
# asserted to be a PDF. On Python 3 hashlib.md5() rejects str and decoding
# binary data can fail -- 'rb' looks like the intended mode.
487 with open(self.url, 'r') as instream:
# NOTE(review): assert is stripped under `python -O`, so this check is not
# reliable input validation.
488 assert self.url.endswith('pdf')
489 self.content_type = 'application/pdf'
490 self.document = instream.read()
491 self.md5sum = hashlib.md5(self.document)
# Remote branch: content type comes from the response headers.
# NOTE(review): what happens for a non-200 response is not visible here.
493 req = requests.get(self.url)
494 if req.status_code == 200:
495 self.content_type = req.headers['content-type']
496 self.document = req.content
497 self.md5sum = hashlib.md5(self.document)
498 self.urls = [self.url]
# Build the dictionary that is submitted to the server. The document body is
# embedded as a base64 "data:" URI next to its file name, content type and MD5
# hex digest.
500 def create_payload(self):
503 'download': self.filename,
504 'type': self.content_type,
# NOTE(review): base64.b64encode() returns bytes on Python 3, so concatenating
# it to a str raises TypeError there -- it likely needs .decode('ascii').
505 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
506 'md5sum': self.md5sum.hexdigest()
508 'document_type': self.document_type,
509 'description': self.description,
# NOTE(review): the conditions guarding the three optional keys below are not
# visible in this view.
514 document_payload['aliases'] = self.aliases
516 document_payload['references'] = self.references
518 document_payload['urls'] = self.urls
520 return document_payload
# Validate the payload against the 'document' schema, then POST it to the
# /documents/ collection and return the server's reply.
522 def post(self, server):
523 document_payload = self.create_payload()
524 server.validate(document_payload, 'document')
525 return server.post_json('/documents/', document_payload)
# Write a pretty-printed copy of the payload to a local file.
527 def save(self, filename):
528 payload = self.create_payload()
529 with open(filename, 'w') as outstream:
530 outstream.write(pformat(payload))
# Either post the document or fetch the existing record for `uuid` (without
# embedding linked objects).
# NOTE(review): the condition that chooses between the two is not visible in
# this view.
532 def create_if_needed(self, server, uuid):
535 return self.post(server)
537 return server.get_json(uuid, embed=False)
# Manual smoke test: fetch one experiment from the test server as JSON-LD and
# load it into an RDF model.
539 if __name__ == '__main__':
541 from htsworkflow.util.rdfhelp import get_model, dump_model
542 from htsworkflow.util.rdfjsonld import load_into_model
543 from pprint import pprint
545 logging.basicConfig(level=logging.DEBUG)
546 encoded = ENCODED('test.encodedcc.org')
548 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
# NOTE(review): `model` is created on a line that is not visible in this view
# (presumably with get_model()) -- confirm.
550 load_into_model(model, body)