1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
15 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
17 LOGGER = logging.getLogger(__name__)
20 # The None context will get added to the root of the tree and will
21 # provide common defaults.
23 # terms in multiple encoded objects
24 'award': {'@type': '@id'},
25 'dataset': {'@type': '@id'},
26 'description': 'rdf:description',
27 'documents': {'@type': '@id'},
28 'experiment': {'@type': '@id'},
29 'href': {'@type': '@id'},
30 'lab': {'@type': '@id'},
31 'library': {'@type': '@id'},
32 'pi': {'@type': '@id'},
33 'platform': {'@type': '@id'},
34 'replicates': {'@type': '@id'},
35 'submitted_by': {'@type': '@id'},
36 'url': {'@type': '@id'},
38 # Identify and markup contained classes.
39 # e.g. in the tree there was a sub-dictionary named 'biosample'
40 # That dictionary had a term 'biosample_term_id', which is the
41 # term that should be used as the @id.
43 'biosample_term_id': {'@type': '@id'},
46 "assay_term_id": {"@type": "@id"},
47 "files": {"@type": "@id"},
48 "original_files": {"@type": "@id"},
50 # I tried to use the JSON-LD mapping capabilities to convert the lab
51 # contact information into a vcard record, but the encoded model
52 # didn't lend itself well to the vcard schema
54 # "address1": "vcard:street-address",
55 # "address2": "vcard:street-address",
56 # "city": "vcard:locality",
57 # "state": "vcard:region",
58 # "country": "vcard:country"
61 'nucleic_acid_term_id': {'@type': '@id'}
# FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened url
    # syntax (instead of http://www.w3.org/2000/01/rdf-schema#label you can
    # write rdfs:label).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
}
88 ENCODED_SCHEMA_ROOT = '/profiles/'
92 '''Programmatic access to encoded, the software powering ENCODE3's submit site.
94 def __init__(self, server, contexts=None, namespaces=None):
99 self.contexts = contexts if contexts else ENCODED_CONTEXT
100 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
101 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
105 return (self.username, self.password)
106 auth = property(get_auth)
def load_netrc(self):
    """Load the login for self.server from the user's netrc file.

    Sets self.username and self.password when the netrc file has an entry
    for self.server; leaves both untouched when it does not.
    """
    # Imported locally: nothing else in this module needs netrc.
    import netrc

    session = netrc.netrc()
    authenticators = session.authenticators(self.server)
    # authenticators() returns None when there is no matching entry, so
    # only unpack the (login, account, password) tuple when one was found.
    if authenticators:
        self.username = authenticators[0]
        self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Add contexts to various objects in the tree, in place.

    tree is a json tree returned from the DCC's encoded database.
    default_base is the base url that relative urls in the tree
    will be resolved against.

    The per-object contexts come from add_jsonld_child_context; the
    namespace shortcuts are then merged into the root context.
    """
    self.add_jsonld_child_context(tree, default_base)
    # The root only receives an '@context' when it carries both '@id' and
    # '@type', so create an empty one instead of failing with KeyError.
    self.add_jsonld_namespaces(tree.setdefault('@context', {}))
def add_jsonld_child_context(self, obj, default_base):
    '''Add JSON-LD context to the encoded JSON.

    This is recursive because some of the IDs were relative URLs
    and I needed a way to properly compute the correct base URL.

    obj is modified in place: every nested dictionary that looks like an
    encoded object (see _is_encoded_object) gets an '@context' entry.
    '''
    # The ABC aliases were removed from the collections module in
    # Python 3.10; fall back to the old location for Python 2.
    try:
        from collections.abc import Mapping, Sequence
    except ImportError:
        from collections import Mapping, Sequence

    # pretend strings aren't iterable
    if isinstance(obj, six.string_types):
        return

    # recurse on container types
    if isinstance(obj, Sequence):
        # how should I update lists?
        for v in obj:
            self.add_jsonld_child_context(v, default_base)
        return

    if isinstance(obj, Mapping):
        for v in obj.values():
            self.add_jsonld_child_context(v, default_base)

    # we have an object. attach a context to it.
    if self._is_encoded_object(obj):
        context = self.create_jsonld_context(obj, default_base)
        if context:
            obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace prefix shortcuts into a context, in place.

    Only the top-most context of a tree needs this.
    '''
    prefixes = self.namespaces
    context.update(prefixes)
def create_jsonld_context(self, obj, default_base):
    '''Synthesize the context for a encoded type

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    Returns a new dictionary holding '@base' (the object's '@id' resolved
    against default_base), '@vocab' (the schema url of the object's type)
    and the attributes from every matching entry of self.contexts.
    '''
    obj_type = self.get_object_type(obj)
    context = {
        '@base': urljoin(default_base, obj['@id']),
        '@vocab': self.get_schema_url(obj_type),
    }
    # A caller-supplied contexts table is not required to define the
    # None defaults, so do not insist on the key being present.
    context.update(self.contexts.get(None, {}))
    # Later types override earlier ones when they define the same term.
    for t in obj['@type']:
        if t in self.contexts:
            context.update(self.contexts[t])
    return context
def get_json(self, obj_id, **kwargs):
    '''GET an ENCODE object as JSON and return as dict

    Uses prepare_url to allow url short-cuts
    if no keyword arguments are specified it will default to adding limit=all
    Alternative keyword arguments can be passed in and will be sent to the host.

    Known keywords are:
      limit - (integer or 'all') how many records to return, all for all of them
      embed - (bool) if true expands linking ids into their associated object.
      format - text/html or application/json

    Raises requests.HTTPError (via raise_for_status) on an HTTP error reply.
    '''
    if not kwargs:
        kwargs['limit'] = 'all'

    url = self.prepare_url(obj_id)
    LOGGER.info('requesting url: %s', url)

    # Never write the password itself to the log; only note whether one is set.
    LOGGER.debug('username: %s, password set: %s',
                 self.username, bool(self.password))
    # Only authenticate when complete credentials are available.
    arguments = {}
    if self.username and self.password:
        arguments['auth'] = self.auth
    response = requests.get(url, headers=self.json_headers,
                            params=kwargs,
                            **arguments)
    if response.status_code != requests.codes.ok:
        LOGGER.error("Error http status: %s", response.status_code)
        response.raise_for_status()
    return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Get ENCODE object as JSON-LD annotated with class contexts

    see get_json for documentation about what keywords can be passed.

    Returns the dictionary from get_json after add_jsonld_context has
    annotated it, using the object's full url as the base url.
    '''
    url = self.prepare_url(obj_id)
    # Named 'tree' rather than 'json' so the json module is not shadowed.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
def get_object_type(self, obj):
    """Return type for a encoded object

    The type is the first entry of the object's '@type' list.

    Raises ValueError when '@type' is missing or empty, is a plain
    string, or is not a sequence.
    """
    # The ABC aliases were removed from the collections module in
    # Python 3.10; fall back to the old location for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    obj_type = obj.get('@type')
    # Also rejects an empty list, which has no first entry to return.
    if not obj_type:
        raise ValueError('None type')
    if isinstance(obj_type, six.string_types):
        raise ValueError('@type should be a list, not a string')
    if not isinstance(obj_type, Sequence):
        raise ValueError('@type is not a sequence')
    return obj_type[0]
def get_schema_url(self, object_type):
    """Create the ENCODED jsonschema url.

    Return the ENCODED object schema url by either
    object type name or the collection name one posts to.

    For example
       server.get_schema_url('experiment') and
       server.get_schema_url('/experiments/') both resolve to
       SERVER/profiles/experiment.json

    Args:
       object_type (str): either ENCODED object name or collection

    Returns:
       The schema url, with a trailing '#'.
    """
    collection_to_type = {
        '/biosamples/': 'biosample',
        '/datasets/': 'dataset',
        '/documents/': 'document',
        '/experiments/': 'experiment',
        '/libraries/': 'library',
        '/replicates/': 'replicate',
    }
    # Translate a collection path into its object name; anything else is
    # taken to be an object name already.
    if object_type in collection_to_type:
        object_type = collection_to_type[object_type]

    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    return self.prepare_url(schema_path) + '#'
261 def _is_encoded_object(self, obj):
262 '''Test to see if an object is a JSON-LD object
264 Some of the nested dictionaries lack the @id or @type
265 information necessary to convert them.
267 if not isinstance(obj, collections.Iterable):
270 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to an object as an HTTP PATCH request.

    obj_id is expanded with prepare_url and changes is serialized to JSON.
    Returns the decoded JSON reply; a reply other than 200 OK is logged
    and raise_for_status is given the chance to raise.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    body = json.dumps(changes)
    reply = requests.patch(target, auth=self.auth, headers=self.json_headers, data=body)
    if reply.status_code == requests.codes.ok:
        return reply.json()
    LOGGER.error("Error http status: {}".format(reply.status_code))
    LOGGER.error("Response: %s", reply.text)
    reply.raise_for_status()
    return reply.json()
def put_json(self, obj_id, new_object):
    """Replace (or create) the object at obj_id with an HTTP PUT request.

    obj_id is expanded with prepare_url and new_object is serialized to
    JSON. Returns the decoded JSON reply. Raises requests.HTTPError (via
    raise_for_status) on an HTTP error reply.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    payload = json.dumps(new_object)
    response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
    # A PUT that replaces an existing resource is answered with 200 OK and
    # one that creates it with 201 Created; neither is an error.
    if response.status_code not in (requests.codes.ok, requests.codes.created):
        LOGGER.error("Error http status: %s", response.status_code)
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Create a new object by POSTing it to a collection.

    collection_id is expanded with prepare_url and new_object is
    serialized to JSON. Returns the decoded JSON reply; a reply other than
    201 Created is logged and raise_for_status is given the chance to raise.
    """
    target = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', target)
    body = json.dumps(new_object)

    reply = requests.post(target, auth=self.auth, headers=self.json_headers, data=body)
    if reply.status_code == requests.codes.created:
        return reply.json()
    LOGGER.error("Error http status: {}".format(reply.status_code))
    reply.raise_for_status()
    return reply.json()
def prepare_url(self, request_url):
    '''This attempts to provide some convenience for accessing a URL

    Given a url fragment it will default to :
    * requests using self.scheme
    * requests to self.server

    This allows fairly flexible urls. e.g. when self.server is
    submit.encodedcc.org

    prepare_url('/experiments/ENCSR000AEG')
    prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')

    both name the same object on self.server, while a complete url such as

    prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    is returned as given, query string included.
    '''
    # clean up potentially messy urls
    url = urlparse(request_url)
    scheme, netloc, path = url.scheme, url.netloc, url.path
    if not scheme and not netloc:
        # Without a leading '//' urlparse files 'host/path' entirely under
        # path. Only a first segment equal to our own server is treated
        # as a host; any other relative path is left alone.
        host, slash, rest = path.partition('/')
        if host and host == self.server:
            netloc, path = host, slash + rest
    url = url._replace(scheme=scheme or self.scheme,
                       netloc=netloc or self.server,
                       path=path)
    return urlunparse(url)
def search_jsonld(self, **kwargs):
    '''Send search request to ENCODED

    The keyword arguments are passed to get_json as the query, e.g. to
    do a general search do
        search_jsonld(searchTerm=term)

    Returns the search result after convert_search_to_jsonld has added
    the JSON-LD context to it.
    '''
    url = self.prepare_url('/search/')
    result = self.get_json(url, **kwargs)
    self.convert_search_to_jsonld(result)
    return result
def convert_search_to_jsonld(self, result):
    '''Add the context to search result

    Also remove hard to handle nested attributes
      e.g. remove object.term when we have no id

    result is modified in place and also returned.
    '''
    graph = result['@graph']
    # suppress nested attributes (keys containing a '.'); slice assignment
    # keeps result['@graph'] the same list object.
    graph[:] = [
        {k: v for k, v in obj.items() if '.' not in k}
        for obj in graph
    ]

    self.add_jsonld_context(result, self.prepare_url(result['@id']))
    return result
def validate(self, obj, object_type=None):
    """Validate obj against the ENCODED json schema for its type.

    object_type defaults to the type reported by get_object_type(obj).
    Schemas are cached in self.schemas, keyed by object type.

    Raises ValueError when no schema url can be constructed; validation
    failures are raised by jsonschema.validate.
    """
    object_type = object_type if object_type else self.get_object_type(obj)
    schema_url = self.get_schema_url(object_type)
    if not schema_url:
        raise ValueError("Unable to construct schema url")

    # dict.setdefault(key, self.get_json(...)) would evaluate get_json,
    # an HTTP request, even when the schema is already cached.
    if object_type not in self.schemas:
        self.schemas[object_type] = self.get_json(schema_url)
    schema = self.schemas[object_type]

    # Validate a copy without the JSON-LD markup so the caller's object
    # is left untouched.
    # NOTE(review): presumably the schemas do not declare '@id'/'@type' - confirm.
    hidden = obj.copy()
    hidden.pop('@id', None)
    hidden.pop('@type', None)
    jsonschema.validate(hidden, schema)
class TypedColumnParser(object):
    """Convert spreadsheet cells according to a 'name:type' column header.

    Calling an instance with (header, value) returns (name, parsed value).
    A header without a ':type' suffix is parsed as a string and the type
    'skip' returns (None, None). Indexing an instance with a type name
    returns the parser function for that type.
    """

    @staticmethod
    def parse_sheet_array_type(value):
        """Helper function to parse :array columns in sheet
        """
        return value.split(', ')

    @staticmethod
    def parse_sheet_integer_type(value):
        """Helper function to parse :integer columns in sheet
        """
        return int(value)

    @staticmethod
    def parse_sheet_boolean_type(value):
        """Helper function to parse :boolean columns in sheet
        """
        # NOTE(review): bool() of any non-empty string, including 'False',
        # is True - confirm that cells arrive as real booleans.
        return bool(value)

    @staticmethod
    def parse_sheet_timestamp_type(value):
        """Helper function to parse :date columns in sheet
        """
        return value.strftime('%Y-%m-%d')

    @staticmethod
    def parse_sheet_string_type(value):
        """Helper function to parse :string columns in sheet (the default)
        """
        # 'unicode' only exists on Python 2; Python 3's str is the
        # equivalent text type.
        try:
            text_type = unicode
        except NameError:
            text_type = str
        return text_type(value)

    def __getitem__(self, name):
        """Return the parser for column type name.

        Raises RuntimeError for an unknown column type.
        """
        parser = {
            'array': self.parse_sheet_array_type,
            'boolean': self.parse_sheet_boolean_type,
            'integer': self.parse_sheet_integer_type,
            'date': self.parse_sheet_timestamp_type,
            'string': self.parse_sheet_string_type
        }.get(name)
        if parser is None:
            raise RuntimeError("unrecognized column type")
        return parser

    def __call__(self, header, value):
        """Parse value as directed by header; return (column name, value)."""
        header = header.split(':')
        column_type = 'string'
        if len(header) > 1:
            if header[1] == 'skip':
                return None, None
            column_type = header[1]
        return header[0], self[column_type](value)


typed_column_parser = TypedColumnParser()
class Document(object):
    """Helper class for registering documents

    Usage:
    lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
    lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
    lysis.create_if_needed(server, lysis_uuid)
    """
    # Submitted with every document payload.
    award = 'U54HG006998'
    lab = '/labs/barbara-wold'

    def __init__(self, url, document_type, description, aliases=None):
        """Load the document found at url (a local path or a remote url)."""
        self.url = url
        self.filename = os.path.basename(url)
        self.document_type = document_type
        self.description = description

        self.references = []
        self.aliases = aliases if aliases is not None else []
        self.content_type = None
        self.document = None
        self.md5sum = None
        self.urls = None
        self.uuid = None

        self.get_document()

    def get_document(self):
        """Read the document body into self.document.

        A url that exists as a local path must be a pdf and is read from
        disk; anything else is downloaded with requests, in which case the
        url is also recorded in self.urls. A download that is not answered
        with status 200 leaves self.document unset.
        """
        if os.path.exists(self.url):
            # Raise instead of assert: asserts are stripped under -O.
            if not self.url.endswith('pdf'):
                raise ValueError('local documents must be pdf files: %s' % (self.url,))
            # PDFs are binary; text mode would corrupt them or fail to decode.
            with open(self.url, 'rb') as instream:
                self.document = instream.read()
            self.content_type = 'application/pdf'
            self.md5sum = hashlib.md5(self.document)
        else:
            req = requests.get(self.url)
            if req.status_code == 200:
                self.content_type = req.headers['content-type']
                self.document = req.content
                self.md5sum = hashlib.md5(self.document)
                self.urls = [self.url]

    def create_payload(self):
        """Return the dictionary to submit for this document.

        The document body is embedded as a base64 data: url. Raises
        ValueError when no document body has been loaded.
        """
        if self.document is None:
            raise ValueError('no document loaded from %s' % (self.url,))
        # b64encode returns bytes on Python 3; decode before joining to text.
        encoded = base64.b64encode(self.document).decode('ascii')
        document_payload = {
            'attachment': {
                'download': self.filename,
                'type': self.content_type,
                'href': 'data:' + self.content_type + ';base64,' + encoded,
                'md5sum': self.md5sum.hexdigest()
            },
            'document_type': self.document_type,
            'description': self.description,
            'award': self.award,
            'lab': self.lab,
        }
        # Optional properties are only sent when they have a value.
        if self.aliases:
            document_payload['aliases'] = self.aliases
        if self.references:
            document_payload['references'] = self.references
        if self.urls:
            document_payload['urls'] = self.urls

        return document_payload

    def post(self, server):
        """POST this document to the server's /documents/ collection."""
        document_payload = self.create_payload()
        return server.post_json('/documents/', document_payload)

    def save(self, filename):
        """Write the pretty-printed payload to filename."""
        payload = self.create_payload()
        with open(filename, 'w') as outstream:
            outstream.write(pformat(payload))

    def create_if_needed(self, server, uuid):
        """Post the document when it has no uuid yet, else fetch it by uuid."""
        self.uuid = uuid
        if uuid is None:
            return self.post(server)
        return server.get_json(uuid, embed=False)
507 if __name__ == '__main__':
509 from htsworkflow.util.rdfhelp import get_model, dump_model
510 from htsworkflow.util.rdfjsonld import load_into_model
511 from pprint import pprint
513 logging.basicConfig(level=logging.DEBUG)
514 encoded = ENCODED('test.encodedcc.org')
516 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
518 load_into_model(model, body)