1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
15 from urlparse import urljoin, urlparse, urlunparse
17 LOGGER = logging.getLogger(__name__)
20 # The None context will get added to the root of the tree and will
21 # provide common defaults.
23 # terms in multiple encoded objects
24 'award': {'@type': '@id'},
25 'dataset': {'@type': '@id'},
26 'description': 'rdf:description',
27 'documents': {'@type': '@id'},
28 'experiment': {'@type': '@id'},
29 'href': {'@type': '@id'},
30 'lab': {'@type': '@id'},
31 'library': {'@type': '@id'},
32 'pi': {'@type': '@id'},
33 'platform': {'@type': '@id'},
34 'replicates': {'@type': '@id'},
35 'submitted_by': {'@type': '@id'},
36 'url': {'@type': '@id'},
38 # Identify and markup contained classes.
39 # e.g. in the tree there was a sub-dictionary named 'biosample'
40 # That dictionary had a term 'biosample_term_id', which is the
41 # term that should be used as the @id.
43 'biosample_term_id': {'@type': '@id'},
46 "assay_term_id": {"@type": "@id"},
47 "files": {"@type": "@id"},
48 "original_files": {"@type": "@id"},
50 # I tried to use the JSON-LD mapping capabilities to convert the lab
51 # contact information into a vcard record, but the encoded model
52 # didn't lend itself well to the vcard schema
54 # "address1": "vcard:street-address",
55 # "address2": "vcard:street-address",
56 # "city": "vcard:locality",
57 # "state": "vcard:region",
58 # "country": "vcard:country"
61 'nucleic_acid_term_id': {'@type': '@id'}
# FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened url
    # syntax (instead of http://www.w3.org/2000/01/rdf-schema#label you can
    # write rdfs:label).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # The scheme must be spelled "http": a misspelled scheme makes every
    # dc: term expand to an IRI that matches nothing.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
}
88 ENCODED_SCHEMA_ROOT = '/profiles/'
92 '''Programmatic access to encoded, the software powering ENCODE3's submit site.
94 def __init__(self, server, contexts=None, namespaces=None):
99 self.contexts = contexts if contexts else ENCODED_CONTEXT
100 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
101 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
105 return (self.username, self.password)
106 auth = property(get_auth)
108 def load_netrc(self):
110 session = netrc.netrc()
111 authenticators = session.authenticators(self.server)
113 self.username = authenticators[0]
114 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Add contexts to various objects in the tree.

    tree is a json tree returned from the DCC's encoded database.
    default_base is the base url that relative urls in the tree
    will be resolved against.

    tree is modified in place: add_jsonld_child_context annotates the
    objects it finds, then the namespace shortcuts are merged into the
    top-most '@context'.
    """
    self.add_jsonld_child_context(tree, default_base)
    # The child pass is not guaranteed to have given the root an
    # '@context'; create an empty one in that case so the namespaces
    # always have somewhere to go instead of raising KeyError.
    self.add_jsonld_namespaces(tree.setdefault('@context', {}))
128 def add_jsonld_child_context(self, obj, default_base):
129 '''Add JSON-LD context to the encoded JSON.
131 This is recursive because some of the IDs were relative URLs
132 and I needed a way to properly compute the correct base URL.
134 # pretend strings aren't iterable
135 if type(obj) in types.StringTypes:
138 # recurse on container types
139 if isinstance(obj, collections.Sequence):
140 # how should I update lists?
142 self.add_jsonld_child_context(v, default_base)
145 if isinstance(obj, collections.Mapping):
146 for v in obj.values():
147 self.add_jsonld_child_context(v, default_base)
149 # we have an object. attach a context to it.
150 if self._is_encoded_object(obj):
151 context = self.create_jsonld_context(obj, default_base)
153 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    """Merge the namespace prefix shortcuts into a JSON-LD context.

    context is updated in place with every entry of self.namespaces.
    This only needs to be run on the top-most context of a tree.
    """
    shortcuts = self.namespaces
    context.update(shortcuts)
def create_jsonld_context(self, obj, default_base):
    '''Synthesize the context for an encoded type

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    obj must carry '@id' and '@type'.  The returned dictionary holds an
    '@base' (obj's '@id' resolved against default_base), an '@vocab'
    (the schema url for the object's type), the defaults, and then the
    attributes of every type listed in obj['@type'] that has an entry
    in self.contexts; later entries override earlier ones.
    '''
    obj_type = self.get_object_type(obj)
    context = {'@base': urljoin(default_base, obj['@id']),
               '@vocab': self.get_schema_url(obj_type)}
    # Defaults are optional: a caller-supplied contexts dictionary may
    # have no None entry, and that should not be a KeyError.
    context.update(self.contexts.get(None, {}))
    for type_name in obj['@type']:
        if type_name in self.contexts:
            context.update(self.contexts[type_name])
    return context
def get_json(self, obj_id, **kwargs):
    '''GET an ENCODE object as JSON and return as dict

    Uses prepare_url to allow url short-cuts.
    If no keyword arguments are specified it will default to adding
    limit=all.  Alternative keyword arguments can be passed in and
    will be sent to the host as query parameters.

    Known keywords include:
      limit - (integer or 'all') how many records to return, all for all of them
      embed - (bool) if true expands linking ids into their associated object.
      format - text/html or application/json

    Any status other than 200 is logged as an error and handed to
    raise_for_status(); otherwise the decoded JSON body is returned.
    '''
    if not kwargs:
        kwargs['limit'] = 'all'

    url = self.prepare_url(obj_id)
    LOGGER.info('requesting url: %s', url)

    # Never write the password itself to the log; record only whether
    # one has been configured.
    LOGGER.debug('username: %s, password set: %s',
                 self.username, self.password is not None)
    response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
    if response.status_code != requests.codes.ok:
        LOGGER.error("Error http status: %s", response.status_code)
        response.raise_for_status()
    return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Get ENCODE object as JSON-LD annotated with class contexts

    see get_json for documentation about what keywords can be passed.

    The object's own url (obj_id run through prepare_url) is used as
    the default base for the contexts that are added.
    '''
    url = self.prepare_url(obj_id)
    # Named 'tree' rather than 'json' so the json module is not shadowed.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
215 def get_object_type(self, obj):
216 """Return type for an encoded object
218 obj_type = obj.get('@type')
220 raise ValueError('None type')
221 if type(obj_type) in types.StringTypes:
222 raise ValueError('@type should be a list, not a string')
223 if not isinstance(obj_type, collections.Sequence):
224 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Return the url of the JSON schema for object_type.

    The schema lives under ENCODED_SCHEMA_ROOT as '<object_type>.json';
    the path is run through prepare_url and a trailing '#' is appended.
    """
    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    absolute = self.prepare_url(schema_path)
    return absolute + '#'
230 def _is_encoded_object(self, obj):
231 '''Test to see if an object is a JSON-LD object
233 Some of the nested dictionaries lack the @id or @type
234 information necessary to convert them.
236 if not isinstance(obj, collections.Iterable):
239 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to the server as an HTTP PATCH request.

    obj_id is resolved with prepare_url and changes is serialized with
    json.dumps to form the request body.  Any status other than 200 is
    logged together with the response text and handed to
    raise_for_status().  Returns the decoded JSON body of the reply.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    body = json.dumps(changes)
    reply = requests.patch(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=body)
    succeeded = reply.status_code == requests.codes.ok
    if not succeeded:
        LOGGER.error("Error http status: {}".format(reply.status_code))
        LOGGER.error("Response: %s", reply.text)
        reply.raise_for_status()
    return reply.json()
def put_json(self, obj_id, new_object):
    """Send new_object to the url of obj_id with an HTTP PUT request.

    obj_id is resolved with prepare_url and new_object is serialized
    with json.dumps to form the request body.  Returns the decoded JSON
    body of the reply.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    payload = json.dumps(new_object)
    response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
    # A PUT that replaces an existing resource is answered with 200 OK;
    # 201 Created is only used when the PUT created it.  Both mean
    # success, so neither should be reported as an error.
    if response.status_code not in (requests.codes.ok, requests.codes.created):
        LOGGER.error("Error http status: %s", response.status_code)
        response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Send new_object to the url of collection_id with an HTTP POST request.

    collection_id is resolved with prepare_url and new_object is
    serialized with json.dumps to form the request body.  Any status
    other than 201 is logged as an error and handed to
    raise_for_status().  Returns the decoded JSON body of the reply.
    """
    target = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', target)
    body = json.dumps(new_object)

    reply = requests.post(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=body)
    was_created = reply.status_code == requests.codes.created
    if not was_created:
        LOGGER.error("Error http status: {}".format(reply.status_code))
        reply.raise_for_status()
    return reply.json()
def prepare_url(self, request_url):
    '''This attempts to provide some convenience for accessing a URL

    Given a url fragment it will default to:

    * the scheme stored in self.scheme
    * requests to self.server

    This allows fairly flexible urls. e.g. with self.server set to
    'submit.encodedcc.org'

    prepare_url('/experiments/ENCSR000AEG')
    prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')

    both return the same url, while a complete url such as

    prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    is returned as given, including its scheme and query string.
    '''
    # clean up potentially messy urls
    parts = urlparse(request_url)
    scheme, netloc, path = parts.scheme, parts.netloc, parts.path
    if not scheme and not netloc:
        # Without a scheme urlparse files a leading host name under the
        # path.  Strip it when it is our own server, otherwise the
        # bare-host form would come out as '/<server>/<path>'.
        host, slash, rest = path.partition('/')
        if host == self.server:
            path = slash + rest
    return urlunparse(parts._replace(
        scheme=scheme or self.scheme,
        netloc=netloc or self.server,
        path=path))
def search_jsonld(self, term, **kwargs):
    '''Send a search request to ENCODED and return the annotated result

    term is sent as the searchTerm query parameter; any other keyword
    arguments are passed through to get_json.  The result is run
    through convert_search_to_jsonld before it is returned.
    '''
    search_url = self.prepare_url('/search/')
    hits = self.get_json(search_url, searchTerm=term, **kwargs)
    self.convert_search_to_jsonld(hits)
    return hits
def convert_search_to_jsonld(self, result):
    '''Add the context to a search result

    Hard to handle nested attributes are removed first: any key that
    contains a '.' (e.g. object.term when we have no id) is dropped
    from every entry of result['@graph'].  result is modified in place
    and also returned.
    '''
    entries = result['@graph']
    # suppress nested attributes, replacing each entry in the same list
    entries[:] = [
        {key: value for key, value in entry.items() if '.' not in key}
        for entry in entries
    ]

    base_url = self.prepare_url(result['@id'])
    self.add_jsonld_context(result, base_url)
    return result
323 def validate(self, obj, object_type=None):
324 object_type = object_type if object_type else self.get_object_type(obj)
325 schema_url = self.get_schema_url(object_type)
327 raise ValueError("Unable to construct schema url")
329 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
333 if '@type' in hidden:
335 jsonschema.validate(hidden, schema)
337 class TypedColumnParser(object):
339 def parse_sheet_array_type(value):
340 """Helper function to parse :array columns in sheet
342 return value.split(', ')
345 def parse_sheet_integer_type(value):
346 """Helper function to parse :integer columns in sheet
351 def parse_sheet_boolean_type(value):
352 """Helper function to parse :boolean columns in sheet
357 def parse_sheet_timestamp_type(value):
358 """Helper function to parse :date columns in sheet
360 return value.strftime('%Y-%m-%d')
363 def parse_sheet_string_type(value):
364 """Helper function to parse :string columns in sheet (the default)
366 return unicode(value)
368 def __getitem__(self, name):
370 'array': self.parse_sheet_array_type,
371 'boolean': self.parse_sheet_boolean_type,
372 'integer': self.parse_sheet_integer_type,
373 'date': self.parse_sheet_timestamp_type,
374 'string': self.parse_sheet_string_type
379 raise RuntimeError("unrecognized column type")
381 def __call__(self, header, value):
382 header = header.split(':')
383 column_type = 'string'
385 if header[1] == 'skip':
388 column_type = header[1]
389 return header[0], self[column_type](value)
391 typed_column_parser = TypedColumnParser()
393 class Document(object):
394 """Helper class for registering documents
397 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
398 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
399 lysis.create_if_needed(server, lysis_uuid)
401 award = 'U54HG006998'
402 lab = '/labs/barbara-wold'
404 def __init__(self, url, document_type, description, aliases=None):
406 self.filename = os.path.basename(url)
407 self.document_type = document_type
408 self.description = description
411 self.aliases = aliases if aliases is not None else []
412 self.content_type = None
420 def get_document(self):
421 if os.path.exists(self.url):
422 with open(self.url, 'r') as instream:
423 assert self.url.endswith('pdf')
424 self.content_type = 'application/pdf'
425 self.document = instream.read()
426 self.md5sum = hashlib.md5(self.document)
428 req = requests.get(self.url)
429 if req.status_code == 200:
430 self.content_type = req.headers['content-type']
431 self.document = req.content
432 self.md5sum = hashlib.md5(self.document)
433 self.urls = [self.url]
435 def create_payload(self):
438 'download': self.filename,
439 'type': self.content_type,
440 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
441 'md5sum': self.md5sum.hexdigest()
443 'document_type': self.document_type,
444 'description': self.description,
449 document_payload['aliases'] = self.aliases
451 document_payload['references'] = self.references
453 document_payload['urls'] = self.urls
455 return document_payload
457 def post(self, server):
458 document_payload = self.create_payload()
459 return server.post_json('/documents/', document_payload)
461 def save(self, filename):
462 payload = self.create_payload()
463 with open(filename, 'w') as outstream:
464 outstream.write(pformat(payload))
466 def create_if_needed(self, server, uuid):
469 return self.post(server)
471 return server.get_json(uuid, embed=False)
473 if __name__ == '__main__':
475 from htsworkflow.util.rdfhelp import get_model, dump_model
476 from htsworkflow.util.rdfjsonld import load_into_model
477 from pprint import pprint
479 logging.basicConfig(level=logging.DEBUG)
480 encoded = ENCODED('test.encodedcc.org')
482 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
484 load_into_model(model, body)