1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
15 from urlparse import urljoin, urlparse, urlunparse
17 LOGGER = logging.getLogger(__name__)
20 # The None context will get added to the root of the tree and will
21 # provide common defaults.
23 # terms in multiple encoded objects
24 'award': {'@type': '@id'},
25 'dataset': {'@type': '@id'},
26 'description': 'rdf:description',
27 'documents': {'@type': '@id'},
28 'experiment': {'@type': '@id'},
29 'href': {'@type': '@id'},
30 'lab': {'@type': '@id'},
31 'library': {'@type': '@id'},
32 'pi': {'@type': '@id'},
33 'platform': {'@type': '@id'},
34 'replicates': {'@type': '@id'},
35 'submitted_by': {'@type': '@id'},
36 'url': {'@type': '@id'},
38 # Identify and markup contained classes.
39 # e.g. in the tree there was a sub-dictionary named 'biosample'
40 # That dictionary had a term 'biosample_term_id, which is the
41 # term that should be used as the @id.
43 'biosample_term_id': {'@type': '@id'},
46 "assay_term_id": {"@type": "@id"},
47 "files": {"@type": "@id"},
48 "original_files": {"@type": "@id"},
50 # I tried to use the JSON-LD mapping capabilities to convert the lab
51 # contact information into a vcard record, but the encoded model
52 # didn't lend itself well to the vcard schema
54 # "address1": "vcard:street-address",
55 # "address2": "vcard:street-address",
56 # "city": "vcard:locality",
57 # "state": "vcard:region",
58 # "country": "vcard:country"
61 'nucleic_acid_term_id': {'@type': '@id'}
65 #FIXME: this needs to be initialized from rdfns
66 ENCODED_NAMESPACES = {
67 # JSON-LD lets you define namespaces so you can use the shortened URL syntax.
68 # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
70 "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
71 "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
72 "owl": "http://www.w3.org/2002/07/owl#",
73 "dc": "htp://purl.org/dc/elements/1.1/",
74 "xsd": "http://www.w3.org/2001/XMLSchema#",
75 "vcard": "http://www.w3.org/2006/vcard/ns#",
77 # for some namespaces I made a best guess for the ontology root.
78 "EFO": "http://www.ebi.ac.uk/efo/", # EFO ontology
79 "OBO": "http://purl.obolibrary.org/obo/", # OBO ontology
80 "OBI": "http://purl.obolibrary.org/obo/OBI_", # Ontology for Biomedical Investigations
81 # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
82 "SO": "http://purl.obolibrary.org/obo/SO_", # Sequence ontology
83 # SO: available from http://www.berkeleybop.org/ontologies/so.owl
84 # NTR: New Term Request space for DCC to implement new ontology terms
88 ENCODED_SCHEMA_ROOT = '/profiles/'
92 '''Programmatic access to encoded, the software powering ENCODE3's submit site.
94 def __init__(self, server, contexts=None, namespaces=None):
99 self.contexts = contexts if contexts else ENCODED_CONTEXT
100 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
101 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
105 return (self.username, self.password)
106 auth = property(get_auth)
108 def load_netrc(self):
110 session = netrc.netrc()
111 authenticators = session.authenticators(self.server)
113 self.username = authenticators[0]
114 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate an encoded JSON tree with JSON-LD contexts.

    tree is a json tree returned from the DCC's encoded database;
    it is modified in place.
    default_base is handed to add_jsonld_child_context as the base
    url that relative urls will be resolved against.

    Per-object contexts are attached first, then the namespace
    shortcuts are merged into the '@context' of the root object.
    """
    self.add_jsonld_child_context(tree, default_base)
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
128 def add_jsonld_child_context(self, obj, default_base):
129 '''Add JSON-LD context to the encoded JSON.
131 This is recursive because some of the IDs were relative URLs
132 and I needed a way to properly compute the correct base URL.
134 # pretend strings aren't iterable
135 if type(obj) in types.StringTypes:
138 # recurse on container types
139 if isinstance(obj, collections.Sequence):
140 # how should I update lists?
142 self.add_jsonld_child_context(v, default_base)
145 if isinstance(obj, collections.Mapping):
146 for v in obj.values():
147 self.add_jsonld_child_context(v, default_base)
149 # we have an object. attach a context to it.
150 if self._is_encoded_object(obj):
151 context = self.create_jsonld_context(obj, default_base)
153 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    """Merge the namespace prefix shortcuts into a context.

    context is updated in place with self.namespaces.
    This only needs to be run on the top-most context.
    """
    shortcuts = self.namespaces
    context.update(shortcuts)
def create_jsonld_context(self, obj, default_base):
    """Synthesize the JSON-LD context for an encoded object.

    obj is an encoded object carrying '@id' and '@type'.
    default_base is the url that obj's (possibly relative) '@id' is
    resolved against to produce '@base'.

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    Returns the context dictionary for obj.
    """
    # get_schema_url builds its url from a type name, so resolve the
    # object's type first, the same way validate() does.
    object_type = self.get_object_type(obj)
    context = {'@base': urljoin(default_base, obj['@id']),
               '@vocab': self.get_schema_url(object_type)}
    # caller supplied contexts may not define the None defaults
    context.update(self.contexts.get(None, {}))
    for t in obj['@type']:
        if t in self.contexts:
            context.update(self.contexts[t])
    return context
def get_json(self, obj_id, **kwargs):
    """GET an ENCODE object as JSON and return it as a dict.

    Uses prepare_url to allow url short-cuts.
    If no keyword arguments are specified it will default to adding
    limit=all. Alternative keyword arguments can be passed in and
    will be sent to the host as request parameters.

    Keywords described for the host:
      limit - (integer or 'all') how many records to return, all for all of them
      embed - (bool) if true expands linking ids into their associated object.
      format - text/html or application/json

    A non-ok status is logged, and raise_for_status() raises for
    error responses.
    """
    if not kwargs:
        kwargs['limit'] = 'all'

    url = self.prepare_url(obj_id)
    LOGGER.info('requesting url: %s', url)

    # never write the password itself to the log
    LOGGER.debug('username: %s', self.username)
    response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
    if response.status_code != requests.codes.ok:
        LOGGER.error("Error http status: %s", response.status_code)
        response.raise_for_status()
    return response.json()
def get_jsonld(self, obj_id, **kwargs):
    """Get an ENCODE object as JSON-LD annotated with class contexts.

    See get_json for documentation about what keywords can be passed.

    Returns the retrieved tree after add_jsonld_context has been
    applied to it.
    """
    url = self.prepare_url(obj_id)
    # named 'tree' rather than 'json' so the json module is not shadowed
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
214 def get_object_type(self, obj):
215 """Return type for an encoded object
217 obj_type = obj.get('@type')
219 raise ValueError('None type')
220 if type(obj_type) in types.StringTypes:
221 raise ValueError('@type should be a list, not a string')
222 if not isinstance(obj_type, collections.Sequence):
223 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Return the url of the schema profile for object_type.

    The profile path is expanded with prepare_url and a trailing '#'
    is appended.
    """
    profile_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    schema_url = self.prepare_url(profile_path)
    return schema_url + '#'
229 def _is_encoded_object(self, obj):
230 '''Test to see if an object is a JSON-LD object
232 Some of the nested dictionaries lack the @id or @type
233 information necessary to convert them.
235 if not isinstance(obj, collections.Iterable):
238 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to an object as an HTTP PATCH request.

    obj_id is expanded with prepare_url and changes is serialized as
    the JSON request body. Returns the decoded JSON reply.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    body = json.dumps(changes)
    reply = requests.patch(target, auth=self.auth, headers=self.json_headers, data=body)
    if reply.status_code == requests.codes.ok:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    LOGGER.error("Response: %s", reply.text)
    # raises for error responses; any other non-ok reply falls through
    reply.raise_for_status()
    return reply.json()
def put_json(self, obj_id, new_object):
    """Send new_object to obj_id as an HTTP PUT request.

    obj_id is expanded with prepare_url and new_object is serialized
    as the JSON request body. Any status other than 'created' is
    logged as an error. Returns the decoded JSON reply.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', target)
    body = json.dumps(new_object)
    reply = requests.put(target, auth=self.auth, headers=self.json_headers, data=body)
    if reply.status_code == requests.codes.created:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    # raises for error responses; any other reply falls through
    reply.raise_for_status()
    return reply.json()
def post_json(self, collection_id, new_object):
    """Send new_object to a collection as an HTTP POST request.

    collection_id is expanded with prepare_url and new_object is
    serialized as the JSON request body. Any status other than
    'created' is logged as an error. Returns the decoded JSON reply.
    """
    target = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', target)
    body = json.dumps(new_object)

    reply = requests.post(target, auth=self.auth, headers=self.json_headers, data=body)
    if reply.status_code == requests.codes.created:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    # raises for error responses; any other reply falls through
    reply.raise_for_status()
    return reply.json()
def prepare_url(self, request_url):
    """Expand a possibly partial url into a full url.

    Components missing from request_url are filled in:

      * the scheme defaults to self.scheme
      * the host defaults to self.server

    e.g. when self.scheme is 'http' and self.server is
    'submit.encodedcc.org' these return the same url:

      prepare_url('/experiments/ENCSR000AEG')
      prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG')

    A host written without a scheme ('submit.encodedcc.org/experiments/...')
    is parsed by urlparse as a path, so it ends up as a path on
    self.server rather than as a host.

    Returns the full url as a string.
    """
    parts = urlparse(request_url)
    if not parts.scheme:
        parts = parts._replace(scheme=self.scheme)
    if not parts.netloc:
        parts = parts._replace(netloc=self.server)
    return urlunparse(parts)
def search_jsonld(self, term, **kwargs):
    """Send a search request to ENCODED.

    term is sent as the searchTerm parameter; any other keyword
    arguments are passed through to get_json.

    Returns the search result after convert_search_to_jsonld has
    been applied to it.
    """
    url = self.prepare_url('/search/')
    result = self.get_json(url, searchTerm=term, **kwargs)
    self.convert_search_to_jsonld(result)
    return result
def convert_search_to_jsonld(self, result):
    """Add the JSON-LD context to a search result.

    Also removes hard to handle nested attributes,
    e.g. removes 'object.term' style keys when we have no id.

    result is modified in place and is also returned.
    """
    graph = result['@graph']
    # suppress nested (dotted) attributes; slice assignment updates the
    # list that result['@graph'] refers to rather than a copy
    graph[:] = [
        {k: v for k, v in obj.items() if '.' not in k}
        for obj in graph
    ]

    self.add_jsonld_context(result, self.prepare_url(result['@id']))
    return result
322 def validate(self, obj, object_type=None):
323 object_type = object_type if object_type else self.get_object_type(obj)
324 schema_url = self.get_schema_url(object_type)
326 raise ValueError("Unable to construct schema url")
328 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
332 if '@type' in hidden:
334 jsonschema.validate(hidden, schema)
336 class TypedColumnParser(object):
338 def parse_sheet_array_type(value):
339 """Helper function to parse :array columns in sheet
341 return value.split(', ')
344 def parse_sheet_integer_type(value):
345 """Helper function to parse :integer columns in sheet
350 def parse_sheet_boolean_type(value):
351 """Helper function to parse :boolean columns in sheet
356 def parse_sheet_timestamp_type(value):
357 """Helper function to parse :date columns in sheet
359 return value.strftime('%Y-%m-%d')
362 def parse_sheet_string_type(value):
363 """Helper function to parse :string columns in sheet (the default)
365 return unicode(value)
367 def __getitem__(self, name):
369 'array': self.parse_sheet_array_type,
370 'boolean': self.parse_sheet_boolean_type,
371 'integer': self.parse_sheet_integer_type,
372 'date': self.parse_sheet_timestamp_type,
373 'string': self.parse_sheet_string_type
378 raise RuntimeError("unrecognized column type")
380 def __call__(self, header, value):
381 header = header.split(':')
382 column_type = 'string'
384 if header[1] == 'skip':
387 column_type = header[1]
388 return header[0], self[column_type](value)
390 typed_column_parser = TypedColumnParser()
392 class Document(object):
393 """Helper class for registering documents
396 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
397 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
398 lysis.create_if_needed(server, lysis_uuid)
400 award = 'U54HG006998'
401 lab = '/labs/barbara-wold'
403 def __init__(self, url, document_type, description, aliases=None):
405 self.filename = os.path.basename(url)
406 self.document_type = document_type
407 self.description = description
410 self.aliases = aliases if aliases is not None else []
411 self.content_type = None
419 def get_document(self):
420 if os.path.exists(self.url):
421 with open(self.url, 'r') as instream:
422 assert self.url.endswith('pdf')
423 self.content_type = 'application/pdf'
424 self.document = instream.read()
425 self.md5sum = hashlib.md5(self.document)
427 req = requests.get(self.url)
428 if req.status_code == 200:
429 self.content_type = req.headers['content-type']
430 self.document = req.content
431 self.md5sum = hashlib.md5(self.document)
432 self.urls = [self.url]
434 def create_payload(self):
437 'download': self.filename,
438 'type': self.content_type,
439 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
440 'md5sum': self.md5sum.hexdigest()
442 'document_type': self.document_type,
443 'description': self.description,
448 document_payload['aliases'] = self.aliases
450 document_payload['references'] = self.references
452 document_payload['urls'] = self.urls
454 return document_payload
def post(self, server):
    """Submit this document to the '/documents/' collection on server.

    Returns whatever server.post_json returns.
    """
    payload = self.create_payload()
    return server.post_json('/documents/', payload)
def save(self, filename):
    """Write the pretty-printed document payload to filename."""
    document_payload = self.create_payload()
    with open(filename, 'w') as outstream:
        text = pformat(document_payload)
        outstream.write(text)
465 def create_if_needed(self, server, uuid):
468 return self.post(server)
470 return server.get_json(uuid, embed=False)
472 if __name__ == '__main__':
474 from htsworkflow.util.rdfhelp import get_model, dump_model
475 from htsworkflow.util.rdfjsonld import load_into_model
476 from pprint import pprint
478 logging.basicConfig(level=logging.DEBUG)
479 encoded = ENCODED('test.encodedcc.org')
481 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
483 load_into_model(model, body)