1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
7 from __future__ import print_function
14 from urlparse import urljoin, urlparse, urlunparse
16 LOGGER = logging.getLogger(__name__)
19 # The None context will get added to the root of the tree and will
20 # provide common defaults.
22 # terms in multiple encoded objects
23 'award': {'@type': '@id'},
24 'dataset': {'@type': '@id'},
25 'description': 'rdf:description',
26 'documents': {'@type': '@id'},
27 'experiment': {'@type': '@id'},
28 'href': {'@type': '@id'},
29 'lab': {'@type': '@id'},
30 'library': {'@type': '@id'},
31 'pi': {'@type': '@id'},
32 'platform': {'@type': '@id'},
33 'replicates': {'@type': '@id'},
34 'submitted_by': {'@type': '@id'},
35 'url': {'@type': '@id'},
37 # Identify and markup contained classes.
38 # e.g. in the tree there was a sub-dictionary named 'biosample'
# That dictionary had a term 'biosample_term_id', which is the
40 # term that should be used as the @id.
42 'biosample_term_id': {'@type': '@id'},
45 "assay_term_id": {"@type": "@id"},
46 "files": {"@type": "@id"},
47 "original_files": {"@type": "@id"},
49 # I tried to use the JSON-LD mapping capabilities to convert the lab
50 # contact information into a vcard record, but the encoded model
51 # didn't lend itself well to the vcard schema
53 # "address1": "vcard:street-address",
54 # "address2": "vcard:street-address",
55 # "city": "vcard:locality",
56 # "state": "vcard:region",
57 # "country": "vcard:country"
60 'nucleic_acid_term_id': {'@type': '@id'}
64 #FIXME: this needs to be initialized from rdfns
65 ENCODED_NAMESPACES = {
# JSON-LD lets you define namespaces so you can use the shortened url syntax.
67 # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
69 "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
70 "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
71 "owl": "http://www.w3.org/2002/07/owl#",
72 "dc": "htp://purl.org/dc/elements/1.1/",
73 "xsd": "http://www.w3.org/2001/XMLSchema#",
74 "vcard": "http://www.w3.org/2006/vcard/ns#",
76 # for some namespaces I made a best guess for the ontology root.
77 "EFO": "http://www.ebi.ac.uk/efo/", # EFO ontology
78 "OBO": "http://purl.obolibrary.org/obo/", # OBO ontology
79 "OBI": "http://purl.obolibrary.org/obo/OBI_", # Ontology for Biomedical Investigations
80 # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
81 "SO": "http://purl.obolibrary.org/obo/SO_", # Sequence ontology
82 # SO: available from http://www.berkeleybop.org/ontologies/so.owl
83 # NTR: New Term Request space for DCC to implement new ontology terms
87 ENCODED_SCHEMA_ROOT = '/profiles/'
'''Programmatic access to encoded, the software powering ENCODE3's submit site.
93 def __init__(self, server, contexts=None, namespaces=None):
98 self.contexts = contexts if contexts else ENCODED_CONTEXT
99 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
100 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
104 return (self.username, self.password)
105 auth = property(get_auth)
107 def load_netrc(self):
109 session = netrc.netrc()
110 authenticators = session.authenticators(self.server)
112 self.username = authenticators[0]
113 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate an encoded JSON tree, in place, with JSON-LD contexts.

    Contexts are first attached to the objects inside the tree by
    add_jsonld_child_context; the namespace shortcuts are then merged
    into the '@context' found at the root of the tree.

    tree is a json tree returned from the DCC's encoded database.
    default_base is handed to add_jsonld_child_context, which uses it
        as the base url that relative ids are resolved against.

    Raises KeyError if the root of the tree has no '@context' once
    the child contexts have been added.
    """
    self.add_jsonld_child_context(tree, default_base)
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
127 def add_jsonld_child_context(self, obj, default_base):
128 '''Add JSON-LD context to the encoded JSON.
130 This is recursive because some of the IDs were relative URLs
and I needed a way to properly compute the correct base URL.
133 # pretend strings aren't iterable
134 if type(obj) in types.StringTypes:
137 # recurse on container types
138 if isinstance(obj, collections.Sequence):
139 # how should I update lists?
141 self.add_jsonld_child_context(v, default_base)
144 if isinstance(obj, collections.Mapping):
145 for v in obj.values():
146 self.add_jsonld_child_context(v, default_base)
148 # we have an object. attach a context to it.
149 if self._is_encoded_object(obj):
150 context = self.create_jsonld_context(obj, default_base)
152 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace shortcuts into a context, in place.

    Every entry of self.namespaces is copied into context; a key that
    is already present in context is overwritten.

    Only needs to be run on the top-most context
    '''
    shortcuts = self.namespaces
    context.update(shortcuts)
161 def create_jsonld_context(self, obj, default_base):
162 '''Synthesize the context for a encoded type
164 self.contexts[None] = default context attributes added to any type
165 self.contexts[type] = context attributes for this type.
167 context = {'@base': urljoin(default_base, obj['@id']),
168 '@vocab': self.get_schema_url(obj)}
170 context.update(self.contexts[None])
171 for t in obj['@type']:
172 if t in self.contexts:
173 context.update(self.contexts[t])
176 def get_json(self, obj_id, **kwargs):
177 '''GET an ENCODE object as JSON and return as dict
179 Uses prepare_url to allow url short-cuts
180 if no keyword arguments are specified it will default to adding limit=all
181 Alternative keyword arguments can be passed in and will be sent to the host.
184 limit - (integer or 'all') how many records to return, all for all of them
185 embed - (bool) if true expands linking ids into their associated object.
186 format - text/html or application/json
189 kwargs['limit'] = 'all'
191 url = self.prepare_url(obj_id)
192 LOGGER.info('requesting url: {}'.format(url))
196 LOGGER.debug('username: %s, password: %s', self.username, self.password)
197 response = requests.get(url, auth=self.auth, headers=self.json_headers, params=kwargs)
198 if not response.status_code == requests.codes.ok:
199 LOGGER.error("Error http status: {}".format(response.status_code))
200 response.raise_for_status()
201 return response.json()
203 def get_jsonld(self, obj_id, **kwargs):
'''Get ENCODE object as JSONLD annotated with class contexts
206 see get_json for documentation about what keywords can be passed.
208 url = self.prepare_url(obj_id)
209 json = self.get_json(obj_id, **kwargs)
210 self.add_jsonld_context(json, url)
213 def get_object_type(self, obj):
214 """Return type for a encoded object
216 obj_type = obj.get('@type')
218 raise ValueError('None type')
219 if type(obj_type) in types.StringTypes:
220 raise ValueError('@type should be a list, not a string')
221 if not isinstance(obj_type, collections.Sequence):
222 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Return the url of the schema describing object_type.

    The path ENCODED_SCHEMA_ROOT + object_type + '.json' is expanded
    with prepare_url and a trailing '#' is appended to the result.
    """
    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    schema_url = self.prepare_url(schema_path)
    return schema_url + '#'
228 def _is_encoded_object(self, obj):
229 '''Test to see if an object is a JSON-LD object
231 Some of the nested dictionaries lack the @id or @type
232 information necessary to convert them.
234 if not isinstance(obj, collections.Iterable):
237 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to an object as an HTTP PATCH request

    obj_id is expanded with prepare_url and changes is serialized to
    JSON for the request body.  The decoded JSON body of the reply is
    returned.  Any status other than ok is logged, together with the
    text of the reply, before raise_for_status() is called.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    body = json.dumps(changes)
    reply = requests.patch(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=body)
    if reply.status_code == requests.codes.ok:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    LOGGER.error("Response: %s", reply.text)
    reply.raise_for_status()
    # raise_for_status() only raises for 4xx/5xx replies, so any other
    # non-ok status still falls through to decoding the body.
    return reply.json()
def put_json(self, obj_id, new_object):
    """Send new_object to obj_id as an HTTP PUT request

    obj_id is expanded with prepare_url and new_object is serialized
    to JSON for the request body.  The decoded JSON body of the reply
    is returned.  Any status other than created is logged, together
    with the text of the reply (as patch_json does), before
    raise_for_status() is called.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    payload = json.dumps(new_object)
    response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
    # NOTE(review): a 200 reply is logged as an error here but is still
    # returned, because raise_for_status() only raises for 4xx/5xx.
    # Confirm that 201 is what the server sends for a successful PUT.
    if response.status_code != requests.codes.created:
        LOGGER.error("Error http status: %s", response.status_code)
        # The server's explanation is usually in the body; keep it in
        # the log since raise_for_status() does not include it.
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Send new_object to a collection as an HTTP POST request

    collection_id is expanded with prepare_url and new_object is
    serialized to JSON for the request body.  The decoded JSON body of
    the reply is returned.  Any status other than created is logged,
    together with the text of the reply (as patch_json does), before
    raise_for_status() is called.
    """
    url = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', url)
    payload = json.dumps(new_object)

    response = requests.post(url, auth=self.auth, headers=self.json_headers, data=payload)
    if response.status_code != requests.codes.created:
        LOGGER.error("Error http status: %s", response.status_code)
        # The server's explanation is usually in the body; keep it in
        # the log since raise_for_status() does not include it.
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
275 def prepare_url(self, request_url):
'''This attempts to provide some convenience for accessing a URL
278 Given a url fragment it will default to :
280 * requests to self.server
282 This allows fairly flexible urls. e.g.
284 prepare_url('/experiments/ENCSR000AEG')
285 prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
286 prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
288 should all return the same url
290 # clean up potentially messy urls
291 url = urlparse(request_url)._asdict()
292 if not url['scheme']:
293 url['scheme'] = self.scheme
294 if not url['netloc']:
295 url['netloc'] = self.server
296 url = urlunparse(url.values())
299 def search_jsonld(self, term, **kwargs):
300 '''Send search request to ENCODED
302 url = self.prepare_url('/search/')
303 result = self.get_json(url, searchTerm=term, **kwargs)
304 self.convert_search_to_jsonld(result)
307 def convert_search_to_jsonld(self, result):
308 '''Add the context to search result
310 Also remove hard to handle nested attributes
311 e.g. remove object.term when we have no id
313 graph = result['@graph']
314 for i, obj in enumerate(graph):
315 # suppress nested attributes
316 graph[i] = {k: v for k, v in obj.items() if '.' not in k}
318 self.add_jsonld_context(result, self.prepare_url(result['@id']))
321 def validate(self, obj, object_type=None):
322 object_type = object_type if object_type else self.get_object_type(obj)
323 schema_url = self.get_schema_url(object_type)
325 raise ValueError("Unable to construct schema url")
327 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
331 if '@type' in hidden:
333 jsonschema.validate(hidden, schema)
335 class TypedColumnParser(object):
337 def parse_sheet_array_type(value):
338 """Helper function to parse :array columns in sheet
340 return value.split(', ')
343 def parse_sheet_integer_type(value):
344 """Helper function to parse :integer columns in sheet
349 def parse_sheet_boolean_type(value):
350 """Helper function to parse :boolean columns in sheet
355 def parse_sheet_timestamp_type(value):
356 """Helper function to parse :date columns in sheet
358 return value.strftime('%Y-%m-%d')
361 def parse_sheet_string_type(value):
362 """Helper function to parse :string columns in sheet (the default)
364 return unicode(value)
366 def __getitem__(self, name):
368 'array': self.parse_sheet_array_type,
369 'boolean': self.parse_sheet_boolean_type,
370 'integer': self.parse_sheet_integer_type,
371 'date': self.parse_sheet_timestamp_type,
372 'string': self.parse_sheet_string_type
377 raise RuntimeError("unrecognized column type")
379 def __call__(self, header, value):
380 header = header.split(':')
381 column_type = 'string'
383 if header[1] == 'skip':
386 column_type = header[1]
387 return header[0], self[column_type](value)
389 typed_column_parser = TypedColumnParser()
391 class Document(object):
392 """Helper class for registering documents
395 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
396 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
397 lysis.create_if_needed(server, lysis_uuid)
399 award = 'U54HG006998'
400 lab = '/labs/barbara-wold'
402 def __init__(self, url, document_type, description, aliases=None):
404 self.filename = os.path.basename(url)
405 self.document_type = document_type
406 self.description = description
409 self.aliases = aliases if aliases is not None else []
410 self.content_type = None
418 def get_document(self):
419 if os.path.exists(self.url):
420 with open(self.url, 'r') as instream:
421 assert self.url.endswith('pdf')
422 self.content_type = 'application/pdf'
423 self.document = instream.read()
424 self.md5sum = hashlib.md5(self.document)
426 req = requests.get(self.url)
427 if req.status_code == 200:
428 self.content_type = req.headers['content-type']
429 self.document = req.content
430 self.md5sum = hashlib.md5(self.document)
431 self.urls = [self.url]
433 def create_payload(self):
436 'download': self.filename,
437 'type': self.content_type,
438 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
439 'md5sum': self.md5sum.hexdigest()
441 'document_type': self.document_type,
442 'description': self.description,
447 document_payload['aliases'] = self.aliases
449 document_payload['references'] = self.references
451 document_payload['urls'] = self.urls
453 return document_payload
def post(self, server):
    """POST this document's payload to the '/documents/' collection.

    server must provide post_json(collection_id, new_object); whatever
    it returns is returned unchanged.
    """
    payload = self.create_payload()
    return server.post_json('/documents/', payload)
def save(self, filename):
    """Write the pretty-printed document payload to filename.

    The payload comes from create_payload() and is rendered with
    pformat; an existing file at filename is overwritten.
    """
    document_payload = self.create_payload()
    with open(filename, 'w') as destination:
        rendered = pformat(document_payload)
        destination.write(rendered)
464 def create_if_needed(self, server, uuid):
467 return self.post(server)
469 return server.get_json(uuid, embed=False)
471 if __name__ == '__main__':
473 from htsworkflow.util.rdfhelp import get_model, dump_model
474 from htsworkflow.util.rdfjsonld import load_into_model
475 from pprint import pprint
477 logging.basicConfig(level=logging.DEBUG)
478 encoded = ENCODED('test.encodedcc.org')
480 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
482 load_into_model(model, body)