1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
16 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
# Logger named after this module; the request helpers below report the
# URLs they contact and any HTTP error statuses through it.
LOGGER = logging.getLogger(__name__)
21 # The None context will get added to the root of the tree and will
22 # provide common defaults.
24 # terms in multiple encoded objects
25 'award': {'@type': '@id'},
26 'dataset': {'@type': '@id'},
27 'description': 'rdf:description',
28 'documents': {'@type': '@id'},
29 'experiment': {'@type': '@id'},
30 'href': {'@type': '@id'},
31 'lab': {'@type': '@id'},
32 'library': {'@type': '@id'},
33 'pi': {'@type': '@id'},
34 'platform': {'@type': '@id'},
35 'replicates': {'@type': '@id'},
36 'submitted_by': {'@type': '@id'},
37 'url': {'@type': '@id'},
39 # Identify and markup contained classes.
# e.g. in the tree there was a sub-dictionary named 'biosample'.
# That dictionary had a term 'biosample_term_id', which is the
# term that should be used as the @id.
44 'biosample_term_id': {'@type': '@id'},
47 "assay_term_id": {"@type": "@id"},
48 "files": {"@type": "@id"},
49 "original_files": {"@type": "@id"},
51 # I tried to use the JSON-LD mapping capabilities to convert the lab
52 # contact information into a vcard record, but the encoded model
53 # didn't lend itself well to the vcard schema
55 # "address1": "vcard:street-address",
56 # "address2": "vcard:street-address",
57 # "city": "vcard:locality",
58 # "state": "vcard:region",
59 # "country": "vcard:country"
62 'nucleic_acid_term_id': {'@type': '@id'}
66 #FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened url
    # syntax (instead of http://www.w3.org/2000/01/rdf-schema#label you can
    # write rdfs:label).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # Dublin Core elements; the scheme used to be misspelled "htp://",
    # which produced IRIs that matched nothing.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
}
# Server-relative path prefix that get_schema_url puts in front of an
# object type name (followed by '.json') when building a schema URL.
ENCODED_SCHEMA_ROOT = '/profiles/'
'''Programmatic access to encoded, the software powering ENCODE3's submit site.
95 def __init__(self, server, contexts=None, namespaces=None):
100 self.contexts = contexts if contexts else ENCODED_CONTEXT
101 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
102 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
106 return (self.username, self.password)
107 auth = property(get_auth)
109 def load_netrc(self):
111 session = netrc.netrc()
112 authenticators = session.authenticators(self.server)
114 self.username = authenticators[0]
115 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate a JSON tree with JSON-LD contexts, modifying it in place.

    The per-object contexts are attached first (add_jsonld_child_context),
    after which the namespace shortcuts are merged into the tree's
    top-level '@context' (add_jsonld_namespaces).

    tree is a json tree returned from the DCC's encoded database.
    default_base is the base url handed to add_jsonld_child_context;
      relative urls are resolved against it.
    """
    self.add_jsonld_child_context(tree, default_base)
    # Looked up only after the child pass has run over the tree.
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
129 def add_jsonld_child_context(self, obj, default_base):
130 '''Add JSON-LD context to the encoded JSON.
132 This is recursive because some of the IDs were relative URLs
and I needed a way to properly compute the correct base URL.
135 # pretend strings aren't iterable
136 if isinstance(obj, six.string_types):
139 # recurse on container types
140 if isinstance(obj, collections.Sequence):
141 # how should I update lists?
143 self.add_jsonld_child_context(v, default_base)
146 if isinstance(obj, collections.Mapping):
147 for v in obj.values():
148 self.add_jsonld_child_context(v, default_base)
150 # we have an object. attach a context to it.
151 if self._is_encoded_object(obj):
152 context = self.create_jsonld_context(obj, default_base)
154 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace prefix shortcuts into a context, in place.

    Every entry of self.namespaces is copied into context, replacing
    entries that share a key. Only needs to be run on the top-most
    context.
    '''
    prefixes = self.namespaces
    context.update(prefixes)
def create_jsonld_context(self, obj, default_base):
    '''Synthesize the context for an encoded object and return it.

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    obj must carry '@id' and '@type'; '@id' is resolved against
    default_base to form the '@base' entry, and the schema url of the
    object's type becomes '@vocab'.

    Returns a new dictionary; obj itself is not modified.
    '''
    obj_type = self.get_object_type(obj)
    context = {
        '@base': urljoin(default_base, obj['@id']),
        '@vocab': self.get_schema_url(obj_type),
    }
    # Defaults go in first so that type specific entries can override them.
    # A contexts mapping without a None entry simply has no defaults.
    context.update(self.contexts.get(None, {}))
    for type_name in obj['@type']:
        if type_name in self.contexts:
            context.update(self.contexts[type_name])
    # Callers merge the returned mapping into the object's '@context'.
    return context
179 def get_json(self, obj_id, **kwargs):
180 '''GET an ENCODE object as JSON and return as dict
182 Uses prepare_url to allow url short-cuts
183 if no keyword arguments are specified it will default to adding limit=all
184 Alternative keyword arguments can be passed in and will be sent to the host.
187 limit - (integer or 'all') how many records to return, all for all of them
188 embed - (bool) if true expands linking ids into their associated object.
189 format - text/html or application/json
192 kwargs['limit'] = 'all'
194 url = self.prepare_url(obj_id)
195 LOGGER.info('requesting url: {}'.format(url))
199 LOGGER.debug('username: %s, password: %s', self.username, self.password)
201 if self.username and self.password:
202 arguments['auth'] = self.auth
203 response = requests.get(url, headers=self.json_headers,
206 if not response.status_code == requests.codes.ok:
207 LOGGER.error("Error http status: {}".format(response.status_code))
208 response.raise_for_status()
209 return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Get ENCODE object as JSON-LD annotated with class contexts

    see get_json for documentation about what keywords can be passed.

    Returns the tree fetched by get_json after add_jsonld_context has
    annotated it, using the prepared url of obj_id as the base url.
    '''
    url = self.prepare_url(obj_id)
    # Not named "json": that would hide the json module inside this method.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
221 def get_object_type(self, obj):
222 """Return type for a encoded object
224 obj_type = obj.get('@type')
226 raise ValueError('None type')
227 if isinstance(obj_type, six.string_types):
228 raise ValueError('@type should be a list, not a string')
229 if not isinstance(obj_type, collections.Sequence):
230 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Create the ENCODED jsonschema url.

    The schema can be named either by the object type or by the
    collection one posts to; known collection names are translated to
    their type name and anything else is used unchanged.

    For example
      server.get_schema_url('experiment') and
      server.get_schema_url('/experiments/') both resolve to
      SERVER/profiles/experiment.json#

    Args:
      object_type (str): either ENCODED object name or collection

    Returns:
      The result of prepare_url for the schema path with '#' appended.
    """
    type_for_collection = {
        '/biosamples/': 'biosample',
        '/datasets/': 'dataset',
        '/documents/': 'document',
        '/experiments/': 'experiment',
        '/libraries/': 'library',
        '/replicates/': 'replicate',
    }
    type_name = type_for_collection.get(object_type, object_type)
    schema_path = ENCODED_SCHEMA_ROOT + type_name + '.json'
    return self.prepare_url(schema_path) + '#'
def get_accession_name(self, collection):
    """Lookup common object accession name given a collection name.

    Args:
      collection (str): collection name, e.g. '/experiments/'

    Returns:
      Name of the attribute holding the accession for that collection.

    Raises:
      RuntimeError: if the collection is not in the lookup table.
    """
    collection_to_accession_name = {
        '/experiments/': 'experiment_accession',
        '/biosamples/': 'biosample_accession',
        '/libraries/': 'library_accession',
        '/replicates/': 'uuid',
    }

    accession_name = collection_to_accession_name.get(collection)
    if accession_name is None:
        # Exceptions do not interpolate logging-style arguments, so the
        # message has to be formatted before it is raised.
        raise RuntimeError(
            "Update list of collection to accession names for %s" % (collection,))

    return accession_name
279 def _is_encoded_object(self, obj):
280 '''Test to see if an object is a JSON-LD object
282 Some of the nested dictionaries lack the @id or @type
283 information necessary to convert them.
285 if not isinstance(obj, collections.Iterable):
288 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Push a dictionary of changes to the server as an HTTP PATCH request.

    obj_id is expanded with prepare_url and changes is serialized to
    JSON for the request body. A status other than 200 OK is logged
    together with the response text, and raise_for_status is given the
    chance to raise. Returns the decoded JSON body of the response.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', url)
    body = json.dumps(changes)
    response = requests.patch(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    if response.status_code == requests.codes.ok:
        return response.json()

    LOGGER.error("Error http status: {}".format(response.status_code))
    LOGGER.error("Response: %s", response.text)
    response.raise_for_status()
    return response.json()
def put_json(self, obj_id, new_object):
    """Send new_object to the server as an HTTP PUT request.

    obj_id is expanded with prepare_url and new_object is serialized to
    JSON for the request body. A status other than 201 Created is
    logged and raise_for_status is given the chance to raise. Returns
    the decoded JSON body of the response.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    body = json.dumps(new_object)
    response = requests.put(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    if response.status_code == requests.codes.created:
        return response.json()

    LOGGER.error("Error http status: {}".format(response.status_code))
    response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Send new_object to a collection as an HTTP POST request.

    collection_id is expanded with prepare_url and new_object is
    serialized to JSON for the request body. A status other than
    201 Created is logged along with the response content, and
    raise_for_status is given the chance to raise. Returns the decoded
    JSON body of the response.
    """
    url = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', url)
    body = json.dumps(new_object)

    response = requests.post(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    if response.status_code == requests.codes.created:
        return response.json()

    LOGGER.error("http status: {}".format(response.status_code))
    LOGGER.error("message: {}".format(response.content))
    response.raise_for_status()
    return response.json()
327 def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
328 """Create new ENCODED objects using metadata encoded in pandas DataFrame
330 The DataFrame column names need to encode the attribute names,
331 and in some cases also include some additional type information.
332 (see TypedColumnParser)
335 collection (str): name of collection to create new objects in
336 sheet (pandas.DataFrame): DataFrame with objects to create,
337 assuming the appropriate accession number is empty.
338 additional the accession number and uuid is updated if the object
340 dry_run (bool): whether or not to skip the code to post the objects
341 verbose (bool): print the http responses.
344 list of created objects.
347 jsonschema.ValidationError if the object doesn't validate against
348 the encoded jsonschema.
350 accession_name = self.get_accession_name(collection)
353 columns = sheet.columns
354 tosubmit = sheet[pandas.isnull(sheet[accession_name])]
356 for i in tosubmit.index:
360 if pandas.notnull(row[k]):
361 name, value = typed_column_parser(k, row[k])
364 new_object[name] = value
367 self.validate(new_object, collection)
368 except jsonschema.ValidationError as e:
369 LOGGER.error("Validation error row %s", i)
372 accession = row[accession_name]
373 description = row.get('description', None)
376 response = self.post_json(collection, new_object)
378 print("Reponse {}".format(response))
380 obj = response['@graph'][0]
382 accession = obj.get('accession')
383 uuid = obj.get('uuid')
386 sheet[accession_name][i] = accession
390 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
391 sheet['uuid'][i] = uuid
393 print("row {} created: {}".format(i, accession))
395 created.append(new_object)
396 LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
def prepare_url(self, request_url):
    '''Fill in the parts of a url that the caller left out.

    Given a url fragment it will default to:

    * the scheme in self.scheme
    * requests to self.server

    This allows fairly flexible urls. e.g. with https as the scheme and
    submit.encodedcc.org as the server,

    prepare_url('/experiments/ENCSR000AEG')
    prepare_url('//submit.encodedcc.org/experiments/ENCSR000AEG')
    prepare_url('https://submit.encodedcc.org/experiments/ENCSR000AEG')

    should all return the same url.

    NOTE: a host name written without a scheme or a leading '//' is
    parsed as part of the path, so it ends up appended to self.server
    instead of replacing it.
    '''
    # clean up potentially messy urls
    parts = urlparse(request_url)
    if not parts.scheme:
        parts = parts._replace(scheme=self.scheme)
    if not parts.netloc:
        parts = parts._replace(netloc=self.server)
    # Every caller uses the prepared url, so it has to be handed back.
    return urlunparse(parts)
def search_jsonld(self, **kwargs):
    '''Send search request to ENCODED

    The keyword arguments are forwarded to get_json for the '/search/'
    url; the response is then passed through convert_search_to_jsonld.

    Returns the converted search result.
    '''
    url = self.prepare_url('/search/')
    result = self.get_json(url, **kwargs)
    self.convert_search_to_jsonld(result)
    # Without this the fetched and converted result would be thrown away.
    return result
def convert_search_to_jsonld(self, result):
    '''Add the context to search result

    Also remove hard to handle nested attributes
      e.g. remove object.term when we have no id

    result is modified in place: every entry of result['@graph'] is
    replaced by a copy without the keys that contain a '.', and then
    add_jsonld_context is applied with the prepared url of
    result['@id'] as the base. The same result object is returned.
    '''
    graph = result['@graph']
    # suppress nested attributes; slice assignment keeps the list object
    graph[:] = [
        {key: value for key, value in entry.items() if '.' not in key}
        for entry in graph
    ]

    self.add_jsonld_context(result, self.prepare_url(result['@id']))
    return result
449 def validate(self, obj, object_type=None):
450 """Validate an object against the ENCODED schema
453 obj (dictionary): object attributes to be submitted to encoded
454 object_type (string): ENCODED object name.
457 ValidationError: if the object does not conform to the schema.
459 object_type = object_type if object_type else self.get_object_type(obj)
460 schema_url = self.get_schema_url(object_type)
462 raise ValueError("Unable to construct schema url")
464 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
468 if '@type' in hidden:
470 jsonschema.validate(hidden, schema)
472 class TypedColumnParser(object):
474 def parse_sheet_array_type(value):
475 """Helper function to parse :array columns in sheet
477 return value.split(', ')
480 def parse_sheet_integer_type(value):
481 """Helper function to parse :integer columns in sheet
486 def parse_sheet_boolean_type(value):
487 """Helper function to parse :boolean columns in sheet
492 def parse_sheet_timestamp_type(value):
493 """Helper function to parse :date columns in sheet
495 return value.strftime('%Y-%m-%d')
498 def parse_sheet_string_type(value):
499 """Helper function to parse :string columns in sheet (the default)
501 return unicode(value)
503 def __getitem__(self, name):
505 'array': self.parse_sheet_array_type,
506 'boolean': self.parse_sheet_boolean_type,
507 'integer': self.parse_sheet_integer_type,
508 'date': self.parse_sheet_timestamp_type,
509 'string': self.parse_sheet_string_type
514 raise RuntimeError("unrecognized column type")
516 def __call__(self, header, value):
517 header = header.split(':')
518 column_type = 'string'
520 if header[1] == 'skip':
523 column_type = header[1]
524 return header[0], self[column_type](value)
# Parser instance that post_sheet calls as
# typed_column_parser(column_header, cell_value).
typed_column_parser = TypedColumnParser()
528 class Document(object):
529 """Helper class for registering documents
532 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
533 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
534 lysis.create_if_needed(server, lysis_uuid)
536 award = 'U54HG006998'
537 lab = '/labs/barbara-wold'
539 def __init__(self, url, document_type, description, aliases=None):
541 self.filename = os.path.basename(url)
542 self.document_type = document_type
543 self.description = description
548 if isinstance(aliases, list):
549 self.aliases = aliases
551 raise ValueError("Aliases needs to be a list")
552 self.content_type = None
560 def get_document(self):
561 if os.path.exists(self.url):
562 with open(self.url, 'r') as instream:
563 assert self.url.endswith('pdf')
564 self.content_type = 'application/pdf'
565 self.document = instream.read()
566 self.md5sum = hashlib.md5(self.document)
568 req = requests.get(self.url)
569 if req.status_code == 200:
570 self.content_type = req.headers['content-type']
571 self.document = req.content
572 self.md5sum = hashlib.md5(self.document)
573 self.urls = [self.url]
575 def create_payload(self):
578 'download': self.filename,
579 'type': self.content_type,
580 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document),
581 'md5sum': self.md5sum.hexdigest()
583 'document_type': self.document_type,
584 'description': self.description,
589 document_payload['aliases'] = self.aliases
591 document_payload['references'] = self.references
593 document_payload['urls'] = self.urls
595 return document_payload
597 def post(self, server):
598 document_payload = self.create_payload()
599 server.validate(document_payload, 'document')
600 return server.post_json('/documents/', document_payload)
602 def save(self, filename):
603 payload = self.create_payload()
604 with open(filename, 'w') as outstream:
605 outstream.write(pformat(payload))
607 def create_if_needed(self, server, uuid):
610 return self.post(server)
612 return server.get_json(uuid, embed=False)
614 if __name__ == '__main__':
616 from htsworkflow.util.rdfhelp import get_model, dump_model
617 from htsworkflow.util.rdfjsonld import load_into_model
618 from pprint import pprint
620 logging.basicConfig(level=logging.DEBUG)
621 encoded = ENCODED('test.encodedcc.org')
623 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
625 load_into_model(model, body)