1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
18 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
20 LOGGER = logging.getLogger(__name__)
23 # The None context will get added to the root of the tree and will
24 # provide common defaults.
26 # terms in multiple encoded objects
27 'award': {'@type': '@id'},
28 'dataset': {'@type': '@id'},
29 'description': 'rdf:description',
30 'documents': {'@type': '@id'},
31 'experiment': {'@type': '@id'},
32 'href': {'@type': '@id'},
33 'lab': {'@type': '@id'},
34 'library': {'@type': '@id'},
35 'pi': {'@type': '@id'},
36 'platform': {'@type': '@id'},
37 'replicates': {'@type': '@id'},
38 'submitted_by': {'@type': '@id'},
39 'url': {'@type': '@id'},
41 # Identify and markup contained classes.
# e.g. in the tree there was a sub-dictionary named 'biosample'.
# That dictionary had a term 'biosample_term_id', which is the
# term that should be used as the @id.
46 'biosample_term_id': {'@type': '@id'},
49 "assay_term_id": {"@type": "@id"},
50 "files": {"@type": "@id"},
51 "original_files": {"@type": "@id"},
53 # I tried to use the JSON-LD mapping capabilities to convert the lab
54 # contact information into a vcard record, but the encoded model
55 # didn't lend itself well to the vcard schema
57 # "address1": "vcard:street-address",
58 # "address2": "vcard:street-address",
59 # "city": "vcard:locality",
60 # "state": "vcard:region",
61 # "country": "vcard:country"
64 'nucleic_acid_term_id': {'@type': '@id'}
68 #FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened url
    # syntax (instead of http://www.w3.org/2000/01/rdf-schema#label you can
    # write the prefixed form of the term).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # The scheme used to be misspelled "htp://", which made every dc: term
    # expand to an IRI outside the Dublin Core namespace.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
}
91 ENCODED_SCHEMA_ROOT = '/profiles/'
'''Programmatic access to encoded, the software powering ENCODE3's submit site.
97 def __init__(self, server, contexts=None, namespaces=None):
102 self.contexts = contexts if contexts else ENCODED_CONTEXT
103 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
104 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
108 return (self.username, self.password)
109 auth = property(get_auth)
111 def load_netrc(self):
113 session = netrc.netrc()
114 authenticators = session.authenticators(self.server)
116 self.username = authenticators[0]
117 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate an encoded JSON tree, in place, with JSON-LD contexts.

    tree is a json tree returned from the DCC's encoded database.
    default_base is handed to add_jsonld_child_context as the base url
        that relative urls are resolved against.

    The child pass runs first; the namespace prefixes are then merged
    into tree['@context'], so that key has to exist once the child pass
    is done (a KeyError is raised otherwise).
    """
    self.add_jsonld_child_context(tree, default_base)
    # Only the top-most context receives the namespace shortcuts.
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
131 def add_jsonld_child_context(self, obj, default_base):
132 '''Add JSON-LD context to the encoded JSON.
This is recursive because some of the IDs were relative URLs
and I needed a way to properly compute the correct base URL.
137 # pretend strings aren't iterable
138 if isinstance(obj, six.string_types):
141 # recurse on container types
142 if isinstance(obj, collections.Sequence):
143 # how should I update lists?
145 self.add_jsonld_child_context(v, default_base)
148 if isinstance(obj, collections.Mapping):
149 for v in obj.values():
150 self.add_jsonld_child_context(v, default_base)
152 # we have an object. attach a context to it.
153 if self._is_encoded_object(obj):
154 context = self.create_jsonld_context(obj, default_base)
# this is a total hack for release 33 of
# encoded. They changed their model and
# I'm not sure what to do about it.
159 if obj.get('@context') == '/terms/':
161 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace prefix shortcuts into a context, in place.

    Only needs to be run on the top-most context.
    Entries already present in context under the same key are
    overwritten by the ones from self.namespaces.
    '''
    prefixes = self.namespaces
    context.update(prefixes)
def create_jsonld_context(self, obj, default_base):
    '''Synthesize the context for an encoded type.

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    obj must carry '@id' and '@type'. '@base' is obj['@id'] resolved
    against default_base, and '@vocab' is the schema url of the type
    reported by self.get_object_type.

    Returns the new context dictionary; obj itself is not modified.
    '''
    obj_type = self.get_object_type(obj)
    context = {
        '@base': urljoin(default_base, obj['@id']),
        '@vocab': self.get_schema_url(obj_type),
    }
    # Defaults go in first so the per-type entries applied below win on
    # a key collision. A contexts mapping without a None entry simply
    # contributes no defaults instead of raising KeyError.
    context.update(self.contexts.get(None, {}))
    for type_name in obj['@type']:
        if type_name in self.contexts:
            context.update(self.contexts[type_name])
    return context
186 def get_json(self, obj_id, **kwargs):
187 '''GET an ENCODE object as JSON and return as dict
189 Uses prepare_url to allow url short-cuts
190 if no keyword arguments are specified it will default to adding limit=all
191 Alternative keyword arguments can be passed in and will be sent to the host.
194 limit - (integer or 'all') how many records to return, all for all of them
195 embed - (bool) if true expands linking ids into their associated object.
196 format - text/html or application/json
199 kwargs['limit'] = 'all'
201 url = self.prepare_url(obj_id)
202 LOGGER.info('requesting url: {}'.format(url))
206 LOGGER.debug('username: %s, password: %s', self.username, self.password)
208 if self.username and self.password:
209 arguments['auth'] = self.auth
210 response = requests.get(url, headers=self.json_headers,
213 if not response.status_code == requests.codes.ok:
214 LOGGER.error("Error http status: {}".format(response.status_code))
215 response.raise_for_status()
216 return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Get ENCODE object as JSON-LD annotated with class contexts.

    see get_json for documentation about what keywords can be passed.

    The prepared url of obj_id is used as the base url when the
    contexts are attached. Returns the annotated tree.
    '''
    url = self.prepare_url(obj_id)
    # Named "tree" rather than "json" so the json module name used
    # elsewhere in this file is not shadowed.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
228 def get_object_type(self, obj):
229 """Return type for a encoded object
231 obj_type = obj.get('@type')
233 raise ValueError('None type')
234 if isinstance(obj_type, six.string_types):
235 raise ValueError('@type should be a list, not a string')
236 if not isinstance(obj_type, collections.Sequence):
237 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Create the ENCODED jsonschema url.

    Return the ENCODED object schema url by either
    object type name or the collection name one posts to.

    For example
       server.get_schema_url('experiment') and
       server.get_schema_url('/experiments/') both resolve to
       SERVER/profiles/experiment.json

    Args:
        object_type (str): either ENCODED object name or collection

    Returns:
        Schema url built through prepare_url, with a trailing '#'.
    """
    type_by_collection = {
        '/biosamples/': 'biosample',
        '/datasets/': 'dataset',
        '/documents/': 'document',
        '/experiments/': 'experiment',
        '/libraries/': 'library',
        '/replicates/': 'replicate',
    }

    # Collection names are translated; anything else is used verbatim.
    if object_type in type_by_collection:
        object_type = type_by_collection[object_type]

    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    return self.prepare_url(schema_path) + '#'
def get_accession_name(self, collection):
    """Lookup common object accession name given a collection name.

    Args:
        collection (str): collection name, e.g. '/experiments/'

    Returns:
        Name of the attribute holding the accession for that collection.

    Raises:
        RuntimeError: if the collection is not in the lookup table.
    """
    collection_to_accession_name = {
        '/experiments/': 'experiment_accession',
        '/biosamples/': 'biosample_accession',
        '/libraries/': 'library_accession',
        '/replicates/': 'uuid',
    }

    accession_name = collection_to_accession_name.get(collection)
    if accession_name is None:
        # Exceptions do not interpolate logging-style arguments, so the
        # message has to be formatted before it is raised.
        raise RuntimeError(
            "Update list of collection to accession names for %s" % (collection,))

    return accession_name
287 def _is_encoded_object(self, obj):
288 '''Test to see if an object is a JSON-LD object
290 Some of the nested dictionaries lack the @id or @type
291 information necessary to convert them.
293 if not isinstance(obj, collections.Iterable):
296 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Given a dictionary of changes push them as a HTTP patch request

    The changes are serialized with json.dumps and sent to the prepared
    url of obj_id using self.auth and self.json_headers. Any status
    other than requests.codes.ok is logged and passed through
    raise_for_status. Returns the decoded JSON body of the response.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    reply = requests.patch(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=json.dumps(changes),
    )
    if reply.status_code == requests.codes.ok:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    LOGGER.error("Response: %s", reply.text)
    reply.raise_for_status()
    return reply.json()
def put_json(self, obj_id, new_object):
    """Send new_object as the JSON body of a HTTP PUT request.

    The object is serialized with json.dumps and sent to the prepared
    url of obj_id using self.auth and self.json_headers. Any status
    other than requests.codes.created is logged and passed through
    raise_for_status. Returns the decoded JSON body of the response.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', target)
    reply = requests.put(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=json.dumps(new_object),
    )
    if reply.status_code == requests.codes.created:
        return reply.json()

    LOGGER.error("Error http status: {}".format(reply.status_code))
    reply.raise_for_status()
    return reply.json()
def post_json(self, collection_id, new_object):
    """Send new_object as the JSON body of a HTTP POST request.

    The object is serialized with json.dumps and sent to the prepared
    url of collection_id using self.auth and self.json_headers. Any
    status other than requests.codes.created is logged together with
    the response content and passed through raise_for_status.
    Returns the decoded JSON body of the response.
    """
    target = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', target)
    reply = requests.post(
        target,
        auth=self.auth,
        headers=self.json_headers,
        data=json.dumps(new_object),
    )
    if reply.status_code == requests.codes.created:
        return reply.json()

    LOGGER.error("http status: {}".format(reply.status_code))
    LOGGER.error("message: {}".format(reply.content))
    reply.raise_for_status()
    return reply.json()
335 def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
336 """Create new ENCODED objects using metadata encoded in pandas DataFrame
338 The DataFrame column names need to encode the attribute names,
339 and in some cases also include some additional type information.
340 (see TypedColumnParser)
343 collection (str): name of collection to create new objects in
344 sheet (pandas.DataFrame): DataFrame with objects to create,
345 assuming the appropriate accession number is empty.
346 additional the accession number and uuid is updated if the object
348 dry_run (bool): whether or not to skip the code to post the objects
349 verbose (bool): print the http responses.
352 list of created objects.
355 jsonschema.ValidationError if the object doesn't validate against
356 the encoded jsonschema.
358 accession_name = self.get_accession_name(collection)
360 to_create = self.prepare_objects_from_sheet(collection, sheet)
365 for i, new_object in to_create:
367 accession = new_object.get('accession')
368 uuid = new_object.get('uuid')
369 description = new_object.get('description')
371 posted_object = self.post_object_from_row(
372 collection, i, new_object, dry_run, verbose
374 created.append(posted_object)
377 accession = posted_object.get('accession')
378 uuid = posted_object.get('uuid')
379 description = posted_object.get('description')
381 accessions.append(accession)
384 LOGGER.info('row {} ({}) -> {}'.format(
385 (i+2), description, accession))
386 # +2 comes from python row index + 1 to convert to
387 # one based indexing + 1 to account for
388 # row removed by header parsing
390 accessions.append(numpy.nan)
391 uuids.append(numpy.nan)
393 if accession_name in sheet.columns:
394 sheet[accession_name] = accessions
395 if 'uuid' in sheet.columns:
396 sheet['uuid'] = uuids
400 def prepare_objects_from_sheet(self, collection, sheet):
401 accession_name = self.get_accession_name(collection)
403 for i, row in sheet.iterrows():
405 for name, value in row.items():
406 if pandas.notnull(value):
407 name, value = typed_column_parser(name, value)
410 new_object[name] = value
412 if new_object and new_object.get(accession_name) is None:
414 self.validate(new_object, collection)
415 except jsonschema.ValidationError as e:
416 LOGGER.error("Validation error row %s", i)
418 to_create.append((i, new_object))
421 to_create.append((i, None))
425 def post_object_from_row(self, collection, i, new_object,
426 dry_run=True, verbose=True):
427 accession_name = self.get_accession_name(collection)
430 response = self.post_json(collection, new_object)
432 print("Reponse {}".format(response))
434 obj = response['@graph'][0]
436 accession = obj.get(accession_name)
438 accession = obj.get('uuid')
440 print("row {} created: {}".format(i, accession))
443 new_object[accession_name] = 'would create'
def prepare_url(self, request_url):
    '''This attempts to provide some convenience for accessing a URL

    Given a url fragment it will default to :
    * requests over self.scheme
    * requests to self.server

    This allows fairly flexible urls. e.g.

    prepare_url('/experiments/ENCSR000AEG')
    prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    both give a complete url; parts already present are left untouched.

    NOTE(review): a bare host such as
    'submit.encodedcc.org/experiments/ENCSR000AEG' is parsed by urlparse
    as a path, not a host, so it ends up underneath self.server. Use a
    leading '//' or a full scheme to name another host.

    Returns the completed url as a string.
    '''
    # clean up potentially messy urls
    parts = urlparse(request_url)
    # _replace keeps the six components in their positions, so the result
    # does not depend on the iteration order of a dict built from them.
    if not parts.scheme:
        parts = parts._replace(scheme=self.scheme)
    if not parts.netloc:
        parts = parts._replace(netloc=self.server)
    return urlunparse(parts)
def search_jsonld(self, **kwargs):
    '''Send search request to ENCODED

    The keyword arguments are forwarded unchanged to get_json for the
    '/search/' url, and the response is annotated in place by
    convert_search_to_jsonld.

    Returns the annotated search result.
    '''
    url = self.prepare_url('/search/')
    result = self.get_json(url, **kwargs)
    self.convert_search_to_jsonld(result)
    # Hand the annotated tree back; without this the caller gets None.
    return result
def convert_search_to_jsonld(self, result):
    '''Add the context to search result

    Also remove hard to handle nested attributes
      e.g. remove object.term when we have no id

    Each entry of result['@graph'] is replaced by a copy without the
    keys that contain a '.'; the contexts are then attached with the
    prepared url of result['@id'] as base.
    '''
    entries = result['@graph']
    for index, entry in enumerate(entries):
        # suppress nested attributes
        flattened = {}
        for key, value in entry.items():
            if '.' in key:
                continue
            flattened[key] = value
        entries[index] = flattened

    base_url = self.prepare_url(result['@id'])
    self.add_jsonld_context(result, base_url)
495 def validate(self, obj, object_type=None):
496 """Validate an object against the ENCODED schema
499 obj (dictionary): object attributes to be submitted to encoded
500 object_type (string): ENCODED object name.
503 ValidationError: if the object does not conform to the schema.
505 object_type = object_type if object_type else self.get_object_type(obj)
506 schema_url = self.get_schema_url(object_type)
508 raise ValueError("Unable to construct schema url")
510 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
514 if '@type' in hidden:
516 jsonschema.validate(hidden, schema)
518 class TypedColumnParser(object):
520 def parse_sheet_array_type(value):
521 """Helper function to parse :array columns in sheet
523 return re.split(',\s*', value)
526 def parse_sheet_integer_type(value):
527 """Helper function to parse :integer columns in sheet
532 def parse_sheet_boolean_type(value):
533 """Helper function to parse :boolean columns in sheet
538 def parse_sheet_timestamp_type(value):
539 """Helper function to parse :date columns in sheet
541 return value.strftime('%Y-%m-%d')
544 def parse_sheet_string_type(value):
545 """Helper function to parse :string columns in sheet (the default)
549 def __getitem__(self, name):
551 'array': self.parse_sheet_array_type,
552 'boolean': self.parse_sheet_boolean_type,
553 'integer': self.parse_sheet_integer_type,
554 'date': self.parse_sheet_timestamp_type,
555 'string': self.parse_sheet_string_type
560 raise RuntimeError("unrecognized column type")
562 def __call__(self, header, value):
563 header = header.split(':')
564 column_type = 'string'
566 if header[1] == 'skip':
569 column_type = header[1]
570 return header[0], self[column_type](value)
572 typed_column_parser = TypedColumnParser()
574 class Document(object):
575 """Helper class for registering documents
578 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
579 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
580 lysis.create_if_needed(server, lysis_uuid)
582 award = 'U54HG006998'
583 lab = '/labs/barbara-wold'
585 def __init__(self, url, document_type, description, aliases=None):
587 self.filename = os.path.basename(url)
588 self.document_type = document_type
589 self.description = description
594 if isinstance(aliases, list):
595 self.aliases = aliases
597 raise ValueError("Aliases needs to be a list")
598 self.content_type = None
606 def get_document(self):
607 if os.path.exists(self.url):
608 with open(self.url, 'rb') as instream:
609 assert self.url.endswith('pdf')
610 self.content_type = 'application/pdf'
611 self.document = instream.read()
612 self.md5sum = hashlib.md5(self.document)
614 req = requests.get(self.url)
615 if req.status_code == 200:
616 self.content_type = req.headers['content-type']
617 self.document = req.content
618 self.md5sum = hashlib.md5(self.document)
619 self.urls = [self.url]
621 def create_payload(self):
624 'download': self.filename,
625 'type': self.content_type,
626 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
627 'md5sum': self.md5sum.hexdigest()
629 'document_type': self.document_type,
630 'description': self.description,
635 document_payload['aliases'] = self.aliases
637 document_payload['references'] = self.references
639 document_payload['urls'] = self.urls
641 return document_payload
643 def post(self, server):
644 document_payload = self.create_payload()
645 server.validate(document_payload, 'document')
646 return server.post_json('/documents/', document_payload)
648 def save(self, filename):
649 payload = self.create_payload()
650 with open(filename, 'w') as outstream:
651 outstream.write(pformat(payload))
653 def create_if_needed(self, server, uuid):
656 return self.post(server)
658 return server.get_json(uuid, embed=False)
660 if __name__ == '__main__':
662 from htsworkflow.util.rdfhelp import get_model, dump_model
663 from htsworkflow.util.rdfjsonld import load_into_model
664 from pprint import pprint
666 logging.basicConfig(level=logging.DEBUG)
667 encoded = ENCODED('test.encodedcc.org')
669 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
671 load_into_model(model, body)