1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
17 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
19 LOGGER = logging.getLogger(__name__)
22 # The None context will get added to the root of the tree and will
23 # provide common defaults.
25 # terms in multiple encoded objects
26 'award': {'@type': '@id'},
27 'dataset': {'@type': '@id'},
28 'description': 'rdf:description',
29 'documents': {'@type': '@id'},
30 'experiment': {'@type': '@id'},
31 'href': {'@type': '@id'},
32 'lab': {'@type': '@id'},
33 'library': {'@type': '@id'},
34 'pi': {'@type': '@id'},
35 'platform': {'@type': '@id'},
36 'replicates': {'@type': '@id'},
37 'submitted_by': {'@type': '@id'},
38 'url': {'@type': '@id'},
40 # Identify and markup contained classes.
41 # e.g. in the tree there was a sub-dictionary named 'biosample'
42 # That dictionary had a term 'biosample_term_id, which is the
43 # term that should be used as the @id.
45 'biosample_term_id': {'@type': '@id'},
48 "assay_term_id": {"@type": "@id"},
49 "files": {"@type": "@id"},
50 "original_files": {"@type": "@id"},
52 # I tried to use the JSON-LD mapping capabilities to convert the lab
53 # contact information into a vcard record, but the encoded model
54 # didn't lend itself well to the vcard schema
56 # "address1": "vcard:street-address",
57 # "address2": "vcard:street-address",
58 # "city": "vcard:locality",
59 # "state": "vcard:region",
60 # "country": "vcard:country"
63 'nucleic_acid_term_id': {'@type': '@id'}
67 #FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened url
    # syntax (instead of http://www.w3.org/2000/01/rdf-schema#label you can
    # write rdfs:label).
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # The scheme used to be misspelled "htp", which made every dc: term
    # expand to an IRI outside of the Dublin Core namespace.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
}
90 ENCODED_SCHEMA_ROOT = '/profiles/'
'''Programmatic access to encoded, the software powering ENCODE3's submit site.
96 def __init__(self, server, contexts=None, namespaces=None):
101 self.contexts = contexts if contexts else ENCODED_CONTEXT
102 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
103 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
107 return (self.username, self.password)
108 auth = property(get_auth)
110 def load_netrc(self):
112 session = netrc.netrc()
113 authenticators = session.authenticators(self.server)
115 self.username = authenticators[0]
116 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate an encoded JSON tree in place so it can be read as JSON-LD.

    Contexts are first attached to the objects in the tree by
    add_jsonld_child_context, then the namespace shortcuts are merged
    into the tree's top-level '@context' by add_jsonld_namespaces.

    tree is a json tree returned from the DCC's encoded database.
    default_base is handed to add_jsonld_child_context as the base url
    that relative urls will be resolved against.
    """
    self.add_jsonld_child_context(tree, default_base)
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
130 def add_jsonld_child_context(self, obj, default_base):
131 '''Add JSON-LD context to the encoded JSON.
133 This is recursive because some of the IDs were relative URLs
and I needed a way to properly compute the correct base URL.
136 # pretend strings aren't iterable
137 if isinstance(obj, six.string_types):
140 # recurse on container types
141 if isinstance(obj, collections.Sequence):
142 # how should I update lists?
144 self.add_jsonld_child_context(v, default_base)
147 if isinstance(obj, collections.Mapping):
148 for v in obj.values():
149 self.add_jsonld_child_context(v, default_base)
151 # we have an object. attach a context to it.
152 if self._is_encoded_object(obj):
153 context = self.create_jsonld_context(obj, default_base)
# this is a total hack for release 33 of
156 # encoded. They changed their model and
157 # i'm not sure what to do about it.
158 if obj.get('@context') == '/terms/':
160 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the namespace prefix shortcuts into a context.

    Every entry of self.namespaces is copied into context, which is
    updated in place.

    Only needs to be run on the top-most context
    '''
    shortcuts = self.namespaces
    context.update(shortcuts)
def create_jsonld_context(self, obj, default_base):
    '''Build the JSON-LD context for a single encoded object.

    self.contexts[None] = default context attributes added to any type
    self.contexts[type] = context attributes for this type.

    '@base' is the object's '@id' resolved against default_base and
    '@vocab' is the schema url of the object's type. The default
    attributes are applied first, then the attributes of each entry of
    obj['@type'] that has a context registered, so later entries win.

    Returns the new context dictionary.
    '''
    primary_type = self.get_object_type(obj)
    context = {
        '@base': urljoin(default_base, obj['@id']),
        '@vocab': self.get_schema_url(primary_type),
    }

    context.update(self.contexts[None])
    registered_types = (name for name in obj['@type'] if name in self.contexts)
    for type_name in registered_types:
        context.update(self.contexts[type_name])
    return context
def get_json(self, obj_id, **kwargs):
    '''GET an ENCODE object as JSON and return as dict

    Uses prepare_url to allow url short-cuts.
    If no keyword arguments are specified it will default to adding
    limit=all. Alternative keyword arguments can be passed in and will
    be sent to the host.

    Known keywords are:
      limit - (integer or 'all') how many records to return, all for all of them
      embed - (bool) if true expands linking ids into their associated object.
      format - text/html or application/json

    Credentials are only sent when both a username and a password are set.
    response.raise_for_status() is called for any status other than 200.
    '''
    if not kwargs:
        kwargs['limit'] = 'all'

    url = self.prepare_url(obj_id)
    LOGGER.info('requesting url: %s', url)

    # Never write the password itself to the log; debug logs get shared.
    LOGGER.debug('username: %s, password set: %s',
                 self.username, bool(self.password))

    arguments = {}
    if self.username and self.password:
        arguments['auth'] = self.auth
    # NOTE(review): the continuation of this call was not visible in the
    # reviewed text; params=kwargs follows the docstring ("will be sent to
    # the host") - confirm against the original.
    response = requests.get(url, headers=self.json_headers,
                            params=kwargs,
                            **arguments)
    if response.status_code != requests.codes.ok:
        LOGGER.error("Error http status: %s", response.status_code)
        response.raise_for_status()
    return response.json()
def get_jsonld(self, obj_id, **kwargs):
    '''Get ENCODE object as JSON-LD annotated with class contexts

    see get_json for documentation about what keywords can be passed.

    The prepared url of obj_id is used as the default base when the
    contexts are added. Returns the annotated tree.
    '''
    url = self.prepare_url(obj_id)
    # Deliberately not called "json": that name would shadow the json module.
    tree = self.get_json(obj_id, **kwargs)
    self.add_jsonld_context(tree, url)
    return tree
227 def get_object_type(self, obj):
228 """Return type for a encoded object
230 obj_type = obj.get('@type')
232 raise ValueError('None type')
233 if isinstance(obj_type, six.string_types):
234 raise ValueError('@type should be a list, not a string')
235 if not isinstance(obj_type, collections.Sequence):
236 raise ValueError('@type is not a sequence')
239 def get_schema_url(self, object_type):
240 """Create the ENCODED jsonschema url.
242 Return the ENCODED object schema url be either
243 object type name or the collection name one posts to.
246 server.get_schema_url('experiment') and
247 server.get_schema_url('/experiments/') both resolve to
248 SERVER/profiles/experiment.json
251 object_type (str): either ENCODED object name or collection
256 collection_to_type = {
257 '/biosamples/': 'biosample',
258 '/datasets/': 'dataset',
259 '/documents/': 'document',
260 '/experiments/': 'experiment',
261 '/libraries/': 'library',
262 '/replicates/': 'replicate',
265 object_type = collection_to_type.get(object_type, object_type)
267 return self.prepare_url(ENCODED_SCHEMA_ROOT + object_type + '.json') + '#'
def get_accession_name(self, collection):
    """Lookup common object accession name given a collection name.

    Args:
      collection (str): collection path, e.g. '/experiments/'.

    Returns:
      The accession name registered for that collection.

    Raises:
      RuntimeError: if the collection has no entry in the lookup table.
    """
    collection_to_accession_name = {
        '/experiments/': 'experiment_accession',
        '/biosamples/': 'biosample_accession',
        '/libraries/': 'library_accession',
        '/replicates/': 'uuid',
    }

    accession_name = collection_to_accession_name.get(collection)
    if accession_name is None:
        # Exceptions do not interpolate extra arguments the way logging
        # calls do, so the message has to be formatted explicitly.
        raise RuntimeError(
            "Update list of collection to accession names for %s" % (collection,))

    return accession_name
286 def _is_encoded_object(self, obj):
287 '''Test to see if an object is a JSON-LD object
289 Some of the nested dictionaries lack the @id or @type
290 information necessary to convert them.
292 if not isinstance(obj, collections.Iterable):
295 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Send a dictionary of changes to an object as a HTTP PATCH request.

    changes is serialized to JSON and sent to the prepared url of
    obj_id using the stored credentials. Any status other than 200 is
    logged together with the response text, and raise_for_status() is
    called. Returns the decoded JSON body of the response.
    """
    target = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', target)
    body = json.dumps(changes)
    response = requests.patch(target, auth=self.auth, headers=self.json_headers, data=body)
    if response.status_code == requests.codes.ok:
        return response.json()

    LOGGER.error("Error http status: {}".format(response.status_code))
    LOGGER.error("Response: %s", response.text)
    response.raise_for_status()
    return response.json()
def put_json(self, obj_id, new_object):
    """Send new_object to the url of obj_id as a HTTP PUT request.

    new_object is serialized to JSON and sent with the stored
    credentials. Any status other than 201 (created) is logged together
    with the response text, as patch_json and post_json do, and
    raise_for_status() is called. Returns the decoded JSON body of the
    response.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    payload = json.dumps(new_object)
    response = requests.put(url, auth=self.auth, headers=self.json_headers, data=payload)
    if response.status_code != requests.codes.created:
        LOGGER.error("Error http status: %s", response.status_code)
        # The body usually carries the server's explanation of the failure.
        LOGGER.error("Response: %s", response.text)
        response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """Create an object by sending a HTTP POST request to a collection.

    new_object is serialized to JSON and sent to the prepared url of
    collection_id using the stored credentials. Any status other than
    201 (created) is logged together with the raw response content, and
    raise_for_status() is called. Returns the decoded JSON body of the
    response.
    """
    destination = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', destination)
    body = json.dumps(new_object)

    response = requests.post(destination, auth=self.auth, headers=self.json_headers, data=body)
    if response.status_code == requests.codes.created:
        return response.json()

    LOGGER.error("http status: {}".format(response.status_code))
    LOGGER.error("message: {}".format(response.content))
    response.raise_for_status()
    return response.json()
334 def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
335 """Create new ENCODED objects using metadata encoded in pandas DataFrame
337 The DataFrame column names need to encode the attribute names,
338 and in some cases also include some additional type information.
339 (see TypedColumnParser)
342 collection (str): name of collection to create new objects in
343 sheet (pandas.DataFrame): DataFrame with objects to create,
344 assuming the appropriate accession number is empty.
345 additional the accession number and uuid is updated if the object
347 dry_run (bool): whether or not to skip the code to post the objects
348 verbose (bool): print the http responses.
351 list of created objects.
354 jsonschema.ValidationError if the object doesn't validate against
355 the encoded jsonschema.
357 accession_name = self.get_accession_name(collection)
360 columns = sheet.columns
361 tosubmit = sheet[pandas.isnull(sheet[accession_name])]
363 for i in tosubmit.index:
367 if pandas.notnull(row[k]):
368 name, value = typed_column_parser(k, row[k])
371 new_object[name] = value
374 self.validate(new_object, collection)
375 except jsonschema.ValidationError as e:
376 LOGGER.error("Validation error row %s", i)
379 accession = row[accession_name]
380 description = row.get('description', None)
383 response = self.post_json(collection, new_object)
385 print("Reponse {}".format(response))
387 obj = response['@graph'][0]
389 accession = obj.get('accession')
390 uuid = obj.get('uuid')
393 sheet[accession_name][i] = accession
397 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
398 sheet['uuid'][i] = uuid
400 print("row {} created: {}".format(i, accession))
402 created.append(new_object)
403 LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
def prepare_url(self, request_url):
    '''This attempts to provide some convenience for accessing a URL

    Given a url fragment it will default to :

    * the scheme stored in self.scheme
    * requests to self.server

    This allows fairly flexible urls. e.g.

      prepare_url('/experiments/ENCSR000AEG')
      prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    Parts that are already present (scheme, host, query, ...) are kept.

    NOTE(review): a host written without a scheme, e.g.
    'submit.encodedcc.org/experiments/ENCSR000AEG', is parsed by urlparse
    as a path, so self.server ends up in front of it. Write
    '//submit.encodedcc.org/...' to name a host without a scheme.

    Returns the completed url as a string.
    '''
    # clean up potentially messy urls
    parts = urlparse(request_url)
    # _replace returns a new ParseResult, so the field order urlunparse
    # needs never depends on dictionary ordering.
    parts = parts._replace(
        scheme=parts.scheme or self.scheme,
        netloc=parts.netloc or self.server,
    )
    return urlunparse(parts)
431 def search_jsonld(self, **kwargs):
432 '''Send search request to ENCODED
434 to do a general search do
437 url = self.prepare_url('/search/')
438 result = self.get_json(url, **kwargs)
439 self.convert_search_to_jsonld(result)
442 def convert_search_to_jsonld(self, result):
443 '''Add the context to search result
445 Also remove hard to handle nested attributes
446 e.g. remove object.term when we have no id
448 graph = result['@graph']
449 for i, obj in enumerate(graph):
450 # suppress nested attributes
451 graph[i] = {k: v for k, v in obj.items() if '.' not in k}
453 self.add_jsonld_context(result, self.prepare_url(result['@id']))
456 def validate(self, obj, object_type=None):
457 """Validate an object against the ENCODED schema
460 obj (dictionary): object attributes to be submitted to encoded
461 object_type (string): ENCODED object name.
464 ValidationError: if the object does not conform to the schema.
466 object_type = object_type if object_type else self.get_object_type(obj)
467 schema_url = self.get_schema_url(object_type)
469 raise ValueError("Unable to construct schema url")
471 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
475 if '@type' in hidden:
477 jsonschema.validate(hidden, schema)
479 class TypedColumnParser(object):
def parse_sheet_array_type(value):
    """Helper function to parse :array columns in sheet

    Splits value on commas, dropping any whitespace that follows a
    comma, and returns the pieces as a list of strings.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.split(r',\s*', value)
487 def parse_sheet_integer_type(value):
488 """Helper function to parse :integer columns in sheet
493 def parse_sheet_boolean_type(value):
494 """Helper function to parse :boolean columns in sheet
def parse_sheet_timestamp_type(value):
    """Helper function to parse :date columns in sheet

    Formats value with its strftime method as year-month-day text.
    """
    date_format = '%Y-%m-%d'
    return value.strftime(date_format)
505 def parse_sheet_string_type(value):
506 """Helper function to parse :string columns in sheet (the default)
510 def __getitem__(self, name):
512 'array': self.parse_sheet_array_type,
513 'boolean': self.parse_sheet_boolean_type,
514 'integer': self.parse_sheet_integer_type,
515 'date': self.parse_sheet_timestamp_type,
516 'string': self.parse_sheet_string_type
521 raise RuntimeError("unrecognized column type")
523 def __call__(self, header, value):
524 header = header.split(':')
525 column_type = 'string'
527 if header[1] == 'skip':
530 column_type = header[1]
531 return header[0], self[column_type](value)
533 typed_column_parser = TypedColumnParser()
535 class Document(object):
536 """Helper class for registering documents
539 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
540 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
541 lysis.create_if_needed(server, lysis_uuid)
543 award = 'U54HG006998'
544 lab = '/labs/barbara-wold'
546 def __init__(self, url, document_type, description, aliases=None):
548 self.filename = os.path.basename(url)
549 self.document_type = document_type
550 self.description = description
555 if isinstance(aliases, list):
556 self.aliases = aliases
558 raise ValueError("Aliases needs to be a list")
559 self.content_type = None
567 def get_document(self):
568 if os.path.exists(self.url):
569 with open(self.url, 'rb') as instream:
570 assert self.url.endswith('pdf')
571 self.content_type = 'application/pdf'
572 self.document = instream.read()
573 self.md5sum = hashlib.md5(self.document)
575 req = requests.get(self.url)
576 if req.status_code == 200:
577 self.content_type = req.headers['content-type']
578 self.document = req.content
579 self.md5sum = hashlib.md5(self.document)
580 self.urls = [self.url]
582 def create_payload(self):
585 'download': self.filename,
586 'type': self.content_type,
587 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
588 'md5sum': self.md5sum.hexdigest()
590 'document_type': self.document_type,
591 'description': self.description,
596 document_payload['aliases'] = self.aliases
598 document_payload['references'] = self.references
600 document_payload['urls'] = self.urls
602 return document_payload
def post(self, server):
    """Validate this document's payload and POST it to '/documents/'.

    The payload from create_payload() is first checked with
    server.validate() as a 'document', then sent with
    server.post_json(). Returns whatever post_json returns.
    """
    payload = self.create_payload()
    server.validate(payload, 'document')
    response = server.post_json('/documents/', payload)
    return response
def save(self, filename):
    """Write the pformat() rendering of this document's payload to filename.

    The file is opened in text mode for writing, replacing any
    previous content.
    """
    document_payload = self.create_payload()
    with open(filename, 'w') as destination:
        rendered = pformat(document_payload)
        destination.write(rendered)
614 def create_if_needed(self, server, uuid):
617 return self.post(server)
619 return server.get_json(uuid, embed=False)
621 if __name__ == '__main__':
623 from htsworkflow.util.rdfhelp import get_model, dump_model
624 from htsworkflow.util.rdfjsonld import load_into_model
625 from pprint import pprint
627 logging.basicConfig(level=logging.DEBUG)
628 encoded = ENCODED('test.encodedcc.org')
630 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
632 load_into_model(model, body)