1 """Interface with encoded software for ENCODE3 data submission & warehouse
3 This allows retrieving blocks
5 from __future__ import print_function
17 from six.moves.urllib.parse import urljoin, urlparse, urlunparse
19 LOGGER = logging.getLogger(__name__)
22 # The None context will get added to the root of the tree and will
23 # provide common defaults.
25 # terms in multiple encoded objects
26 'award': {'@type': '@id'},
27 'dataset': {'@type': '@id'},
28 'description': 'rdf:description',
29 'documents': {'@type': '@id'},
30 'experiment': {'@type': '@id'},
31 'href': {'@type': '@id'},
32 'lab': {'@type': '@id'},
33 'library': {'@type': '@id'},
34 'pi': {'@type': '@id'},
35 'platform': {'@type': '@id'},
36 'replicates': {'@type': '@id'},
37 'submitted_by': {'@type': '@id'},
38 'url': {'@type': '@id'},
40 # Identify and markup contained classes.
41 # e.g. in the tree there was a sub-dictionary named 'biosample'
42 # That dictionary had a term 'biosample_term_id', which is the
43 # term that should be used as the @id.
45 'biosample_term_id': {'@type': '@id'},
48 "assay_term_id": {"@type": "@id"},
49 "files": {"@type": "@id"},
50 "original_files": {"@type": "@id"},
52 # I tried to use the JSON-LD mapping capabilities to convert the lab
53 # contact information into a vcard record, but the encoded model
54 # didn't lend itself well to the vcard schema
56 # "address1": "vcard:street-address",
57 # "address2": "vcard:street-address",
58 # "city": "vcard:locality",
59 # "state": "vcard:region",
60 # "country": "vcard:country"
63 'nucleic_acid_term_id': {'@type': '@id'}
67 #FIXME: this needs to be initialized from rdfns
ENCODED_NAMESPACES = {
    # JSON-LD lets you define namespaces so you can use the shortened
    # URL syntax: a prefixed name such as rdfs:label instead of the
    # full http://www.w3.org/2000/01/rdf-schema#label
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # The scheme used to be misspelled "htp://", which made every dc:
    # term expand to an IRI outside the Dublin Core namespace.
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/",  # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/",  # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_",  # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    "SO": "http://purl.obolibrary.org/obo/SO_",  # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl
    # NTR: New Term Request space for DCC to implement new ontology terms
    # NOTE(review): no "NTR" prefix mapping is visible in this view of
    # the file -- confirm whether one belongs here.
}
90 ENCODED_SCHEMA_ROOT = '/profiles/'
94 '''Programmatic access to encoded, the software powering ENCODE3's submit site.
96 def __init__(self, server, contexts=None, namespaces=None):
101 self.contexts = contexts if contexts else ENCODED_CONTEXT
102 self.namespaces = namespaces if namespaces else ENCODED_NAMESPACES
103 self.json_headers = {'content-type': 'application/json', 'accept': 'application/json'}
107 return (self.username, self.password)
108 auth = property(get_auth)
110 def load_netrc(self):
112 session = netrc.netrc()
113 authenticators = session.authenticators(self.server)
115 self.username = authenticators[0]
116 self.password = authenticators[2]
def add_jsonld_context(self, tree, default_base):
    """Annotate a JSON tree with JSON-LD contexts, in place.

    tree is a json tree returned from the DCC's encoded database.
    default_base is handed on, together with the tree, to
    add_jsonld_child_context.

    Once the child contexts have been attached, the namespace
    shortcuts are merged into the tree's own top-level '@context'.
    Nothing is returned; the tree itself is modified.
    """
    self.add_jsonld_child_context(tree, default_base)
    # Only the root context needs the prefix shortcuts.
    root_context = tree['@context']
    self.add_jsonld_namespaces(root_context)
130 def add_jsonld_child_context(self, obj, default_base):
131 '''Add JSON-LD context to the encoded JSON.
133 This is recursive because some of the IDs were relative URLs
134 and I needed a way to properly compute the correct base URL.
136 # pretend strings aren't iterable
137 if isinstance(obj, six.string_types):
140 # recurse on container types
141 if isinstance(obj, collections.Sequence):
142 # how should I update lists?
144 self.add_jsonld_child_context(v, default_base)
147 if isinstance(obj, collections.Mapping):
148 for v in obj.values():
149 self.add_jsonld_child_context(v, default_base)
151 # we have an object. attach a context to it.
152 if self._is_encoded_object(obj):
153 context = self.create_jsonld_context(obj, default_base)
155 # this is a total hack for release 33 of
156 # encoded. They changed their model and
157 # i'm not sure what to do about it.
158 if obj.get('@context') == '/terms/':
160 obj.setdefault('@context', {}).update(context)
def add_jsonld_namespaces(self, context):
    '''Merge the shortcut namespaces into a context, in place.

    Only needs to be run on the top-most context.
    Entries of self.namespaces overwrite same-named keys already
    present in context.
    '''
    shortcuts = self.namespaces
    context.update(shortcuts)
169 def create_jsonld_context(self, obj, default_base):
170 '''Synthesize the context for an encoded type
172 self.contexts[None] = default context attributes added to any type
173 self.contexts[type] = context attributes for this type.
175 obj_type = self.get_object_type(obj)
176 context = {'@base': urljoin(default_base, obj['@id']),
177 '@vocab': self.get_schema_url(obj_type)}
179 context.update(self.contexts[None])
180 for t in obj['@type']:
181 if t in self.contexts:
182 context.update(self.contexts[t])
185 def get_json(self, obj_id, **kwargs):
186 '''GET an ENCODE object as JSON and return as dict
188 Uses prepare_url to allow url short-cuts
189 if no keyword arguments are specified it will default to adding limit=all
190 Alternative keyword arguments can be passed in and will be sent to the host.
193 limit - (integer or 'all') how many records to return, all for all of them
194 embed - (bool) if true expands linking ids into their associated object.
195 format - text/html or application/json
198 kwargs['limit'] = 'all'
200 url = self.prepare_url(obj_id)
201 LOGGER.info('requesting url: {}'.format(url))
205 LOGGER.debug('username: %s, password: %s', self.username, self.password)
207 if self.username and self.password:
208 arguments['auth'] = self.auth
209 response = requests.get(url, headers=self.json_headers,
212 if not response.status_code == requests.codes.ok:
213 LOGGER.error("Error http status: {}".format(response.status_code))
214 response.raise_for_status()
215 return response.json()
217 def get_jsonld(self, obj_id, **kwargs):
218 '''Get ENCODE object as JSONLD annotated with class contexts
220 see get_json for documentation about what keywords can be passed.
222 url = self.prepare_url(obj_id)
223 json = self.get_json(obj_id, **kwargs)
224 self.add_jsonld_context(json, url)
227 def get_object_type(self, obj):
228 """Return type for an encoded object
230 obj_type = obj.get('@type')
232 raise ValueError('None type')
233 if isinstance(obj_type, six.string_types):
234 raise ValueError('@type should be a list, not a string')
235 if not isinstance(obj_type, collections.Sequence):
236 raise ValueError('@type is not a sequence')
def get_schema_url(self, object_type):
    """Create the ENCODED jsonschema url.

    The schema can be named either by the object type or by the
    collection one posts to, so

        server.get_schema_url('experiment') and
        server.get_schema_url('/experiments/')

    both produce the url for /profiles/experiment.json

    Args:
        object_type (str): either ENCODED object name or collection
            path. Values that are not a known collection path are
            used unchanged as the type name.

    Returns:
        The url built by prepare_url for the schema document, with
        a trailing '#' appended.
    """
    type_for_collection = {
        '/biosamples/': 'biosample',
        '/datasets/': 'dataset',
        '/documents/': 'document',
        '/experiments/': 'experiment',
        '/libraries/': 'library',
        '/replicates/': 'replicate',
    }
    if object_type in type_for_collection:
        object_type = type_for_collection[object_type]

    schema_path = ENCODED_SCHEMA_ROOT + object_type + '.json'
    return self.prepare_url(schema_path) + '#'
def get_accession_name(self, collection):
    """Lookup common object accession name given a collection name.

    Args:
        collection (str): collection path such as '/experiments/'.

    Returns:
        str: the accession name registered for that collection
        (post_sheet uses it as a sheet column name).

    Raises:
        RuntimeError: if the collection has no entry in the table
            below; the message names the offending collection.
    """
    collection_to_accession_name = {
        '/experiments/': 'experiment_accession',
        '/biosamples/': 'biosample_accession',
        '/libraries/': 'library_accession',
        '/replicates/': 'uuid',
    }

    accession_name = collection_to_accession_name.get(collection)
    if accession_name is None:
        # Exceptions, unlike logging calls, do not interpolate extra
        # arguments, so the message has to be formatted explicitly.
        raise RuntimeError(
            "Update list of collection to accession names for %s" % (collection,))

    return accession_name
285 def _is_encoded_object(self, obj):
286 '''Test to see if an object is a JSON-LD object
288 Some of the nested dictionaries lack the @id or @type
289 information necessary to convert them.
291 if not isinstance(obj, collections.Iterable):
294 if '@id' in obj and '@type' in obj:
def patch_json(self, obj_id, changes):
    """Given a dictionary of changes push them as a HTTP patch request

    Args:
        obj_id: url or url fragment; expanded with prepare_url.
        changes: JSON-serializable changes, sent as the request body.

    Returns:
        The decoded JSON body of the response.

    Any status other than requests.codes.ok is logged together with
    the response text, and response.raise_for_status() is called
    before the body is decoded.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PATCHing to %s', url)
    body = json.dumps(changes)
    response = requests.patch(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    if response.status_code == requests.codes.ok:
        return response.json()

    LOGGER.error("Error http status: {}".format(response.status_code))
    LOGGER.error("Response: %s", response.text)
    response.raise_for_status()
    return response.json()
def put_json(self, obj_id, new_object):
    """PUT new_object, serialized as JSON, to the url for obj_id.

    Args:
        obj_id: url or url fragment; expanded with prepare_url.
        new_object: JSON-serializable object sent as the request body.

    Returns:
        The decoded JSON body of the response.

    Any status other than requests.codes.created is logged as an
    error, and response.raise_for_status() is called before the
    body is decoded.
    """
    url = self.prepare_url(obj_id)
    LOGGER.info('PUTing to %s', url)
    body = json.dumps(new_object)
    response = requests.put(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    # NOTE(review): success is defined as "created" only, so any other
    # non-error status is logged as an error yet still returned --
    # confirm that is what the server is expected to answer for a PUT.
    if response.status_code == requests.codes.created:
        return response.json()

    LOGGER.error("Error http status: {}".format(response.status_code))
    response.raise_for_status()
    return response.json()
def post_json(self, collection_id, new_object):
    """POST new_object, serialized as JSON, to a collection url.

    Args:
        collection_id: url or url fragment of the collection;
            expanded with prepare_url.
        new_object: JSON-serializable object sent as the request body.

    Returns:
        The decoded JSON body of the response.

    Any status other than requests.codes.created is logged together
    with the raw response content, and response.raise_for_status()
    is called before the body is decoded.
    """
    url = self.prepare_url(collection_id)
    LOGGER.info('POSTing to %s', url)
    body = json.dumps(new_object)

    response = requests.post(
        url,
        auth=self.auth,
        headers=self.json_headers,
        data=body,
    )
    if response.status_code == requests.codes.created:
        return response.json()

    LOGGER.error("http status: {}".format(response.status_code))
    LOGGER.error("message: {}".format(response.content))
    response.raise_for_status()
    return response.json()
333 def post_sheet(self, collection, sheet, dry_run=True, verbose=False):
334 """Create new ENCODED objects using metadata encoded in pandas DataFrame
336 The DataFrame column names need to encode the attribute names,
337 and in some cases also include some additional type information.
338 (see TypedColumnParser)
341 collection (str): name of collection to create new objects in
342 sheet (pandas.DataFrame): DataFrame with objects to create,
343 assuming the appropriate accession number is empty.
344 additional the accession number and uuid is updated if the object
346 dry_run (bool): whether or not to skip the code to post the objects
347 verbose (bool): print the http responses.
350 list of created objects.
353 jsonschema.ValidationError if the object doesn't validate against
354 the encoded jsonschema.
356 accession_name = self.get_accession_name(collection)
359 columns = sheet.columns
360 tosubmit = sheet[pandas.isnull(sheet[accession_name])]
362 for i in tosubmit.index:
366 if pandas.notnull(row[k]):
367 name, value = typed_column_parser(k, row[k])
370 new_object[name] = value
373 self.validate(new_object, collection)
374 except jsonschema.ValidationError as e:
375 LOGGER.error("Validation error row %s", i)
378 accession = row[accession_name]
379 description = row.get('description', None)
382 response = self.post_json(collection, new_object)
384 print("Reponse {}".format(response))
386 obj = response['@graph'][0]
388 accession = obj.get('accession')
389 uuid = obj.get('uuid')
392 sheet[accession_name][i] = accession
396 if 'uuid' in columns and pandas.isnull(sheet['uuid'][i]):
397 sheet['uuid'][i] = uuid
399 print("row {} created: {}".format(i, accession))
401 created.append(new_object)
402 LOGGER.info('row {} ({}) -> {}'.format(i, description, accession))
406 def prepare_url(self, request_url):
407 '''This attempts to provide some convenience for accessing a URL
409 Given a url fragment it will default to :
411 * requests to self.server
413 This allows fairly flexible urls. e.g.
415 prepare_url('/experiments/ENCSR000AEG')
416 prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')
417 prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')
419 should all return the same url
421 # clean up potentially messy urls
422 url = urlparse(request_url)._asdict()
423 if not url['scheme']:
424 url['scheme'] = self.scheme
425 if not url['netloc']:
426 url['netloc'] = self.server
427 url = urlunparse(url.values())
430 def search_jsonld(self, **kwargs):
431 '''Send search request to ENCODED
433 to do a general search do
436 url = self.prepare_url('/search/')
437 result = self.get_json(url, **kwargs)
438 self.convert_search_to_jsonld(result)
441 def convert_search_to_jsonld(self, result):
442 '''Add the context to search result
444 Also remove hard to handle nested attributes
445 e.g. remove object.term when we have no id
447 graph = result['@graph']
448 for i, obj in enumerate(graph):
449 # suppress nested attributes
450 graph[i] = {k: v for k, v in obj.items() if '.' not in k}
452 self.add_jsonld_context(result, self.prepare_url(result['@id']))
455 def validate(self, obj, object_type=None):
456 """Validate an object against the ENCODED schema
459 obj (dictionary): object attributes to be submitted to encoded
460 object_type (string): ENCODED object name.
463 ValidationError: if the object does not conform to the schema.
465 object_type = object_type if object_type else self.get_object_type(obj)
466 schema_url = self.get_schema_url(object_type)
468 raise ValueError("Unable to construct schema url")
470 schema = self.schemas.setdefault(object_type, self.get_json(schema_url))
474 if '@type' in hidden:
476 jsonschema.validate(hidden, schema)
478 class TypedColumnParser(object):
480 def parse_sheet_array_type(value):
481 """Helper function to parse :array columns in sheet
483 return re.split(',\s*', value)
486 def parse_sheet_integer_type(value):
487 """Helper function to parse :integer columns in sheet
492 def parse_sheet_boolean_type(value):
493 """Helper function to parse :boolean columns in sheet
498 def parse_sheet_timestamp_type(value):
499 """Helper function to parse :date columns in sheet
501 return value.strftime('%Y-%m-%d')
504 def parse_sheet_string_type(value):
505 """Helper function to parse :string columns in sheet (the default)
509 def __getitem__(self, name):
511 'array': self.parse_sheet_array_type,
512 'boolean': self.parse_sheet_boolean_type,
513 'integer': self.parse_sheet_integer_type,
514 'date': self.parse_sheet_timestamp_type,
515 'string': self.parse_sheet_string_type
520 raise RuntimeError("unrecognized column type")
522 def __call__(self, header, value):
523 header = header.split(':')
524 column_type = 'string'
526 if header[1] == 'skip':
529 column_type = header[1]
530 return header[0], self[column_type](value)
532 typed_column_parser = TypedColumnParser()
534 class Document(object):
535 """Helper class for registering documents
538 lysis_uuid = 'f0cc5a7f-96a5-4970-9f46-317cc8e2d6a4'
539 lysis = Document(url_to_pdf, 'extraction protocol', 'Lysis Protocol')
540 lysis.create_if_needed(server, lysis_uuid)
542 award = 'U54HG006998'
543 lab = '/labs/barbara-wold'
545 def __init__(self, url, document_type, description, aliases=None):
547 self.filename = os.path.basename(url)
548 self.document_type = document_type
549 self.description = description
554 if isinstance(aliases, list):
555 self.aliases = aliases
557 raise ValueError("Aliases needs to be a list")
558 self.content_type = None
566 def get_document(self):
567 if os.path.exists(self.url):
568 with open(self.url, 'rb') as instream:
569 assert self.url.endswith('pdf')
570 self.content_type = 'application/pdf'
571 self.document = instream.read()
572 self.md5sum = hashlib.md5(self.document)
574 req = requests.get(self.url)
575 if req.status_code == 200:
576 self.content_type = req.headers['content-type']
577 self.document = req.content
578 self.md5sum = hashlib.md5(self.document)
579 self.urls = [self.url]
581 def create_payload(self):
584 'download': self.filename,
585 'type': self.content_type,
586 'href': 'data:'+self.content_type+';base64,' + base64.b64encode(self.document).decode('ascii'),
587 'md5sum': self.md5sum.hexdigest()
589 'document_type': self.document_type,
590 'description': self.description,
595 document_payload['aliases'] = self.aliases
597 document_payload['references'] = self.references
599 document_payload['urls'] = self.urls
601 return document_payload
603 def post(self, server):
604 document_payload = self.create_payload()
605 server.validate(document_payload, 'document')
606 return server.post_json('/documents/', document_payload)
608 def save(self, filename):
609 payload = self.create_payload()
610 with open(filename, 'w') as outstream:
611 outstream.write(pformat(payload))
613 def create_if_needed(self, server, uuid):
616 return self.post(server)
618 return server.get_json(uuid, embed=False)
620 if __name__ == '__main__':
622 from htsworkflow.util.rdfhelp import get_model, dump_model
623 from htsworkflow.util.rdfjsonld import load_into_model
624 from pprint import pprint
626 logging.basicConfig(level=logging.DEBUG)
627 encoded = ENCODED('test.encodedcc.org')
629 body = encoded.get_jsonld('/experiments/ENCSR000AEC/')
631 load_into_model(model, body)