"""ElementTree helper functions."""
import logging
import os

import lxml.etree

LOGGER = logging.getLogger(__name__)

# Bind the name up front: if loading the DTD fails below, validate_xhtml()
# must still be able to test ``XHTML_RDF_DTD is None`` and report that
# validation is unavailable, rather than dying with a NameError.
XHTML_RDF_DTD = None
try:
    XHTML_RDF_DTD = lxml.etree.DTD(external_id='-//W3C//DTD XHTML+RDFa 1.0//EN')
except lxml.etree.DTDParseError as e:
    LOGGER.warning("Unable to load XHTML DTD %s", e)

# NOTE(review): indent() and flatten() are only partially visible in the
# reviewed diff (their bodies are cut by hunk boundaries) and are
# intentionally not reproduced here.


def validate_xhtml(html, base_url='http://localhost'):
    """Validate an XHTML document against the XHTML+RDFa 1.0 DTD.

    Helper mostly intended for test code.  Messages reported by
    lxml.etree are forwarded to python logging.

    :param html: the document to parse, passed to lxml.etree.fromstring
    :param base_url: base URL handed to the parser
    :returns: None if the DTD could not be loaded (validation is not
        configured), True if the document passes validation, and False
        if it cannot be parsed or fails validation.

    DTD_UNKNOWN_ATTRIBUTE errors are deliberately ignored, so a document
    whose only errors are of that kind is still reported as valid.
    """
    if XHTML_RDF_DTD is None:
        return None

    try:
        root = lxml.etree.fromstring(html, base_url=base_url)
    except lxml.etree.ParseError as e:
        LOGGER.warning("Unable to parse document: %s", e)
        return False

    if XHTML_RDF_DTD.validate(root):
        # A clean pass is unlikely in practice (see the filter below).
        return True

    isgood = True
    for msg in XHTML_RDF_DTD.error_log.filter_from_errors():
        # There is no known way to suppress this error, but the xmlns
        # attributes are needed for the RDFa 1.0 encoding, so skip it.
        if 'ERROR:VALID:DTD_UNKNOWN_ATTRIBUTE' in str(msg):
            continue
        LOGGER.error(msg)
        isgood = False

    return isgood