# htsworkflow/submission/daf.py
"""Parse UCSC DAF File

This includes some utilities to help convert the DAF dictionary to a
librdf model.
"""
import logging
import re
import string
import types

try:
    from StringIO import StringIO
except ImportError:
    # Python 3 has no StringIO module; io.StringIO is the replacement.
    from io import StringIO

logger = logging.getLogger(__name__)

# STATES
DAF_HEADER = 1
DAF_VIEW = 2

# Everything from '#' to the end of the line is treated as a comment.
_COMMENT_RE = re.compile(r"#.*$")


def parse(filename):
    """Parse the DAF file stored at ``filename``.

    Returns the attribute dictionary produced by ``parse_stream``.
    """
    # The context manager closes the file even when parsing raises.
    with open(filename, 'r') as stream:
        return parse_stream(stream)


def fromstring(daf_string):
    """Parse DAF content held in the string ``daf_string``.

    Returns the attribute dictionary produced by ``parse_stream``.
    """
    return parse_stream(StringIO(daf_string))


def parse_stream(stream):
    """Parse an iterable of DAF lines into a dictionary.

    Each line is split into a name (text up to the first whitespace
    character) and a value (the rest of the line with surrounding
    whitespace removed).  Values equal to 'yes' or 'no', ignoring case,
    become True or False.

    Returns a dictionary of the header attributes.  The 'variables'
    entry is split on commas into a list, and 'views' maps each view
    name to a dictionary of that view's attributes (including 'view').
    """
    state = DAF_HEADER
    attributes = {'views': {}}
    view_name = None
    view_attributes = {}
    for line in stream:
        # remove comments
        line = _COMMENT_RE.sub("", line)
        nstop = _extract_name_index(line)
        name = line[0:nstop]
        sstop = _consume_whitespace(line, start=nstop)
        vstop = _extract_value_index(line, start=sstop)
        value = line[sstop:vstop]

        lowered = value.lower()
        if lowered == 'yes':
            value = True
        elif lowered == 'no':
            value = False

        if not name:
            # A line with no name (blank, comment-only, or starting with
            # whitespace) closes the view currently being collected.
            if view_name is not None:
                attributes['views'][view_name] = view_attributes
                view_name = None
                view_attributes = {}
            state = DAF_HEADER
        elif state == DAF_HEADER and name == 'variables':
            attributes[name] = [x.strip() for x in value.split(',')]
        elif state == DAF_HEADER and name == 'view':
            view_name = value
            view_attributes['view'] = value
            state = DAF_VIEW
        elif state == DAF_HEADER:
            attributes[name] = value
        elif state == DAF_VIEW:
            view_attributes[name] = value

    # save last block
    if view_name is not None:
        attributes['views'][view_name] = view_attributes

    return attributes


def _consume_whitespace(line, start=0):
    """Return the index of the first non-whitespace character at or
    after ``start``, or ``len(line)`` when there is none."""
    remainder = line[start:].lstrip(string.whitespace)
    return len(line) - len(remainder)


def _extract_name_index(line, start=0):
    """Return the index of the first whitespace character at or after
    ``start``, or ``len(line)`` when there is none."""
    for index in range(start, len(line)):
        if line[index] in string.whitespace:
            return index

    return len(line)


def _extract_value_index(line, start=0):
    """Return the length of ``line`` without its trailing whitespace.

    ``start`` is accepted for symmetry with the other index helpers but
    is not used.
    """
    return len(line.rstrip())


try:
    import RDF
    # rdfhelp imports RDF when it is loaded, so it has to be guarded as
    # well; otherwise a missing librdf breaks the import of this module
    # before the fallbacks below can be defined.
    from htsworkflow.util.rdfhelp import blankOrUri, toTypedNode
except ImportError:
    def convert_to_rdf_statements(attributes, source=None):
        """Placeholder used when the RDF helpers cannot be imported."""
        raise NotImplementedError("librdf not installed")

    def add_to_model(model, attributes, source=None):
        """Placeholder used when the RDF helpers cannot be imported."""
        raise NotImplementedError("librdf not installed")
else:
    def convert_to_rdf_statements(attributes, source=None):
        """Convert a parsed DAF dictionary into a list of RDF statements.

        ``source`` is passed to ``blankOrUri`` to obtain the subject
        node.  Every view becomes a fresh node linked from the subject,
        and that view's attributes are converted recursively with the
        new node as their subject.
        """
        ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")

        subject = blankOrUri(source)

        statements = []
        for name in attributes:
            predicate = ddfNS[name]
            if name == 'views':
                for view_name in attributes.get('views', []):
                    view = attributes['views'][view_name]
                    viewNode = RDF.Node()
                    statements.append(
                        RDF.Statement(subject, predicate, viewNode))
                    statements.extend(
                        convert_to_rdf_statements(view, viewNode))
            elif name == 'variables':
                # one statement per variable name
                for var in attributes.get('variables', []):
                    obj = toTypedNode(var)
                    statements.append(
                        RDF.Statement(subject, predicate, obj))
            else:
                obj = toTypedNode(attributes[name])
                statements.append(RDF.Statement(subject, predicate, obj))

        return statements

    def add_to_model(model, attributes, source=None):
        """Add every statement generated from ``attributes`` to ``model``."""
        for statement in convert_to_rdf_statements(attributes, source):
            model.add_statement(statement)
# htsworkflow/submission/test/test_daf.py
"""Tests for the DAF parser in htsworkflow.submission.daf."""
import unittest

from htsworkflow.submission import daf

test_daf = """# Lab and general info
grant Hardison
lab Caltech-m
dataType ChipSeq
variables cell, antibody,sex,age,strain,control
compositeSuffix CaltechHistone
assembly mm9
dafVersion 2.0
validationSettings validateFiles.bam:mismatches=2,bamPercent=99.9;validateFiles.fastq:quick=1000

# Track/view definition
view Peaks
longLabelPrefix Caltech Histone Peaks
type narrowPeak
hasReplicates yes
required no

view Signal
longLabelPrefix Caltech Histone Signal
type bigWig
hasReplicates yes
required no
"""


class TestDAF(unittest.TestCase):
    """Exercise parsing of the sample DAF text and its RDF conversion."""

    def test_parse(self):
        """Header values, the variable list and both views are parsed."""
        parsed = daf.fromstring(test_daf)

        self.assertEqual(parsed['assembly'], 'mm9')
        self.assertEqual(parsed['grant'], 'Hardison')
        self.assertEqual(len(parsed['variables']), 6)
        self.assertEqual(len(parsed['views']), 2)
        self.assertEqual(len(parsed['views']['Peaks']), 5)
        self.assertEqual(len(parsed['views']['Signal']), 5)
        signal = parsed['views']['Signal']
        self.assertEqual(signal['required'], False)
        self.assertEqual(signal['longLabelPrefix'],
                         'Caltech Histone Signal')

    def test_rdf(self):
        """Load the parsed DAF into an in-memory model and print it as
        turtle; does nothing beyond a notice when RDF is unavailable."""
        # Only the optional import is guarded, so an ImportError raised
        # by the code under test is not mistaken for a missing librdf.
        try:
            import RDF
        except ImportError:
            print("Skipped test_rdf")
            return

        parsed = daf.fromstring(test_daf)
        # mem = RDF.Storage(storage_name='hashes',
        #                   options_string='hash-type="memory"'),
        mem = RDF.MemoryStorage()
        model = RDF.Model(mem)

        daf.add_to_model(model, parsed)

        writer = RDF.Serializer(name='turtle')
        print(writer.serialize_model_to_string(model))


def suite():
    """Return a suite holding every test method of TestDAF."""
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestDAF)


if __name__ == "__main__":
    unittest.main(defaultTest='suite')
# ---- htsworkflow/util/rdfhelp.py ----
"""Helper features for working with librdf
"""
import RDF
import types

xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

try:
    _STRING_TYPES = types.StringTypes  # Python 2: (str, unicode)
    _TEXT_TYPE = unicode
except (AttributeError, NameError):
    # Python 3 has neither types.StringTypes nor unicode.
    _STRING_TYPES = (str,)
    _TEXT_TYPE = str


def blankOrUri(value=None):
    """Return an RDF node for ``value``.

    None gives a node built with no arguments, a string gives a node
    built from that string as ``uri_string``, and an existing
    ``RDF.Node`` is returned unchanged.  Any other type yields None.
    """
    if value is None:
        return RDF.Node()
    if isinstance(value, _STRING_TYPES):
        return RDF.Node(uri_string=value)
    if isinstance(value, RDF.Node):
        return value
    return None


def toTypedNode(value):
    """Return a literal RDF node for ``value``.

    Booleans become u'1' or u'0' typed as xsd:boolean, strings are typed
    as xsd:string, and anything else is converted to text and stored
    without a datatype.
    """
    # bool is tested first so True/False never reach the generic branch.
    if isinstance(value, bool):
        value_type = xsdNS['boolean'].uri
        value = u'1' if value else u'0'
    elif isinstance(value, _STRING_TYPES):
        value_type = xsdNS['string'].uri
    else:
        value_type = None
        value = _TEXT_TYPE(value)

    return RDF.Node(literal=value, datatype=value_type)


# ---- htsworkflow/util/test/test_rdfhelp.py ----
import unittest

from htsworkflow.util.rdfhelp import toTypedNode, blankOrUri


class TestRDFHelp(unittest.TestCase):
    """Check the node helpers in htsworkflow.util.rdfhelp."""

    def test_typed_node_boolean(self):
        node = toTypedNode(True)
        self.assertEqual(node.literal_value['string'], u'1')
        self.assertEqual(str(node.literal_value['datatype']),
                         'http://www.w3.org/2001/XMLSchema#boolean')

    def test_typed_node_string(self):
        node = toTypedNode('hello')
        self.assertEqual(node.literal_value['string'], u'hello')
        self.assertEqual(str(node.literal_value['datatype']),
                         'http://www.w3.org/2001/XMLSchema#string')

    def test_blank_or_uri_blank(self):
        node = blankOrUri()
        self.assertEqual(node.is_blank(), True)

    def test_blank_or_uri_url(self):
        s = 'http://google.com'
        node = blankOrUri(s)
        self.assertEqual(node.is_resource(), True)
        self.assertEqual(str(node.uri), s)


def suite():
    """Return a suite holding every test method of TestRDFHelp."""
    # The original named a non-existent class (testRdfHelp) here.
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestRDFHelp)


if __name__ == "__main__":
    unittest.main(defaultTest='suite')