Add module to parse UCSC DAF file.
author Diane Trout <diane@caltech.edu>
Wed, 15 Jun 2011 02:31:45 +0000 (19:31 -0700)
committer Diane Trout <diane@caltech.edu>
Wed, 15 Jun 2011 02:31:45 +0000 (19:31 -0700)
This includes some utilities to help convert the DAF dictionary
to a librdf model.

htsworkflow/submission/daf.py [new file with mode: 0644]
htsworkflow/submission/test/test_daf.py [new file with mode: 0644]
htsworkflow/util/rdfhelp.py [new file with mode: 0644]
htsworkflow/util/test/test_rdfhelp.py [new file with mode: 0644]

diff --git a/htsworkflow/submission/daf.py b/htsworkflow/submission/daf.py
new file mode 100644 (file)
index 0000000..cdc9312
--- /dev/null
@@ -0,0 +1,129 @@
+"""Parse UCSC DAF File
+"""
+import logging
+import re
+import string
+from StringIO import StringIO
+import types
+
+from htsworkflow.util.rdfhelp import blankOrUri, toTypedNode
+
+logger = logging.getLogger(__name__)
+
+# Parser states: DAF_HEADER while reading top-level attributes,
+# DAF_VIEW while reading the attributes that follow a "view" line.
+DAF_HEADER = 1
+DAF_VIEW = 2
+
+
+def parse(filename):
+    stream = open(filename,'r')
+    attributes =  parse_stream(stream)
+    stream.close()
+    return stream
+
+def fromstring(daf_string):
+    stream = StringIO(daf_string)
+    return parse_stream(stream)
+
+def parse_stream(stream):
+    comment_re = re.compile("#.*$")
+
+    state = DAF_HEADER
+    attributes = {'views': {}}
+    view_name = None
+    view_attributes = {}
+    for line in stream:
+        #remove comments
+        line = comment_re.sub("", line)
+        nstop = _extract_name_index(line)
+        name = line[0:nstop]
+        sstop = _consume_whitespace(line, start=nstop)
+        vstop = _extract_value_index(line, start=sstop)
+        value = line[sstop:vstop]
+
+        if value.lower() in ('yes',):
+            value = True
+        elif value.lower() in ('no',):
+            value = False
+            
+        if len(name) == 0:
+            if view_name is not None:
+                attributes['views'][view_name] = view_attributes
+                view_name = None
+                view_attributes = {}
+            state = DAF_HEADER
+        elif state == DAF_HEADER and name == 'variables':
+            attributes[name] = [ x.strip() for x in value.split(',')]
+        elif state == DAF_HEADER and name == 'view':
+            view_name = value
+            view_attributes['view'] = value
+            state = DAF_VIEW
+        elif state == DAF_HEADER:
+            attributes[name] = value
+        elif state == DAF_VIEW:
+            view_attributes[name] = value
+
+    # save last block
+    if view_name is not None:
+        attributes['views'][view_name] = view_attributes
+        
+    return attributes
+
+def _consume_whitespace(line, start=0):
+    for i in xrange(start, len(line)):
+        if line[i] not in string.whitespace:
+            return i
+        
+    return len(line)
+
+def _extract_name_index(line, start=0):
+    for i in xrange(start, len(line)):
+        if line[i] in string.whitespace:
+            return i
+        
+    return len(line)
+
+def _extract_value_index(line, start=0):
+    shortline = line.rstrip()
+    return len(shortline)
+
+try:
+    import RDF
+    def convert_to_rdf_statements(attributes, source=None):
+        ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
+    
+        subject = blankOrUri(source)
+        
+        statements = []
+        for name in attributes:
+            predicate = ddfNS[name]
+            if name == 'views':
+                predicate = ddfNS['views']
+                for view_name in attributes.get('views', []):
+                    view = attributes['views'][view_name]
+                    viewNode = RDF.Node()
+                    statements.append(RDF.Statement(subject, predicate, viewNode))
+                    statements.extend(convert_to_rdf_statements(view, viewNode))
+            elif name == 'variables':
+                predicate = ddfNS['variables']
+                for var in attributes.get('variables', []):
+                    obj = toTypedNode(var)
+                    statements.append(RDF.Statement(subject, predicate, obj))
+            else:
+                value = attributes[name]
+                obj = toTypedNode(value)
+                statements.append(RDF.Statement(subject,predicate,obj))
+    
+        return statements
+    
+    
+    def add_to_model(model, attributes, source=None):
+        for statement in convert_to_rdf_statements(attributes, source):
+            model.add_statement(statement)
+            
+except ImportError, e:
+    # NOTE(review): this module imports htsworkflow.util.rdfhelp at the top,
+    # and rdfhelp itself imports RDF unconditionally, so without librdf the
+    # import of this module appears to fail before this fallback is reached
+    # -- confirm.
+    def convert_to_rdf_statements(attributes, source=None):
+        """Fallback used when the RDF module cannot be imported; always raises."""
+        raise NotImplementedError("librdf not installed")
+    def add_to_model(model, attributes, source=None):
+        """Fallback used when the RDF module cannot be imported; always raises."""
+        raise NotImplementedError("librdf not installed")
+
diff --git a/htsworkflow/submission/test/test_daf.py b/htsworkflow/submission/test/test_daf.py
new file mode 100644 (file)
index 0000000..3749446
--- /dev/null
@@ -0,0 +1,67 @@
+import unittest
+
+from htsworkflow.submission import daf
+
+# Sample DAF text: a header block followed by two view blocks (Peaks, Signal).
+test_daf = """# Lab and general info
+grant             Hardison
+lab               Caltech-m
+dataType          ChipSeq 
+variables         cell, antibody,sex,age,strain,control
+compositeSuffix   CaltechHistone
+assembly          mm9
+dafVersion        2.0
+validationSettings validateFiles.bam:mismatches=2,bamPercent=99.9;validateFiles.fastq:quick=1000
+
+# Track/view definition
+view             Peaks
+longLabelPrefix  Caltech Histone Peaks
+type             narrowPeak
+hasReplicates    yes
+required         no
+
+view             Signal
+longLabelPrefix  Caltech Histone Signal
+type             bigWig
+hasReplicates    yes
+required         no
+"""
+
+class TestDAF(unittest.TestCase):
+    def test_parse(self):
+
+        parsed = daf.fromstring(test_daf)
+        
+        self.failUnlessEqual(parsed['assembly'], 'mm9')
+        self.failUnlessEqual(parsed['grant'], 'Hardison')
+        self.failUnlessEqual(len(parsed['variables']), 6)
+        self.failUnlessEqual(len(parsed['views']), 2)
+        self.failUnlessEqual(len(parsed['views']['Peaks']), 5)
+        self.failUnlessEqual(len(parsed['views']['Signal']), 5)
+        signal = parsed['views']['Signal']
+        self.failUnlessEqual(signal['required'], False)
+        self.failUnlessEqual(signal['longLabelPrefix'],
+                             'Caltech Histone Signal')
+
+    def test_rdf(self):
+        try:
+            import RDF
+
+            parsed = daf.fromstring(test_daf)
+            #mem = RDF.Storage(storage_name='hashes',
+            #                  options_string='hash-type="memory"'),
+            mem = RDF.MemoryStorage()
+            model = RDF.Model(mem)
+            
+            daf.add_to_model(model, parsed)
+
+            writer = RDF.Serializer(name='turtle')
+            print writer.serialize_model_to_string(model)
+            
+        except ImportError, e:
+            print "Skipped test_rdf"
+
+def suite():
+    return unittest.makeSuite(TestDAF, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
diff --git a/htsworkflow/util/rdfhelp.py b/htsworkflow/util/rdfhelp.py
new file mode 100644 (file)
index 0000000..3a27c8b
--- /dev/null
@@ -0,0 +1,34 @@
+"""Helper features for working with librdf
+"""
+import RDF
+import types
+
+xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
+
+def blankOrUri(value=None):
+    node = None
+    if value is None:
+        node = RDF.Node()
+    elif type(value) in types.StringTypes:
+        node = RDF.Node(uri_string=value)
+    elif isinstance(value, RDF.Node):
+        node = value
+
+    return node
+
+
+def toTypedNode(value):
+    if type(value) == types.BooleanType:
+        value_type = xsdNS['boolean'].uri
+        if value:
+            value = u'1'
+        else:
+            value = u'0'
+    elif type(value) in types.StringTypes:
+        value_type = xsdNS['string'].uri
+    else:
+        value_type = None
+        value = unicode(value)
+
+    return RDF.Node(literal=value, datatype=value_type)
+    
diff --git a/htsworkflow/util/test/test_rdfhelp.py b/htsworkflow/util/test/test_rdfhelp.py
new file mode 100644 (file)
index 0000000..451ab73
--- /dev/null
@@ -0,0 +1,32 @@
+import unittest
+
+from htsworkflow.util.rdfhelp import toTypedNode, blankOrUri
+
+class TestRDFHelp(unittest.TestCase):
+    def test_typed_node_boolean(self):
+        node = toTypedNode(True)
+        self.failUnlessEqual(node.literal_value['string'], u'1')
+        self.failUnlessEqual(str(node.literal_value['datatype']),
+                             'http://www.w3.org/2001/XMLSchema#boolean')
+
+    def test_typed_node_string(self):
+        node = toTypedNode('hello')
+        self.failUnlessEqual(node.literal_value['string'], u'hello')
+        self.failUnlessEqual(str(node.literal_value['datatype']),
+                             'http://www.w3.org/2001/XMLSchema#string')
+
+    def test_blank_or_uri_blank(self):
+        node = blankOrUri()
+        self.failUnlessEqual(node.is_blank(), True)
+
+    def test_blank_or_uri_url(self):
+        s = 'http://google.com'
+        node = blankOrUri(s)
+        self.failUnlessEqual(node.is_resource(), True)
+        self.failUnlessEqual(str(node.uri), s)
+        
+def suite():
+    return unittest.makeSuite(testRdfHelp, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')