start the trivial sequence filter
authorDiane Trout <diane@caltech.edu>
Thu, 13 Jul 2006 01:28:34 +0000 (01:28 +0000)
committerDiane Trout <diane@caltech.edu>
Thu, 13 Jul 2006 01:28:34 +0000 (01:28 +0000)
repeatmasker is annoying because it is slow as a webservice and
depends on a bunch of hard to access components when run locally.

So the idea was how much of an improvment to mussa would there be if
we caught just the absolute simplest repeats?

alg/CMakeLists.txt
alg/tsf/CMakeLists.txt [new file with mode: 0644]
alg/tsf/test_tsf.cpp [new file with mode: 0644]
alg/tsf/trivial_filter.cpp [new file with mode: 0644]
alg/tsf/trivial_filter.hpp [new file with mode: 0644]

index 5eeb55513d800591cdc2d55cb0537b16e49096af..a04c0e2ace3a2b7924749323baa4726e78084239 100644 (file)
@@ -39,3 +39,4 @@ SET_SOURCE_FILES_PROPERTIES(${SOURCES} ${MOC_SOURCES}
 SET_TARGET_PROPERTIES(mussa_core PROPERTIES COMPILE_FLAGS "-fPIC")
 
 ADD_SUBDIRECTORY( test )
+ADD_SUBDIRECTORY( tsf )
diff --git a/alg/tsf/CMakeLists.txt b/alg/tsf/CMakeLists.txt
new file mode 100644 (file)
index 0000000..cfb8fd1
--- /dev/null
@@ -0,0 +1,8 @@
+INCLUDE(FindBoost)
+
+SET(SOURCES trivial_filter.cpp)
+SET(TEST_SOURCES test_tsf.cpp)
+
+ADD_EXECUTABLE(tsf ${SOURCES} ${TEST_SOURCES} )
+TARGET_LINK_LIBRARIES(tsf ${BOOST_UNIT_TEST_LIBRARY})
+ADD_TEST(tsf ${CMAKE_BINARY_DIR}/tsf)
diff --git a/alg/tsf/test_tsf.cpp b/alg/tsf/test_tsf.cpp
new file mode 100644 (file)
index 0000000..1e2efe2
--- /dev/null
@@ -0,0 +1,60 @@
+#define BOOST_AUTO_TEST_MAIN
+#include <boost/test/auto_unit_test.hpp>
+
+#include "trivial_filter.hpp"
+
+BOOST_AUTO_TEST_CASE( test_nt_map )
+{
+  BOOST_CHECK_EQUAL( nt_map('A'), A );
+  BOOST_CHECK_EQUAL( nt_map('a'), A );
+  BOOST_CHECK_EQUAL( nt_map('T'), T );
+  BOOST_CHECK_EQUAL( nt_map('t'), T );
+  BOOST_CHECK_EQUAL( nt_map('G'), G );
+  BOOST_CHECK_EQUAL( nt_map('g'), G );
+  BOOST_CHECK_EQUAL( nt_map('C'), C );
+  BOOST_CHECK_EQUAL( nt_map('c'), C );
+  BOOST_CHECK_EQUAL( nt_map('U'), U );
+  BOOST_CHECK_EQUAL( nt_map('u'), U );
+  BOOST_CHECK_EQUAL( nt_map('N'), N );
+  BOOST_CHECK_EQUAL( nt_map('n'), N );
+  BOOST_CHECK_EQUAL( nt_map('X'), X );
+  BOOST_CHECK_EQUAL( nt_map('x'), X );
+}
+
+BOOST_AUTO_TEST_CASE( test_generate_dna_pattern_1mer )
+{
+  TSFLookupTable table(1,4);
+  BOOST_CHECK( table.states() > 0);
+
+  const int failed_state =  TSFLookupTable::failed_state;
+  const int accept_state = TSFLookupTable::accept_state;
+  
+
+  int a_state = table.next(TSFLookupTable::start_state, A);
+  int t_state = table.next(TSFLookupTable::start_state, T);
+  int g_state = table.next(TSFLookupTable::start_state, G);
+  int c_state = table.next(TSFLookupTable::start_state, C);
+
+  BOOST_CHECK(a_state != t_state);
+  BOOST_CHECK(a_state != g_state);
+  BOOST_CHECK(a_state != c_state);
+  BOOST_CHECK(t_state != g_state);
+  BOOST_CHECK(t_state != c_state);
+  BOOST_CHECK(g_state != c_state);
+
+  BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, A), a_state );
+  BOOST_CHECK_EQUAL( table.next(a_state, A), a_state+1 );
+  BOOST_CHECK_EQUAL( table.next(a_state, G), failed_state);
+  BOOST_CHECK_EQUAL( table.next(a_state+1, A), a_state+2 );
+  BOOST_CHECK_EQUAL( table.next(a_state+2, A), a_state+3 );
+  BOOST_CHECK_EQUAL( table.next(a_state+3, A), a_state+3 );
+  BOOST_CHECK_EQUAL( table.next(a_state+3, G), accept_state);
+
+  BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, G), g_state );
+  BOOST_CHECK_EQUAL( table.next(g_state, G), g_state+1 );
+  BOOST_CHECK_EQUAL( table.next(g_state, T), failed_state);
+  BOOST_CHECK_EQUAL( table.next(g_state+1, G), g_state+2 );
+  BOOST_CHECK_EQUAL( table.next(g_state+2, G), g_state+3 );
+  BOOST_CHECK_EQUAL( table.next(g_state+3, G), g_state+3 );
+  BOOST_CHECK_EQUAL( table.next(g_state+3, C), accept_state);
+}
diff --git a/alg/tsf/trivial_filter.cpp b/alg/tsf/trivial_filter.cpp
new file mode 100644 (file)
index 0000000..9689261
--- /dev/null
@@ -0,0 +1,58 @@
+#include "trivial_filter.hpp"
+
+#include <boost/assign.hpp>
+using namespace boost::assign;
+
+#include <iostream>
+
+TSFLookupTable::TSFLookupTable(int nmer, int length)
+{
+  generate_dna_pattern(nmer, length);
+}
+
+int TSFLookupTable::next(int state, enum nucleotide symbol)
+{
+#if 0
+  // dump what row we're looking at
+  std::cout << "[" << state << "]: ";
+  for(int i=0; i != table[state].size(); ++i) {
+    std::cout << table[state][i] << " ";
+  }
+  std::cout << "lookup(" << symbol << ")" << std::endl;
+#endif
+  return table[state][symbol];
+}
+
+void TSFLookupTable::generate_dna_pattern(int nmer, int length)
+{
+  // currently we can only handle single nucleotides
+  assert (nmer == 1);
+  
+  symbol_vec symbols;
+  symbols += A, T, G, C;
+
+  // add the start state
+  std::vector<int> transitions(8, failed_state);
+  table.push_back(transitions);
+
+  for (symbol_vec::iterator pattern_itor = symbols.begin();
+       pattern_itor != symbols.end();
+       ++pattern_itor)
+  {
+    // update the start state to point at the state we're adding
+    table[0][*pattern_itor] = table.size();
+    
+    // add the counting states
+    for(int i = 0; i != length-nmer; ++ i)
+    {
+      std::vector<int> transitions(8, failed_state);
+      transitions[*pattern_itor] = table.size()+1; //point to next state
+      table.push_back(transitions);
+    }
+
+    // add the accept state
+    std::vector<int> transitions(8, accept_state);
+    transitions[*pattern_itor] = table.size();
+    table.push_back(transitions);
+  }
+}
diff --git a/alg/tsf/trivial_filter.hpp b/alg/tsf/trivial_filter.hpp
new file mode 100644 (file)
index 0000000..2ad5758
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef _TRIVIAL_FILTER_HPP_
+#define _TRIVIAL_FILTER_HPP_
+
+#include <vector>
+
+//! symbolic constants used to inidicate numeric values for nucleotides
+/*! the ordering comes from the very simple bit-wise operator
+ *  used to convert ascii values to the range 0..7
+ */
+enum nucleotide { X, A, C=3, T, U, N, G };
+
+//! convert a nucleotide ascii encoding to value in the range 0..7
+inline enum nucleotide nt_map(char c) 
+{
+  return static_cast<nucleotide>(c & 0x07);
+}
+
+//! generates our transition table
+class TSFLookupTable {
+  public:
+    //! create transition table using DNA symbols
+    /*! generate all the dna patterns of n base pairs and 
+     * require length copies of the pattern.
+     *
+     * e.g. n=2, length=2 would match things like
+     * ATAT *(AT), 
+     */
+
+    TSFLookupTable(int n, int length);
+
+    //! return the next state
+    int next(int current, enum nucleotide symbol);
+
+    //! how many states are there in our table?
+    int states() { return table.size(); }
+   
+    static const int start_state = 0;
+    static const int accept_state = -1;
+    static const int failed_state = -2;
+
+  private:
+    typedef std::vector<enum nucleotide> symbol_vec;
+
+    //! generate length copies of a pattern n bases long
+    void generate_dna_pattern(int n, int length);
+
+    std::vector<std::vector<int> > table;
+
+};
+
+class TrivialSequenceFilter {
+  public:
+  private:
+    TSFLookupTable table;
+};
+
+
+#endif // _TRIVIAL_FILTER_HPP_