SET_TARGET_PROPERTIES(mussa_core PROPERTIES COMPILE_FLAGS "-fPIC")
ADD_SUBDIRECTORY( test )
+ADD_SUBDIRECTORY( tsf )
--- /dev/null
+INCLUDE(FindBoost)
+
+SET(SOURCES trivial_filter.cpp)
+SET(TEST_SOURCES test_tsf.cpp)
+
+ADD_EXECUTABLE(tsf ${SOURCES} ${TEST_SOURCES} )
+TARGET_LINK_LIBRARIES(tsf ${BOOST_UNIT_TEST_LIBRARY})
+ADD_TEST(tsf ${CMAKE_BINARY_DIR}/tsf)
--- /dev/null
+#define BOOST_AUTO_TEST_MAIN
+#include <boost/test/auto_unit_test.hpp>
+
+#include "trivial_filter.hpp"
+
+BOOST_AUTO_TEST_CASE( test_nt_map )
+{
+ BOOST_CHECK_EQUAL( nt_map('A'), A );
+ BOOST_CHECK_EQUAL( nt_map('a'), A );
+ BOOST_CHECK_EQUAL( nt_map('T'), T );
+ BOOST_CHECK_EQUAL( nt_map('t'), T );
+ BOOST_CHECK_EQUAL( nt_map('G'), G );
+ BOOST_CHECK_EQUAL( nt_map('g'), G );
+ BOOST_CHECK_EQUAL( nt_map('C'), C );
+ BOOST_CHECK_EQUAL( nt_map('c'), C );
+ BOOST_CHECK_EQUAL( nt_map('U'), U );
+ BOOST_CHECK_EQUAL( nt_map('u'), U );
+ BOOST_CHECK_EQUAL( nt_map('N'), N );
+ BOOST_CHECK_EQUAL( nt_map('n'), N );
+ BOOST_CHECK_EQUAL( nt_map('X'), X );
+ BOOST_CHECK_EQUAL( nt_map('x'), X );
+}
+
+BOOST_AUTO_TEST_CASE( test_generate_dna_pattern_1mer )
+{
+ TSFLookupTable table(1,4);
+ BOOST_CHECK( table.states() > 0);
+
+ const int failed_state = TSFLookupTable::failed_state;
+ const int accept_state = TSFLookupTable::accept_state;
+
+
+ int a_state = table.next(TSFLookupTable::start_state, A);
+ int t_state = table.next(TSFLookupTable::start_state, T);
+ int g_state = table.next(TSFLookupTable::start_state, G);
+ int c_state = table.next(TSFLookupTable::start_state, C);
+
+ BOOST_CHECK(a_state != t_state);
+ BOOST_CHECK(a_state != g_state);
+ BOOST_CHECK(a_state != c_state);
+ BOOST_CHECK(t_state != g_state);
+ BOOST_CHECK(t_state != c_state);
+ BOOST_CHECK(g_state != c_state);
+
+ BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, A), a_state );
+ BOOST_CHECK_EQUAL( table.next(a_state, A), a_state+1 );
+ BOOST_CHECK_EQUAL( table.next(a_state, G), failed_state);
+ BOOST_CHECK_EQUAL( table.next(a_state+1, A), a_state+2 );
+ BOOST_CHECK_EQUAL( table.next(a_state+2, A), a_state+3 );
+ BOOST_CHECK_EQUAL( table.next(a_state+3, A), a_state+3 );
+ BOOST_CHECK_EQUAL( table.next(a_state+3, G), accept_state);
+
+ BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, G), g_state );
+ BOOST_CHECK_EQUAL( table.next(g_state, G), g_state+1 );
+ BOOST_CHECK_EQUAL( table.next(g_state, T), failed_state);
+ BOOST_CHECK_EQUAL( table.next(g_state+1, G), g_state+2 );
+ BOOST_CHECK_EQUAL( table.next(g_state+2, G), g_state+3 );
+ BOOST_CHECK_EQUAL( table.next(g_state+3, G), g_state+3 );
+ BOOST_CHECK_EQUAL( table.next(g_state+3, C), accept_state);
+}
--- /dev/null
+#include "trivial_filter.hpp"
+
+#include <boost/assign.hpp>
+using namespace boost::assign;
+
+#include <iostream>
+
+TSFLookupTable::TSFLookupTable(int nmer, int length)
+{
+ generate_dna_pattern(nmer, length);
+}
+
+int TSFLookupTable::next(int state, enum nucleotide symbol)
+{
+#if 0
+ // dump what row we're looking at
+ std::cout << "[" << state << "]: ";
+ for(int i=0; i != table[state].size(); ++i) {
+ std::cout << table[state][i] << " ";
+ }
+ std::cout << "lookup(" << symbol << ")" << std::endl;
+#endif
+ return table[state][symbol];
+}
+
+void TSFLookupTable::generate_dna_pattern(int nmer, int length)
+{
+ // currently we can only handle single nucleotides
+ assert (nmer == 1);
+
+ symbol_vec symbols;
+ symbols += A, T, G, C;
+
+ // add the start state
+ std::vector<int> transitions(8, failed_state);
+ table.push_back(transitions);
+
+ for (symbol_vec::iterator pattern_itor = symbols.begin();
+ pattern_itor != symbols.end();
+ ++pattern_itor)
+ {
+ // update the start state to point at the state we're adding
+ table[0][*pattern_itor] = table.size();
+
+ // add the counting states
+ for(int i = 0; i != length-nmer; ++ i)
+ {
+ std::vector<int> transitions(8, failed_state);
+ transitions[*pattern_itor] = table.size()+1; //point to next state
+ table.push_back(transitions);
+ }
+
+ // add the accept state
+ std::vector<int> transitions(8, accept_state);
+ transitions[*pattern_itor] = table.size();
+ table.push_back(transitions);
+ }
+}
--- /dev/null
+#ifndef _TRIVIAL_FILTER_HPP_
+#define _TRIVIAL_FILTER_HPP_
+
+#include <vector>
+
+//! symbolic constants used to inidicate numeric values for nucleotides
+/*! the ordering comes from the very simple bit-wise operator
+ * used to convert ascii values to the range 0..7
+ */
+enum nucleotide { X, A, C=3, T, U, N, G };
+
+//! convert a nucleotide ascii encoding to value in the range 0..7
+inline enum nucleotide nt_map(char c)
+{
+ return static_cast<nucleotide>(c & 0x07);
+}
+
+//! generates our transition table
+class TSFLookupTable {
+ public:
+ //! create transition table using DNA symbols
+ /*! generate all the dna patterns of n base pairs and
+ * require length copies of the pattern.
+ *
+ * e.g. n=2, length=2 would match things like
+ * ATAT *(AT),
+ */
+
+ TSFLookupTable(int n, int length);
+
+ //! return the next state
+ int next(int current, enum nucleotide symbol);
+
+ //! how many states are there in our table?
+ int states() { return table.size(); }
+
+ static const int start_state = 0;
+ static const int accept_state = -1;
+ static const int failed_state = -2;
+
+ private:
+ typedef std::vector<enum nucleotide> symbol_vec;
+
+ //! generate length copies of a pattern n bases long
+ void generate_dna_pattern(int n, int length);
+
+ std::vector<std::vector<int> > table;
+
+};
+
+class TrivialSequenceFilter {
+ public:
+ private:
+ TSFLookupTable table;
+};
+
+
+#endif // _TRIVIAL_FILTER_HPP_