From 7f1791e0ff395113cbf1f1b6e837102b14c16316 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Thu, 13 Jul 2006 01:28:34 +0000 Subject: [PATCH] start the trivial sequence filter repeatmasker is annoying because it is slow as a webservice and depends on a bunch of hard to access components when run locally. So the idea was how much of an improvment to mussa would there be if we caught just the absolute simplest repeats? --- alg/CMakeLists.txt | 1 + alg/tsf/CMakeLists.txt | 8 +++++ alg/tsf/test_tsf.cpp | 60 ++++++++++++++++++++++++++++++++++++++ alg/tsf/trivial_filter.cpp | 58 ++++++++++++++++++++++++++++++++++++ alg/tsf/trivial_filter.hpp | 58 ++++++++++++++++++++++++++++++++++++ 5 files changed, 185 insertions(+) create mode 100644 alg/tsf/CMakeLists.txt create mode 100644 alg/tsf/test_tsf.cpp create mode 100644 alg/tsf/trivial_filter.cpp create mode 100644 alg/tsf/trivial_filter.hpp diff --git a/alg/CMakeLists.txt b/alg/CMakeLists.txt index 5eeb555..a04c0e2 100644 --- a/alg/CMakeLists.txt +++ b/alg/CMakeLists.txt @@ -39,3 +39,4 @@ SET_SOURCE_FILES_PROPERTIES(${SOURCES} ${MOC_SOURCES} SET_TARGET_PROPERTIES(mussa_core PROPERTIES COMPILE_FLAGS "-fPIC") ADD_SUBDIRECTORY( test ) +ADD_SUBDIRECTORY( tsf ) diff --git a/alg/tsf/CMakeLists.txt b/alg/tsf/CMakeLists.txt new file mode 100644 index 0000000..cfb8fd1 --- /dev/null +++ b/alg/tsf/CMakeLists.txt @@ -0,0 +1,8 @@ +INCLUDE(FindBoost) + +SET(SOURCES trivial_filter.cpp) +SET(TEST_SOURCES test_tsf.cpp) + +ADD_EXECUTABLE(tsf ${SOURCES} ${TEST_SOURCES} ) +TARGET_LINK_LIBRARIES(tsf ${BOOST_UNIT_TEST_LIBRARY}) +ADD_TEST(tsf ${CMAKE_BINARY_DIR}/tsf) diff --git a/alg/tsf/test_tsf.cpp b/alg/tsf/test_tsf.cpp new file mode 100644 index 0000000..1e2efe2 --- /dev/null +++ b/alg/tsf/test_tsf.cpp @@ -0,0 +1,60 @@ +#define BOOST_AUTO_TEST_MAIN +#include + +#include "trivial_filter.hpp" + +BOOST_AUTO_TEST_CASE( test_nt_map ) +{ + BOOST_CHECK_EQUAL( nt_map('A'), A ); + BOOST_CHECK_EQUAL( nt_map('a'), A ); + BOOST_CHECK_EQUAL( nt_map('T'), T ); + BOOST_CHECK_EQUAL( nt_map('t'), T ); + BOOST_CHECK_EQUAL( nt_map('G'), G ); + BOOST_CHECK_EQUAL( nt_map('g'), G ); + BOOST_CHECK_EQUAL( nt_map('C'), C ); + BOOST_CHECK_EQUAL( nt_map('c'), C ); + BOOST_CHECK_EQUAL( nt_map('U'), U ); + BOOST_CHECK_EQUAL( nt_map('u'), U ); + BOOST_CHECK_EQUAL( nt_map('N'), N ); + BOOST_CHECK_EQUAL( nt_map('n'), N ); + BOOST_CHECK_EQUAL( nt_map('X'), X ); + BOOST_CHECK_EQUAL( nt_map('x'), X ); +} + +BOOST_AUTO_TEST_CASE( test_generate_dna_pattern_1mer ) +{ + TSFLookupTable table(1,4); + BOOST_CHECK( table.states() > 0); + + const int failed_state = TSFLookupTable::failed_state; + const int accept_state = TSFLookupTable::accept_state; + + + int a_state = table.next(TSFLookupTable::start_state, A); + int t_state = table.next(TSFLookupTable::start_state, T); + int g_state = table.next(TSFLookupTable::start_state, G); + int c_state = table.next(TSFLookupTable::start_state, C); + + BOOST_CHECK(a_state != t_state); + BOOST_CHECK(a_state != g_state); + BOOST_CHECK(a_state != c_state); + BOOST_CHECK(t_state != g_state); + BOOST_CHECK(t_state != c_state); + BOOST_CHECK(g_state != c_state); + + BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, A), a_state ); + BOOST_CHECK_EQUAL( table.next(a_state, A), a_state+1 ); + BOOST_CHECK_EQUAL( table.next(a_state, G), failed_state); + BOOST_CHECK_EQUAL( table.next(a_state+1, A), a_state+2 ); + BOOST_CHECK_EQUAL( table.next(a_state+2, A), a_state+3 ); + BOOST_CHECK_EQUAL( table.next(a_state+3, A), a_state+3 ); + BOOST_CHECK_EQUAL( table.next(a_state+3, G), accept_state); + + BOOST_CHECK_EQUAL( table.next(TSFLookupTable::start_state, G), g_state ); + BOOST_CHECK_EQUAL( table.next(g_state, G), g_state+1 ); + BOOST_CHECK_EQUAL( table.next(g_state, T), failed_state); + BOOST_CHECK_EQUAL( table.next(g_state+1, G), g_state+2 ); + BOOST_CHECK_EQUAL( table.next(g_state+2, G), g_state+3 ); + BOOST_CHECK_EQUAL( table.next(g_state+3, G), g_state+3 ); + BOOST_CHECK_EQUAL( table.next(g_state+3, C), accept_state); +} diff --git a/alg/tsf/trivial_filter.cpp b/alg/tsf/trivial_filter.cpp new file mode 100644 index 0000000..9689261 --- /dev/null +++ b/alg/tsf/trivial_filter.cpp @@ -0,0 +1,58 @@ +#include "trivial_filter.hpp" + +#include +using namespace boost::assign; + +#include + +TSFLookupTable::TSFLookupTable(int nmer, int length) +{ + generate_dna_pattern(nmer, length); +} + +int TSFLookupTable::next(int state, enum nucleotide symbol) +{ +#if 0 + // dump what row we're looking at + std::cout << "[" << state << "]: "; + for(int i=0; i != table[state].size(); ++i) { + std::cout << table[state][i] << " "; + } + std::cout << "lookup(" << symbol << ")" << std::endl; +#endif + return table[state][symbol]; +} + +void TSFLookupTable::generate_dna_pattern(int nmer, int length) +{ + // currently we can only handle single nucleotides + assert (nmer == 1); + + symbol_vec symbols; + symbols += A, T, G, C; + + // add the start state + std::vector transitions(8, failed_state); + table.push_back(transitions); + + for (symbol_vec::iterator pattern_itor = symbols.begin(); + pattern_itor != symbols.end(); + ++pattern_itor) + { + // update the start state to point at the state we're adding + table[0][*pattern_itor] = table.size(); + + // add the counting states + for(int i = 0; i != length-nmer; ++ i) + { + std::vector transitions(8, failed_state); + transitions[*pattern_itor] = table.size()+1; //point to next state + table.push_back(transitions); + } + + // add the accept state + std::vector transitions(8, accept_state); + transitions[*pattern_itor] = table.size(); + table.push_back(transitions); + } +} diff --git a/alg/tsf/trivial_filter.hpp b/alg/tsf/trivial_filter.hpp new file mode 100644 index 0000000..2ad5758 --- /dev/null +++ b/alg/tsf/trivial_filter.hpp @@ -0,0 +1,58 @@ +#ifndef _TRIVIAL_FILTER_HPP_ +#define _TRIVIAL_FILTER_HPP_ + +#include + +//! symbolic constants used to inidicate numeric values for nucleotides +/*! the ordering comes from the very simple bit-wise operator + * used to convert ascii values to the range 0..7 + */ +enum nucleotide { X, A, C=3, T, U, N, G }; + +//! convert a nucleotide ascii encoding to value in the range 0..7 +inline enum nucleotide nt_map(char c) +{ + return static_cast(c & 0x07); +} + +//! generates our transition table +class TSFLookupTable { + public: + //! create transition table using DNA symbols + /*! generate all the dna patterns of n base pairs and + * require length copies of the pattern. + * + * e.g. n=2, length=2 would match things like + * ATAT *(AT), + */ + + TSFLookupTable(int n, int length); + + //! return the next state + int next(int current, enum nucleotide symbol); + + //! how many states are there in our table? + int states() { return table.size(); } + + static const int start_state = 0; + static const int accept_state = -1; + static const int failed_state = -2; + + private: + typedef std::vector symbol_vec; + + //! generate length copies of a pattern n bases long + void generate_dna_pattern(int n, int length); + + std::vector > table; + +}; + +class TrivialSequenceFilter { + public: + private: + TSFLookupTable table; +}; + + +#endif // _TRIVIAL_FILTER_HPP_ -- 2.30.2