From: Diane Trout Date: Wed, 21 Mar 2007 03:55:12 +0000 (+0000) Subject: Move alphabet type into SeqString X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=commitdiff_plain;h=f1724abab87d2e5b160620b10cb81eabf56aadeb Move alphabet type into SeqString This is another large patch. This moves alphabet_ref from Sequence to SeqString. I still need to deal with a better way of handling reverse complementing especially given my attempts to keep everything in reference counted structures. Also this patch and the previous: Store Sequence sequence location in a shared_ptr class have broken the display of motifs in mussa. (I used to be using shared_ptr for everything, but now since there's a couple of seperate objects updating the motifs in one doesn't really help so much for the other. so warning, these two patches and the next few that I make are going to be unstable. **END OF DESCRIPTION*** Place the long patch description above the ***END OF DESCRIPTION*** marker. The first line of this file will be the patch name. This patch contains the following changes: M ./alg/CMakeLists.txt +1 M ./alg/alphabet.cpp +35 M ./alg/alphabet.hpp -1 +14 M ./alg/motif_parser.cpp -3 +4 A ./alg/seq.cpp M ./alg/seq.hpp -3 +17 M ./alg/seq_span.cpp -6 +3 M ./alg/seq_span.hpp -1 +4 M ./alg/sequence.cpp -82 +26 M ./alg/sequence.hpp -15 +10 M ./alg/test/CMakeLists.txt +1 M ./alg/test/test_alphabet.cpp +5 A ./alg/test/test_seq.cpp M ./alg/test/test_seq_span.cpp +9 M ./alg/test/test_sequence.cpp -43 +43 M ./qui/motif_editor/MotifElement.cpp -2 +2 --- diff --git a/alg/CMakeLists.txt b/alg/CMakeLists.txt index 0a7e8e7..13448bc 100644 --- a/alg/CMakeLists.txt +++ b/alg/CMakeLists.txt @@ -24,6 +24,7 @@ SET(SOURCES alphabet.cpp nway_other.cpp nway_paths.cpp parse_options.cpp + seq.cpp seq_span.cpp sequence.cpp sequence_location.cpp ) diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp index 3e4e5c8..5691919 100644 --- a/alg/alphabet.cpp +++ b/alg/alphabet.cpp @@ -1,3 +1,5 @@ +#include + #include "alg/alphabet.hpp" // some standard dna alphabets @@ -12,6 +14,7 @@ const char *Alphabet::nucleic_cstr = //! the protein alphabet const char *Alphabet::protein_cstr = "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"; +const char *Alphabet::empty_cstr = ""; const Alphabet& Alphabet::reduced_dna_alphabet() { static Alphabet *a = new Alphabet(reduced_dna_cstr); @@ -33,6 +36,10 @@ const Alphabet& Alphabet::protein_alphabet() { static Alphabet *a = new Alphabet(protein_cstr); return *a; } +const Alphabet& Alphabet::empty_alphabet() { + static Alphabet *a = new Alphabet(empty_cstr); + return *a; +} Alphabet::Alphabet(const char *a) : alphabet(a) @@ -47,7 +54,36 @@ void Alphabet::assign(const Alphabet& a) alphabet_set.insert(alphabet.begin(), alphabet.end()); } +bool operator==(const Alphabet& x, const Alphabet& y) +{ + return x.alphabet == y.alphabet; +} + +std::ostream& operator<<(std::ostream& out, const Alphabet& a) +{ + out << a.alphabet; +} + bool Alphabet::exists(const char c) const { return (alphabet_set.find(c) != alphabet_set.end()); } + +const Alphabet& Alphabet::get_alphabet(AlphabetRef alpha) +{ + switch (alpha) { + case ::reduced_dna_alphabet: + return Alphabet::reduced_dna_alphabet(); + case ::reduced_rna_alphabet: + return Alphabet::reduced_rna_alphabet(); + case ::reduced_nucleic_alphabet: + return Alphabet::reduced_nucleic_alphabet(); + case ::nucleic_alphabet: + return Alphabet::nucleic_alphabet(); + case ::protein_alphabet: + return Alphabet::protein_alphabet(); + default: + throw std::runtime_error("unrecognized alphabet type"); + break; + } +} \ No newline at end of file diff --git a/alg/alphabet.hpp b/alg/alphabet.hpp index 3991254..513ea76 100644 --- a/alg/alphabet.hpp +++ b/alg/alphabet.hpp @@ -8,16 +8,26 @@ #include #include +#include //! this is a helper class for sequence +enum AlphabetRef { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, + nucleic_alphabet, protein_alphabet, empty_alphabet=255 }; + class Alphabet { friend class Sequence; public: typedef std::string::const_iterator const_iterator; - + //! define the various alphabet types (as python corebio) + + friend bool operator==(const Alphabet&, const Alphabet&); + friend std::ostream& operator<<(std::ostream&, const Alphabet&); + //! case-insensitive test to check a character for existence in our alphabet bool exists(const char) const; + //! return an alphabet given an AlphabetRef enumeration + static const Alphabet &get_alphabet(AlphabetRef); // note, if you want to define an alphabet for a sequence, you probably want // to update the enumeration in Sequence, and Sequence::get_sequence //! The standard DNA alphabet, with unique, and unknown characters @@ -35,6 +45,9 @@ public: //! the protein alphabet static const char *protein_cstr; static const Alphabet &protein_alphabet(); + + static const char *empty_cstr; + static const Alphabet &empty_alphabet(); private: //! what are allowable symbols in our alphabet diff --git a/alg/motif_parser.cpp b/alg/motif_parser.cpp index d05f15b..c59c287 100644 --- a/alg/motif_parser.cpp +++ b/alg/motif_parser.cpp @@ -1,6 +1,7 @@ #include "mussa_exceptions.hpp" -#include "alg/alphabet.hpp" -#include "alg/motif_parser.hpp" +#include "alphabet.hpp" +#include "motif_parser.hpp" +#include "seq.hpp" #include #include @@ -56,7 +57,7 @@ void motif_parser::push_motif::operator()( Iterator end) const { float red, green, blue, alpha; - Sequence seq(parsed->sequence, Sequence::nucleic_alphabet); + Sequence seq(parsed->sequence, nucleic_alphabet); seq.set_fasta_header(parsed->name); alpha = 1.0; diff --git a/alg/seq.cpp b/alg/seq.cpp new file mode 100644 index 0000000..41b99d4 --- /dev/null +++ b/alg/seq.cpp @@ -0,0 +1,40 @@ +#include + +#include "seq.hpp" + +const Alphabet& SeqString::get_alphabet() const +{ + return Alphabet::get_alphabet(alphabet); +} + +std::string SeqString::create_reverse_complement_map() const +{ + std::string rc_map(256, '~'); + // if we're rna, use U instead of T + // we might want to add an "is_rna" to sequence at somepoint + char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T'; + char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't'; + rc_map['A'] = TU ; rc_map['a'] = tu ; + rc_map['T'] = 'A'; rc_map['t'] = 'a'; + rc_map['U'] = 'A'; rc_map['u'] = 'a'; + rc_map['G'] = 'C'; rc_map['g'] = 'c'; + rc_map['C'] = 'G'; rc_map['c'] = 'g'; + rc_map['M'] = 'K'; rc_map['m'] = 'k'; + rc_map['R'] = 'Y'; rc_map['r'] = 'y'; + rc_map['W'] = 'W'; rc_map['w'] = 'w'; + rc_map['S'] = 'S'; rc_map['s'] = 's'; + rc_map['Y'] = 'R'; rc_map['y'] = 'r'; + rc_map['K'] = 'M'; rc_map['k'] = 'm'; + rc_map['V'] = 'B'; rc_map['v'] = 'b'; + rc_map['H'] = 'D'; rc_map['h'] = 'd'; + rc_map['D'] = 'H'; rc_map['d'] = 'h'; + rc_map['B'] = 'V'; rc_map['b'] = 'v'; + rc_map['N'] = 'N'; rc_map['n'] = 'n'; + rc_map['X'] = 'X'; rc_map['x'] = 'x'; + rc_map['?'] = '?'; + rc_map['.'] = '.'; + rc_map['-'] = '-'; + rc_map['~'] = '~'; // not really needed, but perhaps it's clearer. + + return rc_map; +} diff --git a/alg/seq.hpp b/alg/seq.hpp index b102a00..5ac42a2 100644 --- a/alg/seq.hpp +++ b/alg/seq.hpp @@ -15,11 +15,13 @@ #include +#include "alphabet.hpp" //! These classes provide for the internal implementation for the Sequence class /** the only purpose of this class is that the shared_ptr template * functions need the serialization support to be in-class. */ + class SeqString : public std::string { public: @@ -34,17 +36,29 @@ public: typedef std::string::value_type value_type; static const size_type npos = std::string::npos; - SeqString() : std::string() {}; - SeqString(const std::string &s) : std::string(s) {}; + SeqString(AlphabetRef a=reduced_nucleic_alphabet) : + std::string(), + alphabet(a) + {} + SeqString(const std::string &s, AlphabetRef a=reduced_nucleic_alphabet) : + std::string(s), + alphabet(a) + {} + + AlphabetRef get_alphabet_ref() { return alphabet; } + const Alphabet& get_alphabet() const; + std::string create_reverse_complement_map() const; private: + AlphabetRef alphabet; friend class boost::serialization::access; template void serialize(Archive& ar, const unsigned int /*version*/) { //ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(std::string); ar & boost::serialization::make_nvp("bases", - boost::serialization::base_object(*this) + boost::serialization::base_object(*this) ); + ar & BOOST_SERIALIZATION_NVP(alphabet); } }; typedef boost::shared_ptr SeqStringRef; diff --git a/alg/seq_span.cpp b/alg/seq_span.cpp index 3382bd8..2870f0f 100644 --- a/alg/seq_span.cpp +++ b/alg/seq_span.cpp @@ -21,8 +21,9 @@ SeqSpan::SeqSpan(const SeqSpan *p) { } -SeqSpan::SeqSpan(const std::string &seq_) - : seq(new SeqString(seq_)), +SeqSpan::SeqSpan(const std::string &seq_, + AlphabetRef a) + : seq(new SeqString(seq_, a)), seq_start(0), seq_count(seq_.length()), parent() @@ -79,10 +80,6 @@ bool operator<(const SeqSpan& a, const SeqSpan& b) bool operator==(const SeqSpan& a, const SeqSpan& b) { if (SeqSpan::isFamily(a, b)) { - std::cout << " " << a.seq_start - << " " << b.seq_start - << " " << a.seq_count - << " " << b.seq_count << std::endl; // can do fast comparison if (a.seq_start == b.seq_start and a.seq_count == b.seq_count) { return true; diff --git a/alg/seq_span.hpp b/alg/seq_span.hpp index ec94c27..8d19913 100644 --- a/alg/seq_span.hpp +++ b/alg/seq_span.hpp @@ -39,7 +39,9 @@ public: public: SeqSpan(const SeqSpan &); SeqSpan(const SeqSpan *); - explicit SeqSpan(const std::string &); + explicit SeqSpan(const std::string &, + AlphabetRef = reduced_nucleic_alphabet + ); SeqSpan(const SeqSpanRef, size_type start=0, size_type count=npos); //! assignment @@ -50,6 +52,7 @@ public: friend bool operator==(const SeqSpan&, const SeqSpan&); friend bool operator!=(const SeqSpan&, const SeqSpan&); + const Alphabet& get_alphabet() const { return seq->get_alphabet(); } //! \defgroup string_operators //! @{ //! retrive element at specific position diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 91656ab..47cca2a 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -79,9 +79,8 @@ motif::~motif() } -Sequence::Sequence(alphabet_ref alphabet_) - : seq(new SeqSpan("")), - alphabet(alphabet_), +Sequence::Sequence(AlphabetRef alphabet) + : seq(new SeqSpan("", alphabet)), strand(UnknownStrand) { } @@ -90,27 +89,24 @@ Sequence::~Sequence() { } -Sequence::Sequence(const char *seq, alphabet_ref alphabet_) - : alphabet(alphabet_), - strand(UnknownStrand), +Sequence::Sequence(const char *seq, AlphabetRef alphabet_) + : strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq, alphabet); + set_filtered_sequence(seq, alphabet_); } -Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_) - : alphabet(alphabet_), - strand(UnknownStrand), +Sequence::Sequence(const std::string& seq, AlphabetRef alphabet_) + : strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq, alphabet); + set_filtered_sequence(seq, alphabet_); } Sequence::Sequence(const Sequence& o) : seq(o.seq), - alphabet(o.alphabet), strand(o.strand), header(o.header), species(o.species), @@ -121,7 +117,6 @@ Sequence::Sequence(const Sequence& o) Sequence::Sequence(const Sequence* o) : seq(o->seq), - alphabet(o->alphabet), strand(o->strand), header(o->header), species(o->species), @@ -130,9 +125,8 @@ Sequence::Sequence(const Sequence* o) { } -Sequence::Sequence(const SeqSpanRef& seq_ref, alphabet_ref alphabet_) +Sequence::Sequence(const SeqSpanRef& seq_ref) : seq(seq_ref), - alphabet(alphabet), strand(UnknownStrand), header(""), species("") @@ -143,7 +137,6 @@ Sequence &Sequence::operator=(const Sequence& s) { if (this != &s) { seq = s.seq; - alphabet = s.alphabet; strand = s.strand; header = s.header; species = s.species; @@ -171,11 +164,11 @@ static void multiplatform_getline(std::istream& in, std::string& line) void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index) { - load_fasta(file_path, alphabet, seq_num, start_index, end_index); + load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index); } //! load a fasta file into a sequence -void Sequence::load_fasta(fs::path file_path, alphabet_ref a, +void Sequence::load_fasta(fs::path file_path, AlphabetRef a, int seq_num, int start_index, int end_index) { fs::fstream data_file; @@ -212,11 +205,11 @@ void Sequence::load_fasta(fs::path file_path, alphabet_ref a, void Sequence::load_fasta(std::istream& file, int seq_num, int start_index, int end_index) { - load_fasta(file, alphabet, seq_num, start_index, end_index); + load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index); } void -Sequence::load_fasta(std::istream& data_file, alphabet_ref a, +Sequence::load_fasta(std::istream& data_file, AlphabetRef a, int seq_num, int start_index, int end_index) { @@ -227,7 +220,7 @@ Sequence::load_fasta(std::istream& data_file, alphabet_ref a, std::string rev_comp; std::string sequence_raw; std::string seq_tmp; // holds sequence during basic filtering - const Alphabet &alpha = get_alphabet(a); + const Alphabet &alpha = get_alphabet(); if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); @@ -288,19 +281,18 @@ Sequence::load_fasta(std::istream& data_file, alphabet_ref a, } void Sequence::set_filtered_sequence(const std::string &in_seq, - alphabet_ref alphabet_, + AlphabetRef alphabet_, size_type start, size_type count, strand_type strand_) { - alphabet = alphabet_; if ( count == npos) count = in_seq.size() - start; std::string new_seq; new_seq.reserve(count); // finally, the actual conversion loop - const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets + const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets std::string::const_iterator seq_i = in_seq.begin()+start; for(size_type i = 0; i != count; ++i, ++seq_i) { @@ -310,7 +302,7 @@ void Sequence::set_filtered_sequence(const std::string &in_seq, new_seq.append(1, 'N'); } } - SeqSpanRef new_seq_ref(new SeqSpan(new_seq)); + SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_)); seq = new_seq_ref; strand = strand_; } @@ -570,43 +562,14 @@ Sequence::subseq(size_type start, size_type count) const return new_seq; } -std::string Sequence::create_reverse_map() const -{ - std::string rc_map(256, '~'); - // if we're rna, use U instead of T - // we might want to add an "is_rna" to sequence at somepoint - char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T'; - char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't'; - rc_map['A'] = TU ; rc_map['a'] = tu ; - rc_map['T'] = 'A'; rc_map['t'] = 'a'; - rc_map['U'] = 'A'; rc_map['u'] = 'a'; - rc_map['G'] = 'C'; rc_map['g'] = 'c'; - rc_map['C'] = 'G'; rc_map['c'] = 'g'; - rc_map['M'] = 'K'; rc_map['m'] = 'k'; - rc_map['R'] = 'Y'; rc_map['r'] = 'y'; - rc_map['W'] = 'W'; rc_map['w'] = 'w'; - rc_map['S'] = 'S'; rc_map['s'] = 's'; - rc_map['Y'] = 'R'; rc_map['y'] = 'r'; - rc_map['K'] = 'M'; rc_map['k'] = 'm'; - rc_map['V'] = 'B'; rc_map['v'] = 'b'; - rc_map['H'] = 'D'; rc_map['h'] = 'd'; - rc_map['D'] = 'H'; rc_map['d'] = 'h'; - rc_map['B'] = 'V'; rc_map['b'] = 'v'; - rc_map['N'] = 'N'; rc_map['n'] = 'n'; - rc_map['X'] = 'X'; rc_map['x'] = 'x'; - rc_map['?'] = '?'; - rc_map['.'] = '.'; - rc_map['-'] = '-'; - rc_map['~'] = '~'; // not really needed, but perhaps it's clearer. - return rc_map; -} +// FIXME: This needs to be moved into SeqSpan Sequence Sequence::rev_comp() const { std::string rev_comp; rev_comp.reserve(length()); - std::string rc_map = create_reverse_map(); + std::string rc_map = seq->seq->create_reverse_complement_map(); // reverse and convert Sequence::const_reverse_iterator seq_i; @@ -617,7 +580,12 @@ Sequence Sequence::rev_comp() const { rev_comp.append(1, rc_map[*seq_i]); } - return Sequence(rev_comp, alphabet); + return Sequence(rev_comp, seq->seq->get_alphabet_ref()); +} + +const Alphabet& Sequence::get_alphabet() const +{ + return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet(); } void Sequence::set_fasta_header(std::string header_) @@ -653,31 +621,7 @@ Sequence::get_name() const return ""; } -const Alphabet& Sequence::get_alphabet() const -{ - return get_alphabet(alphabet); -} - -const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const -{ - switch (alpha) { - case reduced_dna_alphabet: - return Alphabet::reduced_dna_alphabet(); - case reduced_rna_alphabet: - return Alphabet::reduced_rna_alphabet(); - case reduced_nucleic_alphabet: - return Alphabet::reduced_nucleic_alphabet(); - case nucleic_alphabet: - return Alphabet::nucleic_alphabet(); - case protein_alphabet: - return Alphabet::protein_alphabet(); - default: - throw std::runtime_error("unrecognized alphabet type"); - break; - } -} - -void Sequence::set_sequence(const std::string& s, alphabet_ref a) +void Sequence::set_sequence(const std::string& s, AlphabetRef a) { set_filtered_sequence(s, a); } diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 09c53c4..4b93ef1 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -104,15 +104,15 @@ public: typedef SeqString::size_type size_type; static const size_type npos = SeqString::npos; enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand }; - enum alphabet_ref { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, - nucleic_alphabet, protein_alphabet }; - Sequence(alphabet_ref a = reduced_nucleic_alphabet); - Sequence(const char* seq, alphabet_ref a = reduced_nucleic_alphabet); - Sequence(const std::string& seq, alphabet_ref a = reduced_nucleic_alphabet); + Sequence(AlphabetRef a = reduced_nucleic_alphabet); + Sequence(const char* seq, + AlphabetRef a = reduced_nucleic_alphabet); + Sequence(const std::string& seq, + AlphabetRef a = reduced_nucleic_alphabet); Sequence(const Sequence& seq); Sequence(const Sequence *); - Sequence(const SeqSpanRef&, alphabet_ref a = reduced_nucleic_alphabet); + Sequence(const SeqSpanRef&); ~Sequence(); //! assignment to constant sequences Sequence &operator=(const Sequence&); @@ -125,7 +125,7 @@ public: //! set sequence to a (sub)string containing nothing but AGCTN void set_filtered_sequence(const std::string& seq, - alphabet_ref a, + AlphabetRef a, size_type start=0, size_type count=npos, strand_type strand=UnknownStrand); @@ -166,7 +166,7 @@ public: Sequence rev_comp() const; //! set sequence (filtered) - void set_sequence(const std::string &, alphabet_ref); + void set_sequence(const std::string &, AlphabetRef); //! get sequence std::string get_sequence() const; //! set species name @@ -181,8 +181,6 @@ public: std::string get_name() const; //! return a reference to whichever alphabet we're currently representing const Alphabet& get_alphabet() const; - //! return a reference to whichever alphabet we're currently representing - const Alphabet& get_alphabet(alphabet_ref) const; //! load sequence from fasta file using the sequences current alphabet void load_fasta(const boost::filesystem::path file_path, int seq_num=1, @@ -192,7 +190,7 @@ public: //! \throw sequence_empty_error //! \throw sequence_empty_file_error void load_fasta(const boost::filesystem::path file_path, - alphabet_ref a, + AlphabetRef a, int seq_num=1, int start_index=0, int end_index=0); void load_fasta(std::istream& file, @@ -202,7 +200,7 @@ public: //! \throw sequence_empty_error //! \throw sequence_empty_file_error void load_fasta(std::istream& file, - alphabet_ref a, + AlphabetRef a, int seq_num=1, int start_index=0, int end_index=0); //! load sequence annotations @@ -236,8 +234,6 @@ public: protected: SeqSpanRef seq; - //! which alphabet we're using - alphabet_ref alphabet; //! strand orientation strand_type strand; //! fasta header @@ -263,7 +259,6 @@ protected: template void serialize(Archive& ar, const unsigned int /*version*/) { ar & BOOST_SERIALIZATION_NVP(seq); - ar & BOOST_SERIALIZATION_NVP(alphabet); ar & BOOST_SERIALIZATION_NVP(strand); ar & BOOST_SERIALIZATION_NVP(header); ar & BOOST_SERIALIZATION_NVP(species); diff --git a/alg/test/CMakeLists.txt b/alg/test/CMakeLists.txt index 433c63d..c3f8d09 100644 --- a/alg/test/CMakeLists.txt +++ b/alg/test/CMakeLists.txt @@ -51,6 +51,7 @@ MAKE_ALG_UNITTEST( test_glseqbrowser ) MAKE_ALG_UNITTEST( test_glsequence ) MAKE_ALG_UNITTEST( test_mussa ) MAKE_ALG_UNITTEST( test_nway ) +MAKE_ALG_UNITTEST( test_seq ) MAKE_ALG_UNITTEST( test_sequence ) MAKE_ALG_UNITTEST( test_seq_span ) MAKE_ALG_UNITTEST( test_sequence_location ) diff --git a/alg/test/test_alphabet.cpp b/alg/test/test_alphabet.cpp index 7758556..5f7d9f3 100644 --- a/alg/test/test_alphabet.cpp +++ b/alg/test/test_alphabet.cpp @@ -21,3 +21,9 @@ BOOST_AUTO_TEST_CASE( alphabet_simple ) // copied from alphabet.cpp BOOST_CHECK_EQUAL( Alphabet::reduced_dna_cstr, "AaCcGgTtNn\012\015"); } + +BOOST_AUTO_TEST_CASE( alphabet_equality) +{ + Alphabet a(Alphabet::reduced_dna_alphabet()); + BOOST_CHECK_EQUAL( a, Alphabet::reduced_dna_alphabet() ); +} \ No newline at end of file diff --git a/alg/test/test_seq.cpp b/alg/test/test_seq.cpp new file mode 100644 index 0000000..a016cb2 --- /dev/null +++ b/alg/test/test_seq.cpp @@ -0,0 +1,31 @@ +#define BOOST_AUTO_TEST_MAIN +#include + +#include "seq.hpp" + +BOOST_AUTO_TEST_CASE( seqstring_default_alphabet ) +{ + SeqString s; + BOOST_CHECK_EQUAL(s.get_alphabet_ref(), reduced_nucleic_alphabet); + BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::reduced_nucleic_alphabet()); + BOOST_CHECK_EQUAL(s.size(), 0); +} + +BOOST_AUTO_TEST_CASE( seqstring_string ) +{ + SeqString s("AGCT"); + + BOOST_CHECK_EQUAL(s.get_alphabet_ref(), reduced_nucleic_alphabet); + BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::reduced_nucleic_alphabet()); + BOOST_CHECK_EQUAL(s.size(), 4); +} + +// such an exciting unit test, making sure that a=b; a==b +BOOST_AUTO_TEST_CASE( seqstring_string_alphabet ) +{ + SeqString s("AGCT", nucleic_alphabet); + + BOOST_CHECK_EQUAL(s.get_alphabet_ref(), nucleic_alphabet); + BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::nucleic_alphabet()); + BOOST_CHECK_EQUAL(s.size(), 4); +} \ No newline at end of file diff --git a/alg/test/test_seq_span.cpp b/alg/test/test_seq_span.cpp index 9b98f21..90e3d00 100644 --- a/alg/test/test_seq_span.cpp +++ b/alg/test/test_seq_span.cpp @@ -14,6 +14,15 @@ BOOST_AUTO_TEST_CASE( seqspan_from_string ) BOOST_CHECK_EQUAL(span1->sequence(), str1); } +BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet ) +{ + std::string str1("AAGGCCUU"); + SeqSpanRef span1(new SeqSpan(str1, reduced_rna_alphabet)); + BOOST_CHECK_EQUAL(span1->length(), str1.length()); + BOOST_CHECK_EQUAL(span1->sequence(), str1); + BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet()); +} + BOOST_AUTO_TEST_CASE( seqspan_from_seqspan ) { std::string str1("AAGGCCTT"); diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 3847252..6da21f1 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( sequence_eol_conventions ) BOOST_AUTO_TEST_CASE( sequence_filter ) { const char *core_seq = "AATTGGCC"; - Sequence s1(core_seq, Sequence::reduced_dna_alphabet); + Sequence s1(core_seq, reduced_dna_alphabet); BOOST_CHECK_EQUAL(s1, core_seq); - Sequence s2("aattggcc", Sequence::reduced_dna_alphabet); + Sequence s2("aattggcc", reduced_dna_alphabet); BOOST_CHECK_EQUAL(s2, "AATTGGCC"); BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT"); BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now @@ -97,15 +97,15 @@ BOOST_AUTO_TEST_CASE( sequence_filter ) //We're currently forcing sequences to uppercase BOOST_CHECK_EQUAL(s2.get_sequence(), "AATTGGCC"); - Sequence s3("asdfg", Sequence::reduced_dna_alphabet); + Sequence s3("asdfg", reduced_dna_alphabet); BOOST_CHECK_EQUAL(s3, "ANNNG"); BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN"); - s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2); + s3.set_filtered_sequence("AAGGCCTT", reduced_dna_alphabet, 0, 2); BOOST_CHECK_EQUAL(s3, "AA"); - s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2); + s3.set_filtered_sequence("AAGGCCTT", reduced_dna_alphabet, 2, 2); BOOST_CHECK_EQUAL( s3, "GG"); - s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4); + s3.set_filtered_sequence("AAGGCCTT", reduced_dna_alphabet, 4); BOOST_CHECK_EQUAL( s3, "CCTT"); s3 = "AAGGFF"; @@ -115,12 +115,12 @@ BOOST_AUTO_TEST_CASE( sequence_filter ) BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet ) { std::string agct("AGCT"); - Sequence seq(agct, Sequence::nucleic_alphabet); + Sequence seq(agct, nucleic_alphabet); BOOST_CHECK_EQUAL(seq.size(), agct.size()); BOOST_CHECK_EQUAL(seq.get_sequence(), agct); std::string bdv("BDv"); - Sequence seq_bdv(bdv, Sequence::nucleic_alphabet); + Sequence seq_bdv(bdv, nucleic_alphabet); BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size()); // forcing sequence to upper case BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), @@ -148,7 +148,7 @@ BOOST_AUTO_TEST_CASE( sequence_default_alphabet ) BOOST_AUTO_TEST_CASE( subseq_names ) { - Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet); + Sequence s1("AAGGCCTT", reduced_dna_alphabet); s1.set_species("species"); s1.set_fasta_header("a fasta header"); Sequence s2 = s1.subseq(2,2); @@ -164,7 +164,7 @@ BOOST_AUTO_TEST_CASE( sequence_start_stop ) BOOST_CHECK_EQUAL( s1.stop(), 0 ); std::string seq_string("AAGGCCTT"); - Sequence s2(seq_string, Sequence::reduced_dna_alphabet); + Sequence s2(seq_string, reduced_dna_alphabet); BOOST_CHECK_EQUAL( s2.start(), 0 ); BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() ); @@ -189,7 +189,7 @@ BOOST_AUTO_TEST_CASE( sequence_load ) fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq"); seq_path /= "human_mck_pro.fa"; Sequence s; - s.load_fasta(seq_path, Sequence::reduced_dna_alphabet); + s.load_fasta(seq_path, reduced_dna_alphabet); BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC"); BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human " @@ -255,15 +255,15 @@ BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced ) std::stringstream garbage_fasta(garbage_string); Sequence s; - s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet); + s.load_fasta(reduced_dna_fasta, reduced_dna_alphabet); BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta, - Sequence::reduced_dna_alphabet), + reduced_dna_alphabet), sequence_invalid_load_error); BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta, - Sequence::reduced_dna_alphabet), + reduced_dna_alphabet), sequence_invalid_load_error); BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, - Sequence::reduced_dna_alphabet), + reduced_dna_alphabet), sequence_invalid_load_error); } @@ -280,15 +280,15 @@ BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced ) std::stringstream garbage_fasta(garbage_string); Sequence s; - s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet); + s.load_fasta(reduced_rna_fasta, reduced_rna_alphabet); BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta, - Sequence::reduced_rna_alphabet), + reduced_rna_alphabet), sequence_invalid_load_error); BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta, - Sequence::reduced_rna_alphabet), + reduced_rna_alphabet), sequence_invalid_load_error); BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, - Sequence::reduced_rna_alphabet), + reduced_rna_alphabet), sequence_invalid_load_error); } @@ -310,11 +310,11 @@ BOOST_AUTO_TEST_CASE( sequence_load_fasta_default ) // there's two copies of reduced_rna_fasta because i didn't feel like // figuring out how to properly reset the read pointer in a stringstream s.load_fasta(reduced_rna_fasta1); - specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet); + specific.load_fasta(reduced_rna_fasta2, reduced_nucleic_alphabet); BOOST_CHECK_EQUAL(s, specific); s.load_fasta(reduced_dna_fasta1); - specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet); + specific.load_fasta(reduced_dna_fasta2, reduced_nucleic_alphabet); BOOST_CHECK_EQUAL(s, specific); BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta), @@ -352,7 +352,7 @@ BOOST_AUTO_TEST_CASE( sequence_load_multiple_sequences_one_fasta ) BOOST_AUTO_TEST_CASE( sequence_reverse_complement ) { std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?"); - Sequence seq(iupac_symbols, Sequence::nucleic_alphabet); + Sequence seq(iupac_symbols, nucleic_alphabet); Sequence seqr = seq.rev_comp(); BOOST_CHECK( seq != seqr ); @@ -365,7 +365,7 @@ BOOST_AUTO_TEST_CASE( sequence_reverse_complement ) BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna ) { std::string dna_str("AGCTN"); - Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet); + Sequence dna_seq(dna_str, reduced_dna_alphabet); BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT"); BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp()); } @@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna ) BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna ) { std::string rna_str("AGCUN"); - Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet); + Sequence rna_seq(rna_str, reduced_rna_alphabet); BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU"); BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp()); } @@ -381,7 +381,7 @@ BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna ) BOOST_AUTO_TEST_CASE( sequence_reverse_complement_subseq ) { std::string dna_str("AAAAAAAAAAGGGGGGGGGGG"); - Sequence seq(dna_str, Sequence::reduced_dna_alphabet); + Sequence seq(dna_str, reduced_dna_alphabet); Sequence subseq = seq.subseq(8,4); BOOST_CHECK_EQUAL( subseq, "AAGG"); Sequence rev_subseq = subseq.rev_comp(); @@ -444,7 +444,7 @@ BOOST_AUTO_TEST_CASE( annotation_load ) ; string s(100, 'A'); s += "GCTGCTAATT"; - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); @@ -484,7 +484,7 @@ BOOST_AUTO_TEST_CASE( annotation_broken_load ) ; string s(100, 'A'); s += "GCTGCTAATT"; - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); BOOST_CHECK_THROW(seq.parse_annot(annot_data, 0, 0), annotation_load_error); BOOST_CHECK_EQUAL(seq.annotations().size(), 0); @@ -511,7 +511,7 @@ BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load) "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG" "AAAAA" "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC"; - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); seq.parse_annot(annot_data); std::list annots = seq.annotations(); BOOST_CHECK_EQUAL( annots.size(), 2); @@ -532,7 +532,7 @@ BOOST_AUTO_TEST_CASE( annotation_load_no_species_name ) ; string s(100, 'A'); s += "GCTGCTAATT"; - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); @@ -583,12 +583,12 @@ BOOST_AUTO_TEST_CASE ( sequence_size ) BOOST_AUTO_TEST_CASE( sequence_empty_equality ) { - Sequence szero("", Sequence::reduced_dna_alphabet); + Sequence szero("", reduced_dna_alphabet); BOOST_CHECK_EQUAL(szero.empty(), true); BOOST_CHECK_EQUAL(szero, szero); BOOST_CHECK_EQUAL(szero, ""); - Sequence sclear("AGCT", Sequence::reduced_dna_alphabet); + Sequence sclear("AGCT", reduced_dna_alphabet); sclear.clear(); BOOST_CHECK_EQUAL(sclear.empty(), true); BOOST_CHECK_EQUAL(sclear, sclear); @@ -599,7 +599,7 @@ BOOST_AUTO_TEST_CASE( sequence_empty_equality ) BOOST_AUTO_TEST_CASE ( sequence_iterators ) { std::string seq_string = "AAGGCCTTNNTATA"; - Sequence s(seq_string, Sequence::reduced_dna_alphabet); + Sequence s(seq_string, reduced_dna_alphabet); const Sequence cs(s); std::string::size_type count = 0; @@ -628,7 +628,7 @@ BOOST_AUTO_TEST_CASE( sequence_motifs ) { string m("AAAA"); string bogus("AATTGGAA"); - Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet); + Sequence s1("AAAAGGGGCCCCTTTT", reduced_dna_alphabet); list::const_iterator motif_i = s1.motifs().begin(); list::const_iterator motif_end = s1.motifs().end(); @@ -683,7 +683,7 @@ BOOST_AUTO_TEST_CASE( sequence_motif_subseq) // only search the subsequence ticket:199 string aaaa("AAAA"); string cccc("CCCC"); - Sequence s1("AAAANCCCC", Sequence::reduced_dna_alphabet); + Sequence s1("AAAANCCCC", reduced_dna_alphabet); // this shouldn't show up s1.add_motif(cccc); @@ -736,7 +736,7 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence ) "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); string gc("GCCCCC"); string gga("GGACACCTC"); - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); std::list query_list; std::list string_list; @@ -782,7 +782,7 @@ BOOST_AUTO_TEST_CASE( subseq_annotation_test ) "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA" "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA" "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); seq.add_annotation(annot(0, 10, "0-10", "0-10")); @@ -819,7 +819,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update ) "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA" "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA" "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); // starting conditions BOOST_CHECK_EQUAL(seq.annotations().size(), 0); @@ -840,7 +840,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update ) BOOST_AUTO_TEST_CASE( out_operator ) { string s("AAGGCCTT"); - Sequence seq(s, Sequence::reduced_dna_alphabet); + Sequence seq(s, reduced_dna_alphabet); ostringstream buf; buf << s; @@ -849,7 +849,7 @@ BOOST_AUTO_TEST_CASE( out_operator ) BOOST_AUTO_TEST_CASE( get_name ) { - Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet); + Sequence seq("AAGGCCTT", reduced_dna_alphabet); BOOST_CHECK_EQUAL( seq.get_name(), "" ); seq.set_species("hooman"); // anyone remember tradewars? @@ -861,7 +861,7 @@ BOOST_AUTO_TEST_CASE( get_name ) BOOST_AUTO_TEST_CASE( serialize_simple ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string, Sequence::reduced_dna_alphabet); + Sequence seq(seq_string, reduced_dna_alphabet); seq.set_species("ribbet"); std::ostringstream oss; // allocate/deallocate serialization components @@ -884,7 +884,7 @@ BOOST_AUTO_TEST_CASE( serialize_simple ) BOOST_AUTO_TEST_CASE( serialize_tree ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string, Sequence::reduced_dna_alphabet); + Sequence seq(seq_string, reduced_dna_alphabet); seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); @@ -914,7 +914,7 @@ BOOST_AUTO_TEST_CASE( serialize_tree ) BOOST_AUTO_TEST_CASE( serialize_xml_sequence ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string, Sequence::reduced_dna_alphabet); + Sequence seq(seq_string, reduced_dna_alphabet); seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); @@ -941,7 +941,7 @@ BOOST_AUTO_TEST_CASE( serialize_xml_sequence ) BOOST_AUTO_TEST_CASE( serialize_xml_two ) { std::string seq_string = "AAGGCCTT"; - Sequence seq1(seq_string, Sequence::reduced_dna_alphabet); + Sequence seq1(seq_string, reduced_dna_alphabet); Sequence seq2(seq1); std::ostringstream oss; diff --git a/qui/motif_editor/MotifElement.cpp b/qui/motif_editor/MotifElement.cpp index 89308c7..e847a24 100644 --- a/qui/motif_editor/MotifElement.cpp +++ b/qui/motif_editor/MotifElement.cpp @@ -6,7 +6,7 @@ MotifElement::MotifElement() : enabled(true), color(Color(1.0,0.0,0.0,1.0)), - motif(Sequence::nucleic_alphabet) + motif(nucleic_alphabet) { } @@ -77,7 +77,7 @@ void MotifElement::setSequence(const Sequence& seq) //! set sequence text void MotifElement::setSequence(const std::string& seq_text) { - motif.set_sequence(seq_text, Sequence::nucleic_alphabet); + motif.set_sequence(seq_text, nucleic_alphabet); } QString MotifElement::getSequenceText() const