From 3f3aba42bdea1eaa21c4345732320b9cf3473da2 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Tue, 10 Oct 2006 22:27:54 +0000 Subject: [PATCH] implement alphabet for sequence in order to fix ticket:144 when motifs were changed to be sequences I needed someway of specifying what alphabet a particular sequence is using. This patch changes things so a sequences default sequence alphabet is a "AGCTUN" AKA reduced nucleic alphabet. Motifs however use the full IUPAC nucleic alphabet. (Or at least I hope they do I tested the sequence class, but GUI testing is harder). Another change is that instead of building a fixed reverse complement table, there's a function which creates the table and understands that at least for reduced_rna_alphabet (currently the only specifically RNA alphabet) the reverse of A should be a. Sequence is no longer case sensitive, all of the comparisons are now done in an insensitive manner. Finally I found a bug in the motif scanning code that was due to a typo from long before I inherited the code. Aparently that code branch in motif_scan had never been executed before. --- alg/CMakeLists.txt | 3 +- alg/alphabet.cpp | 39 ++ alg/alphabet.hpp | 59 +++ alg/mussa.cpp | 4 +- alg/sequence.cpp | 419 ++++++++------------ alg/sequence.hpp | 85 ++-- alg/test/CMakeLists.txt | 17 +- alg/test/test_alphabet.cpp | 21 + alg/test/test_sequence.cpp | 199 ++++++++-- mussa_exceptions.hpp | 8 +- qui/motif_editor/MotifElement.cpp | 5 +- qui/mussa_setup_dialog/MussaSetupWidget.cpp | 4 +- 12 files changed, 535 insertions(+), 328 deletions(-) create mode 100644 alg/alphabet.cpp create mode 100644 alg/alphabet.hpp create mode 100644 alg/test/test_alphabet.cpp diff --git a/alg/CMakeLists.txt b/alg/CMakeLists.txt index 5d36611..0d6a84c 100644 --- a/alg/CMakeLists.txt +++ b/alg/CMakeLists.txt @@ -10,7 +10,8 @@ SET(MOC_HEADERS ) QT4_WRAP_CPP(MOC_SOURCES ${MOC_HEADERS}) -SET(SOURCES annotation_colors.cpp +SET(SOURCES alphabet.cpp + annotation_colors.cpp color.cpp conserved_path.cpp flp.cpp diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp new file mode 100644 index 0000000..bd29dd2 --- /dev/null +++ b/alg/alphabet.cpp @@ -0,0 +1,39 @@ +#include "alg/alphabet.hpp" + +// some standard dna alphabets +// Include nl (\012), and cr (\015) to make sequence parsing eol +// convention independent. +const Alphabet Alphabet::reduced_dna_alphabet("AaCcGgTtNn\012\015"); +const Alphabet Alphabet::reduced_rna_alphabet("AaCcGgUuNn\012\015"); +const Alphabet Alphabet::reduced_nucleic_alphabet("AaCcGgTtUuNn\012\015"); +//! this is the general iupac alphabet for nucleotides +const Alphabet Alphabet::nucleic_alphabet( + "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015" +); +//! the protein alphabet +const Alphabet Alphabet::protein_alphabet( + "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015" +); + +Alphabet::Alphabet(const char *a) : + alphabet(a) +{ + alphabet_set.insert(alphabet.begin(), alphabet.end()); +} + +void Alphabet::assign(const Alphabet& a) +{ + alphabet = a.alphabet; + alphabet_set.clear(); + alphabet_set.insert(alphabet.begin(), alphabet.end()); +} + +const char *Alphabet::c_str() const +{ + alphabet.c_str(); +} + +bool Alphabet::exists(const char c) const +{ + return (alphabet_set.find(c) != alphabet_set.end()); +} \ No newline at end of file diff --git a/alg/alphabet.hpp b/alg/alphabet.hpp new file mode 100644 index 0000000..e7d35c9 --- /dev/null +++ b/alg/alphabet.hpp @@ -0,0 +1,59 @@ +#ifndef ALPHABET_HPP_ +#define ALPHABET_HPP_ + +#include +#include +#include +#include +#include + +#include + +//! this is a helper class for sequence +class Alphabet { +friend class Sequence; +public: + typedef std::string::const_iterator const_iterator; + + //! return reference to the characters in our alphabet + const char *c_str() const; + //! case-insensitive test to check a character for existence in our alphabet + bool exists(const char) const; + + // note, if you want to define an alphabet for a sequence, you probably want + // to update the enumeration in Sequence, and Sequence::get_sequence + //! The standard DNA alphabet, with unique, and unknown characters + static const Alphabet reduced_dna_alphabet; + //! The standard RNA alphabet, with unique, and unknown characters + static const Alphabet reduced_rna_alphabet; + //! The standard DNA/RNA alphabet, with unique, and unknown characters + static const Alphabet reduced_nucleic_alphabet; + //! this is the general IUPAC alphabet for nucleotides + static const Alphabet nucleic_alphabet; + //! the protein alphabet + static const Alphabet protein_alphabet; + +private: + //! what are allowable symbols in our alphabet + std::string alphabet; + //! internal variable to make exists() faster + std::set alphabet_set; + + //! some necessary string api access + Alphabet(const char *a); + //! allow sequence to copy one alphabet to another (needed when unserializing) + void assign(const Alphabet& a); + const_iterator begin() const { return alphabet.begin(); } + const_iterator end() const { return alphabet.end(); } + + // serialization + friend class boost::serialization::access; + template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & BOOST_SERIALIZATION_NVP(alphabet); + alphabet_set.clear(); + alphabet_set.insert(alphabet.begin(), alphabet.end()); + } +}; + +#endif /*ALPHABET_*/ diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 47a6b0f..50991c1 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -792,7 +792,7 @@ struct push_back_motif { void operator()(std::string::const_iterator, std::string::const_iterator) const { - Sequence seq(seq_string); + Sequence seq(seq_string, Sequence::nucleic_alphabet); // shouldn't we have a better field than "fasta header" and speices? seq.set_fasta_header(name); // we need to clear the name in case the next motif doesn't have one. @@ -809,7 +809,7 @@ struct push_back_motif { void Mussa::load_motifs(std::istream &in) { std::string data; - const char *alphabet = Sequence::nucleic_iupac_alphabet.c_str(); + const char *alphabet = Alphabet::nucleic_alphabet.c_str(); string seq; string name; float red = 1.0; diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 845a016..ae58e5c 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -34,8 +34,10 @@ namespace spirit = boost::spirit; #include "mussa_exceptions.hpp" #include +#include #include #include +#include annot::annot() : begin(0), @@ -75,15 +77,10 @@ motif::~motif() { } -const std::string Sequence::dna_alphabet("AaCcGgTtNn\012\015"); -const std::string Sequence::rna_alphabet("AaCcGgNnUu\012\015"); - //! this is the general iupac alphabet for nucleotides -const std::string Sequence::nucleic_iupac_alphabet("AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015"); - //! the protein alphabet -const std::string Sequence::protein_alphabet("AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"); -Sequence::Sequence() +Sequence::Sequence(alphabet_ref alphabet_) : parent(0), + alphabet(alphabet_), seq_start(0), seq_count(0), strand(UnknownStrand) @@ -94,31 +91,34 @@ Sequence::~Sequence() { } -Sequence::Sequence(const char *seq) +Sequence::Sequence(const char *seq, alphabet_ref alphabet_) : parent(0), + alphabet(alphabet_), seq_start(0), seq_count(0), strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq); + set_filtered_sequence(seq, alphabet); } -Sequence::Sequence(const std::string& seq) +Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_) : parent(0), + alphabet(alphabet_), seq_start(0), seq_count(0), strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq); + set_filtered_sequence(seq, alphabet); } Sequence::Sequence(const Sequence& o) : parent(o.parent), seq(o.seq), + alphabet(o.alphabet), seq_start(o.seq_start), seq_count(o.seq_count), strand(o.strand), @@ -134,6 +134,7 @@ Sequence &Sequence::operator=(const Sequence& s) if (this != &s) { parent = s.parent; seq = s.seq; + alphabet = s.alphabet; seq_start = s.seq_start; seq_count = s.seq_count; strand = s.strand; @@ -161,16 +162,14 @@ static void multiplatform_getline(std::istream& in, std::string& line) } } +void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index) +{ + load_fasta(file_path, alphabet, seq_num, start_index, end_index); +} + //! load a fasta file into a sequence -/*! - * \param file_path the location of the fasta file in the filesystem - * \param seq_num which sequence in the file to load - * \param start_index starting position in the fasta sequence, 0 for beginning - * \param end_index ending position in the fasta sequence, 0 for end - * \return error message, empty string if no error. (gag!) - */ -void Sequence::load_fasta(fs::path file_path, int seq_num, - int start_index, int end_index) +void Sequence::load_fasta(fs::path file_path, alphabet_ref a, + int seq_num, int start_index, int end_index) { fs::fstream data_file; data_file.open(file_path, std::ios::in); @@ -180,7 +179,7 @@ void Sequence::load_fasta(fs::path file_path, int seq_num, throw mussa_load_error("Sequence File: "+file_path.string()+" not found"); } else { try { - load_fasta(data_file, seq_num, start_index, end_index); + load_fasta(data_file, a, seq_num, start_index, end_index); } catch(sequence_empty_error e) { // there doesn't appear to be any sequence // catch and rethrow to include the filename @@ -198,8 +197,15 @@ void Sequence::load_fasta(fs::path file_path, int seq_num, } } +void Sequence::load_fasta(std::iostream& file, + int seq_num, int start_index, int end_index) +{ + load_fasta(file, alphabet, seq_num, start_index, end_index); +} + void -Sequence::load_fasta(std::iostream& data_file, int seq_num, +Sequence::load_fasta(std::iostream& data_file, alphabet_ref a, + int seq_num, int start_index, int end_index) { std::string file_data_line; @@ -208,6 +214,7 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num, std::string rev_comp; std::string sequence_raw; std::string seq_tmp; // holds sequence during basic filtering + const Alphabet &alpha = get_alphabet(a); if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); @@ -230,7 +237,18 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num, multiplatform_getline(data_file,file_data_line); if (file_data_line.substr(0,1) == ">") read_seq = false; - else sequence_raw += file_data_line; + else { + for (std::string::const_iterator line_i = file_data_line.begin(); + line_i != file_data_line.end(); + ++line_i) + { + if(alpha.exists(*line_i)) { + sequence_raw += *line_i; + } else { + throw sequence_invalid_load_error("Unrecognized characters in fasta sequence"); + } + } + } } // Lastly, if subselection of the sequence was specified we keep cut out @@ -244,50 +262,35 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num, std::string msg("The selected sequence appears to be empty"); throw sequence_empty_error(msg); } - set_filtered_sequence(sequence_raw, start_index, end_index-start_index); + set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index); } else { std::string errormsg("There were no fasta sequences"); throw sequence_empty_file_error(errormsg); } } -void Sequence::set_filtered_sequence(const std::string &old_seq, +void Sequence::set_filtered_sequence(const std::string &in_seq, + alphabet_ref alphabet_, size_type start, size_type count, strand_type strand_) { - char conversionTable[257]; - + alphabet = alphabet_; if ( count == npos) - count = old_seq.size() - start; + count = in_seq.size() - start; boost::shared_ptr new_seq(new seq_string); new_seq->reserve(count); - // Make a conversion table - - // everything we don't specify below will become 'N' - for(int table_i=0; table_i < 256; table_i++) - { - conversionTable[table_i] = 'N'; - } - // add end of string character for printing out table for testing purposes - conversionTable[256] = '\0'; - - // we want these to map to themselves - ie not to change - conversionTable[(int)'A'] = 'A'; - conversionTable[(int)'T'] = 'T'; - conversionTable[(int)'G'] = 'G'; - conversionTable[(int)'C'] = 'C'; - // this is to upcase - conversionTable[(int)'a'] = 'A'; - conversionTable[(int)'t'] = 'T'; - conversionTable[(int)'g'] = 'G'; - conversionTable[(int)'c'] = 'C'; - // finally, the actual conversion loop - for(std::string::size_type seq_index = 0; seq_index < count; seq_index++) + const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets + std::string::const_iterator seq_i = in_seq.begin()+start; + for(size_type i = 0; i != count; ++i, ++seq_i) { - new_seq->append(1, conversionTable[ (int)old_seq[seq_index+start]]); + if (alpha_impl.exists(*seq_i)) { + new_seq->append(1, *seq_i); + } else { + new_seq->append(1, 'N'); + } } parent = 0; seq = new_seq; @@ -450,7 +453,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) ((spirit::ch_p('>')|spirit::str_p(">")) >> (*(spirit::print_p))[spirit::assign_a(name)] >> spirit::eol_p >> - (+(spirit::chset<>(nucleic_iupac_alphabet.c_str())))[spirit::assign_a(seq)] + (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)] )[push_back_seq(query_seqs, name, seq)] ) >> *spirit::space_p @@ -523,44 +526,54 @@ Sequence::subseq(int start, int count) return new_seq; } -std::string -Sequence::rev_comp() const +std::string Sequence::create_reverse_map() const { - std::string rev_comp; - char conversionTable[257]; - int seq_i, table_i, len; - - len = length(); - rev_comp.reserve(len); - // make a conversion table - // init all parts of conversion table to '~' character - // '~' I doubt will ever appear in a sequence file (jeez, I hope) - // and may the fleas of 1000 camels infest the genitals of any biologist (and - // seven generations of their progeny) who decides to make it mean - // something special!!! - // PS - double the curse for any smartass non-biologist who tries it as well - for(table_i=0; table_i < 256; table_i++) - { - conversionTable[table_i] = '~'; - } - // add end of string character for printing out table for testing purposes - conversionTable[256] = '\0'; - - // add in the characters for the bases we want to convert - conversionTable[(int)'A'] = 'T'; - conversionTable[(int)'T'] = 'A'; - conversionTable[(int)'G'] = 'C'; - conversionTable[(int)'C'] = 'G'; - conversionTable[(int)'N'] = 'N'; + std::string rc_map(256, '~'); + // if we're rna, use U instead of T + // we might want to add an "is_rna" to sequence at somepoint + char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T'; + char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't'; + rc_map['A'] = TU ; rc_map['a'] = tu ; + rc_map['T'] = 'A'; rc_map['t'] = 'a'; + rc_map['U'] = 'A'; rc_map['u'] = 'a'; + rc_map['G'] = 'C'; rc_map['g'] = 'c'; + rc_map['C'] = 'G'; rc_map['c'] = 'g'; + rc_map['M'] = 'K'; rc_map['m'] = 'k'; + rc_map['R'] = 'Y'; rc_map['r'] = 'y'; + rc_map['W'] = 'W'; rc_map['w'] = 'w'; + rc_map['S'] = 'S'; rc_map['s'] = 's'; + rc_map['Y'] = 'R'; rc_map['y'] = 'r'; + rc_map['K'] = 'M'; rc_map['k'] = 'm'; + rc_map['V'] = 'B'; rc_map['v'] = 'b'; + rc_map['H'] = 'D'; rc_map['h'] = 'd'; + rc_map['D'] = 'H'; rc_map['d'] = 'h'; + rc_map['B'] = 'V'; rc_map['b'] = 'v'; + rc_map['N'] = 'N'; rc_map['n'] = 'n'; + rc_map['X'] = 'X'; rc_map['x'] = 'x'; + rc_map['?'] = '?'; + rc_map['.'] = '.'; + rc_map['-'] = '-'; + rc_map['~'] = '~'; // not really needed, but perhaps it's clearer. + return rc_map; +} - // finally, the actual conversion loop - for(seq_i = len - 1; seq_i >= 0; seq_i--) +Sequence Sequence::rev_comp() const +{ + std::string rev_comp; + rev_comp.reserve(length()); + + std::string rc_map = create_reverse_map(); + + // reverse and convert + seq_string::const_reverse_iterator seq_i; + seq_string::const_reverse_iterator seq_end = seq->rend(); + for(seq_i = seq->rbegin(); + seq_i != seq_end; + ++seq_i) { - table_i = (int) at(seq_i); - rev_comp += conversionTable[table_i]; + rev_comp.append(1, rc_map[*seq_i]); } - - return rev_comp; + return Sequence(rev_comp, alphabet); } void Sequence::set_fasta_header(std::string header_) @@ -596,9 +609,33 @@ Sequence::get_name() const return ""; } -void Sequence::set_sequence(const std::string& s) +const Alphabet& Sequence::get_alphabet() const +{ + return get_alphabet(alphabet); +} + +const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const { - set_filtered_sequence(s); + switch (alpha) { + case reduced_dna_alphabet: + return Alphabet::reduced_dna_alphabet; + case reduced_rna_alphabet: + return Alphabet::reduced_rna_alphabet; + case reduced_nucleic_alphabet: + return Alphabet::reduced_nucleic_alphabet; + case nucleic_alphabet: + return Alphabet::nucleic_alphabet; + case protein_alphabet: + return Alphabet::protein_alphabet; + default: + throw std::runtime_error("unrecognized alphabet type"); + break; + } +} + +void Sequence::set_sequence(const std::string& s, alphabet_ref a) +{ + set_filtered_sequence(s, a); } std::string Sequence::get_sequence() const @@ -736,7 +773,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) } getline(load_file, file_data_line); // looks like the sequence is written as a single line - set_filtered_sequence(file_data_line); + set_filtered_sequence(file_data_line, reduced_dna_alphabet); getline(load_file, file_data_line); getline(load_file, file_data_line); if (file_data_line == "") @@ -793,107 +830,6 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) load_file.close(); } -std::string -Sequence::rc_motif(std::string a_motif) const -{ - std::string rev_comp; - char conversionTable[257]; - int seq_i, table_i, len; - - len = a_motif.length(); - rev_comp.reserve(len); - - for(table_i=0; table_i < 256; table_i++) - { - conversionTable[table_i] = '~'; - } - // add end of std::string character for printing out table for testing purposes - conversionTable[256] = '\0'; - - // add in the characters for the bases we want to convert (IUPAC) - conversionTable[(int)'A'] = 'T'; - conversionTable[(int)'T'] = 'A'; - conversionTable[(int)'G'] = 'C'; - conversionTable[(int)'C'] = 'G'; - conversionTable[(int)'N'] = 'N'; - conversionTable[(int)'M'] = 'K'; - conversionTable[(int)'R'] = 'Y'; - conversionTable[(int)'W'] = 'W'; - conversionTable[(int)'S'] = 'S'; - conversionTable[(int)'Y'] = 'R'; - conversionTable[(int)'K'] = 'M'; - conversionTable[(int)'V'] = 'B'; - conversionTable[(int)'H'] = 'D'; - conversionTable[(int)'D'] = 'H'; - conversionTable[(int)'B'] = 'V'; - - // finally, the actual conversion loop - for(seq_i = len - 1; seq_i >= 0; seq_i--) - { - //std::cout << "** i = " << seq_i << " bp = " << - table_i = (int) a_motif[seq_i]; - rev_comp += conversionTable[table_i]; - } - - //std::cout << "seq: " << a_motif << std::endl; - //std::cout << "rc: " << rev_comp << std::endl; - - return rev_comp; -} - -std::string -Sequence::motif_normalize(const std::string& a_motif) -{ - std::string valid_motif; - int seq_i, len; - - len = a_motif.length(); - valid_motif.reserve(len); - - // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present. - // current nonIUPAC symbols are omitted, which is not reported atm - for(seq_i = 0; seq_i < len; seq_i++) - { - if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A')) - valid_motif += 'A'; - else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T')) - valid_motif += 'T'; - else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G')) - valid_motif += 'G'; - else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C')) - valid_motif += 'C'; - else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N')) - valid_motif += 'N'; - else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M')) - valid_motif += 'M'; - else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R')) - valid_motif += 'R'; - else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W')) - valid_motif += 'W'; - else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S')) - valid_motif += 'S'; - else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y')) - valid_motif += 'Y'; - else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K')) - valid_motif += 'G'; - else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V')) - valid_motif += 'V'; - else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H')) - valid_motif += 'H'; - else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D')) - valid_motif += 'D'; - else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B')) - valid_motif += 'B'; - else { - std::string msg = "Letter "; - msg += a_motif[seq_i]; - msg += " is not a valid IUPAC symbol"; - throw motif_normalize_error(msg); - } - } - //std::cout << "valid_motif is: " << valid_motif << std::endl; - return valid_motif; -} void Sequence::add_motif(const Sequence& a_motif) { @@ -918,107 +854,78 @@ const std::list& Sequence::motifs() const } std::vector -Sequence::find_motif(const std::string& a_motif) const +Sequence::find_motif(const Sequence& a_motif) const { std::vector motif_match_starts; - std::string norm_motif_rc; + Sequence norm_motif_rc; motif_match_starts.clear(); + // std::cout << "motif is: " << norm_motif << std::endl; - //std::cout << "motif is: " << a_motif << std::endl; - std::string norm_motif = motif_normalize(a_motif); - //std::cout << "motif is: " << a_motif << std::endl; - - if (norm_motif.size() > 0) + if (a_motif.size() > 0) { //std::cout << "Sequence: none blank motif\n"; - motif_scan(norm_motif, &motif_match_starts); + motif_scan(a_motif, &motif_match_starts); - norm_motif_rc = rc_motif(a_motif); + norm_motif_rc = a_motif.rev_comp();; // make sure not to do search again if it is a palindrome - if (norm_motif_rc != norm_motif) { + if (norm_motif_rc != a_motif) { motif_scan(norm_motif_rc, &motif_match_starts); } } return motif_match_starts; } -std::vector -Sequence::find_motif(const Sequence& a_motif) const -{ - return find_motif(a_motif.get_sequence()); -} - void -Sequence::motif_scan(std::string a_motif, std::vector * motif_match_starts) const +Sequence::motif_scan(const Sequence& a_motif, std::vector * motif_match_starts) const { // if there's no sequence we can't scan for it? // should this throw an exception? if (!seq) return; - std::string::const_iterator seq_c = seq->begin(); - std::string::size_type seq_i; - int motif_i, motif_len; + std::string::size_type seq_i = 0; + Sequence::size_type motif_i = 0; + Sequence::size_type motif_len = a_motif.length(); + Sequence::value_type motif_char; + Sequence::value_type seq_char; - //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl; - motif_len = a_motif.length(); - - //std::cout << "motif_length: " << motif_len << std::endl; - //std::cout << "RAAARRRRR\n"; - - motif_i = 0; - - //std::cout << "motif: " << a_motif << std::endl; - - //std::cout << "Sequence: motif, length= " << length << std::endl; - seq_i = 0; - while (seq_i < length()) + while (seq_i < seq->length()) { - //std::cout << seq_c[seq_i]; - //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " "; // this is pretty much a straight translation of Nora's python code // to match iupac letter codes - if (a_motif[motif_i] =='N') + motif_char = toupper(a_motif[motif_i]); + seq_char = toupper(seq->at(seq_i)); + if (motif_char =='N') motif_i++; - else if (a_motif[motif_i] == seq_c[seq_i]) + else if (motif_char == seq_char) motif_i++; - else if ((a_motif[motif_i] =='M') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C'))) + else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C'))) motif_i++; - else if ((a_motif[motif_i] =='R') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G'))) + else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G'))) motif_i++; - else if ((a_motif[motif_i] =='W') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T'))) + else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T'))) motif_i++; - else if ((a_motif[motif_i] =='S') && - ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G'))) + else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G'))) motif_i++; - else if ((a_motif[motif_i] =='Y') && - ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T'))) + else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T'))) motif_i++; - else if ((a_motif[motif_i] =='K') && - ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T'))) + else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T'))) motif_i++; - else if ((a_motif[motif_i] =='V') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') || - (seq_c[seq_i]=='G'))) + else if ((motif_char =='V') && + ((seq_char=='A') || (seq_char=='C') || (seq_char=='G'))) motif_i++; - else if ((a_motif[seq_i] =='H') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') || - (seq_c[seq_i]=='T'))) + else if ((motif_char =='H') && + ((seq_char=='A') || (seq_char=='C') || (seq_char=='T'))) motif_i++; - else if ((a_motif[motif_i] =='D') && - ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') || - (seq_c[seq_i]=='T'))) + else if ((motif_char =='D') && + ((seq_char=='A') || (seq_char=='G') || (seq_char=='T'))) motif_i++; - else if ((a_motif[motif_i] =='B') && - ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') || - (seq_c[seq_i]=='T'))) + else if ((motif_char =='B') && + ((seq_char=='C') || (seq_char=='G') || (seq_char=='T'))) motif_i++; - else { + // if a motif doesn't match, erase our current trial and try again seq_i -= motif_i; motif_i = 0; } @@ -1026,7 +933,6 @@ Sequence::motif_scan(std::string a_motif, std::vector * motif_match_starts) // end Nora stuff, now we see if a match is found this pass if (motif_i == motif_len) { - //std::cout << "!!"; annot new_motif; motif_match_starts->push_back(seq_i - motif_len + 1); motif_i = 0; @@ -1118,9 +1024,14 @@ bool operator==(const Sequence& x, const Sequence& y) // since the length of the two sequences is equal, we only need to // test one. for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) { - if (*xseq_i != *yseq_i) { + if (toupper(*xseq_i) != toupper(*yseq_i)) { return false; } } return true; } + +bool operator!=(const Sequence& x, const Sequence& y) +{ + return not operator==(x, y); +} diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 5010729..dc9d594 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -31,6 +31,8 @@ #include +#include "alg/alphabet.hpp" + // Sequence data class //! Attach annotation information to a sequence track @@ -87,6 +89,11 @@ BOOST_CLASS_EXPORT(motif); //! functions need the serialization support to be in-class. class seq_string : public std::string { +public: + typedef std::string::iterator iterator; + typedef std::string::reverse_iterator reverse_iterator; + typedef std::string::const_iterator const_iterator; + typedef std::string::const_reverse_iterator const_reverse_iterator; private: friend class boost::serialization::access; template @@ -94,7 +101,7 @@ private: //ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(std::string); ar & boost::serialization::make_nvp("bases", boost::serialization::base_object(*this) - ); + ); } }; @@ -102,43 +109,39 @@ private: class Sequence { public: + typedef std::string::value_type value_type; typedef std::string::difference_type difference_type; typedef std::string::iterator iterator; + typedef std::string::reverse_iterator reverse_iterator; typedef std::string::const_iterator const_iterator; + typedef std::string::const_reverse_iterator const_reverse_iterator; typedef std::string::reference reference; typedef std::string::const_reference const_reference; typedef std::string::size_type size_type; static const size_type npos = std::string::npos; enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand }; - - // some standard dna alphabets - // Include nl (\012), and cr (\015) to make sequence parsing eol - // convention independent. - - static const std::string dna_alphabet; - static const std::string rna_alphabet; - //! this is the general iupac alphabet for nucleotides - static const std::string nucleic_iupac_alphabet; - //! the protein alphabet - static const std::string protein_alphabet; - - Sequence(); - ~Sequence(); - Sequence(const char* seq); - Sequence(const std::string& seq); + enum alphabet_ref { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, + nucleic_alphabet, protein_alphabet }; + + Sequence(alphabet_ref a = reduced_nucleic_alphabet); + Sequence(const char* seq, alphabet_ref a = reduced_nucleic_alphabet); + Sequence(const std::string& seq, alphabet_ref a = reduced_nucleic_alphabet); Sequence(const Sequence& seq); + ~Sequence(); //! assignment to constant sequences Sequence &operator=(const Sequence&); friend std::ostream& operator<<(std::ostream&, const Sequence&); friend bool operator<(const Sequence&, const Sequence&); friend bool operator==(const Sequence&, const Sequence&); + friend bool operator!=(const Sequence&, const Sequence&); const_reference operator[](size_type) const; //! set sequence to a (sub)string containing nothing but AGCTN - void set_filtered_sequence(const std::string& seq, - size_type start=0, - size_type count=npos, + void set_filtered_sequence(const std::string& seq, + alphabet_ref a, + size_type start=0, + size_type count=npos, strand_type strand=UnknownStrand); //! retrive element at specific position @@ -164,11 +167,13 @@ public: //! return a subsequence, copying over any appropriate annotation Sequence subseq(int start=0, int count = std::string::npos); + //! reverse a character + std::string create_reverse_map() const; //! return a reverse compliment (this needs to be improved?) - std::string rev_comp() const; + Sequence rev_comp() const; //! set sequence (filtered) - void set_sequence(const std::string &); + void set_sequence(const std::string &, alphabet_ref); //! get sequence std::string get_sequence() const; //! set species name @@ -181,19 +186,32 @@ public: std::string get_fasta_header() const; //! get name (will return the first non-empty, of fasta_header, species) std::string get_name() const; + //! return a reference to whichever alphabet we're currently representing + const Alphabet& get_alphabet() const; + //! return a reference to whichever alphabet we're currently representing + const Alphabet& get_alphabet(alphabet_ref) const; + //! load sequence from fasta file using the sequences current alphabet + void load_fasta(const boost::filesystem::path file_path, int seq_num=1, + int start_index=0, int end_index=0); //! load sequence AGCT from fasta file //! \throw mussa_load_error //! \throw sequence_empty_error //! \throw sequence_empty_file_error - void load_fasta(const boost::filesystem::path file_path, int seq_num=1, - int start_index=0, int end_index=0); + void load_fasta(const boost::filesystem::path file_path, + alphabet_ref a, + int seq_num=1, + int start_index=0, int end_index=0); + void load_fasta(std::iostream& file, + int seq_num=1, int start_index=0, int end_index=0); //! load sequence from stream //! \throw mussa_load_error //! \throw sequence_empty_error //! \throw sequence_empty_file_error - void load_fasta(std::iostream& file, int seq_num=1, - int start_index=0, int end_index=0); + void load_fasta(std::iostream& file, + alphabet_ref a, + int seq_num=1, + int start_index=0, int end_index=0); //! load sequence annotations //! \throws mussa_load_error void load_annot(const boost::filesystem::path file_path, int start_index, int end_index); @@ -207,20 +225,12 @@ public: const std::list& motifs() const; //! add a motif to our list of motifs - //! \throws motif_normalize_error if there's something wrong with a_motif void add_motif(const Sequence& a_motif); //! clear our list of found motifs void clear_motifs(); //! search a sequence for a_motif //! \throws motif_normalize_error if there's something wrong with a_motif - std::vector find_motif(const std::string& a_motif) const; - //! search a sequence for a_motif - //! \throws motif_normalize_error if there's something wrong with a_motif - std::vector find_motif(const Sequence& a_motif) const; - //! convert IUPAC symbols to upperase - //! \throws motif_normalize_error if there is an invalid symbol - static std::string motif_normalize(const std::string& a_motif); - + std::vector find_motif(const Sequence& a_motif) const; //! annotate the current sequence with other sequences void find_sequences(std::list::iterator start, std::list::iterator end); @@ -233,6 +243,8 @@ private: Sequence *parent; //! hold a shared pointer to our sequence string boost::shared_ptr seq; + //! which alphabet we're using + alphabet_ref alphabet; //! start offset into the sequence size_type seq_start; //! number of basepairs of the shared sequence we represent @@ -249,7 +261,7 @@ private: //! a seperate list for motifs since we're currently not saving them std::list motif_list; - void motif_scan(std::string a_motif, std::vector * motif_match_starts) const; + void motif_scan(const Sequence& a_motif, std::vector * motif_match_starts) const; std::string rc_motif(std::string a_motif) const; //! look for a string sequence type and and it to an annotation list void add_string_annotation(std::string a_seq, std::string name); @@ -260,6 +272,7 @@ private: void serialize(Archive& ar, const unsigned int /*version*/) { ar & BOOST_SERIALIZATION_NVP(parent); ar & BOOST_SERIALIZATION_NVP(seq); + ar & BOOST_SERIALIZATION_NVP(alphabet); ar & BOOST_SERIALIZATION_NVP(seq_start); ar & BOOST_SERIALIZATION_NVP(seq_count); ar & BOOST_SERIALIZATION_NVP(strand); diff --git a/alg/test/CMakeLists.txt b/alg/test/CMakeLists.txt index 64767bb..7e5b7a8 100644 --- a/alg/test/CMakeLists.txt +++ b/alg/test/CMakeLists.txt @@ -2,10 +2,19 @@ FIND_PACKAGE(OpenGL) INCLUDE(FindBoost) INCLUDE(Platform) -SET(SOURCES test_annotation_color.cpp test_color.cpp test_conserved_path.cpp - test_flp.cpp test_glseqbrowser.cpp test_glsequence.cpp - test_main.cpp test_mussa.cpp test_nway.cpp - test_sequence.cpp test_sequence_location.cpp ) +SET(SOURCES + test_alphabet.cpp + test_annotation_color.cpp + test_color.cpp + test_conserved_path.cpp + test_flp.cpp + test_glseqbrowser.cpp + test_glsequence.cpp + test_main.cpp + test_mussa.cpp + test_nway.cpp + test_sequence.cpp + test_sequence_location.cpp ) GET_MUSSA_COMPILE_FLAGS(ALG_TEST_CFLAGS) GET_MUSSA_LINK_FLAGS(ALG_TEST_LDFLAGS) diff --git a/alg/test/test_alphabet.cpp b/alg/test/test_alphabet.cpp new file mode 100644 index 0000000..bfe6e3e --- /dev/null +++ b/alg/test/test_alphabet.cpp @@ -0,0 +1,21 @@ +#include + +#include +#include +#include +#include + +#include "alg/alphabet.hpp" +#include "mussa_exceptions.hpp" + +BOOST_AUTO_TEST_CASE( alphabet_simple ) +{ + const Alphabet &a = Alphabet::reduced_dna_alphabet; + // exists is case insensitive + BOOST_CHECK_EQUAL( a.exists('a'), true); + BOOST_CHECK_EQUAL( a.exists('A'), true); + BOOST_CHECK_EQUAL( a.exists('Q'), false); + BOOST_CHECK_EQUAL( a.exists('q'), false); + + BOOST_CHECK_EQUAL( a.c_str(), "AaCcGgTtNn\012\015"); // copied from alphabet.cpp +} diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index c588555..d4b2821 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -65,33 +65,66 @@ BOOST_AUTO_TEST_CASE( sequence_eol_conventions ) BOOST_AUTO_TEST_CASE( sequence_filter ) { const char *core_seq = "AATTGGCC"; - Sequence s1(core_seq); + Sequence s1(core_seq, Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL(s1, core_seq); - Sequence s2("aattggcc"); + Sequence s2("aattggcc", Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL(s2, "AATTGGCC"); BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT"); + BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now BOOST_CHECK_EQUAL(s2.size(), s2.size()); - BOOST_CHECK_EQUAL(s2.get_sequence(), core_seq); + BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc"); - Sequence s3("asdfg"); + Sequence s3("asdfg", Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL(s3, "ANNNG"); BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN"); - s3.set_filtered_sequence("AAGGCCTT", 0, 2); + s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2); BOOST_CHECK_EQUAL(s3, "AA"); - s3.set_filtered_sequence("AAGGCCTT", 2, 2); + s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2); BOOST_CHECK_EQUAL( s3, "GG"); - s3.set_filtered_sequence("AAGGCCTT", 4); + s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4); BOOST_CHECK_EQUAL( s3, "CCTT"); s3 = "AAGGFF"; BOOST_CHECK_EQUAL(s3, "AAGGNN"); } +BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet ) +{ + std::string agct("AGCT"); + Sequence seq(agct, Sequence::nucleic_alphabet); + BOOST_CHECK_EQUAL(seq.size(), agct.size()); + BOOST_CHECK_EQUAL(seq.get_sequence(), agct); + + std::string bdv("BDv"); + Sequence seq_bdv(bdv, Sequence::nucleic_alphabet); + BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size()); + BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv); + +} + +BOOST_AUTO_TEST_CASE( sequence_default_alphabet ) +{ + std::string agct("AGCT"); + Sequence seq(agct); + BOOST_CHECK_EQUAL(seq.size(), agct.size()); + BOOST_CHECK_EQUAL(seq.get_sequence(), agct); + BOOST_CHECK_EQUAL(seq[0], agct[0]); + BOOST_CHECK_EQUAL(seq[1], agct[1]); + BOOST_CHECK_EQUAL(seq[2], agct[2]); + BOOST_CHECK_EQUAL(seq[3], agct[3]); + + std::string bdv("BDv"); + Sequence seq_bdv(bdv); + BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size()); + // default alphabet only allows AGCTUN + BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN"); +} + BOOST_AUTO_TEST_CASE( subseq_names ) { - Sequence s1("AAGGCCTT"); + Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet); s1.set_species("species"); s1.set_fasta_header("a fasta header"); Sequence s2 = s1.subseq(2,2); @@ -107,7 +140,7 @@ BOOST_AUTO_TEST_CASE( sequence_start_stop ) BOOST_CHECK_EQUAL( s1.stop(), 0 ); std::string seq_string("AAGGCCTT"); - Sequence s2(seq_string); + Sequence s2(seq_string, Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL( s2.start(), 0 ); BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() ); @@ -132,7 +165,7 @@ BOOST_AUTO_TEST_CASE( sequence_load ) fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq"); seq_path /= "human_mck_pro.fa"; Sequence s; - s.load_fasta(seq_path); + s.load_fasta(seq_path, Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC"); BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human " @@ -140,6 +173,113 @@ BOOST_AUTO_TEST_CASE( sequence_load ) "5' flank"); } +BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced ) +{ + std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n"); + std::stringstream reduced_dna_fasta(reduced_dna_fasta_string); + std::string invalid_dna_fasta_string(">wrong\nAUSSI\n"); + std::stringstream invalid_dna_fasta(invalid_dna_fasta_string); + std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n"); + std::stringstream reduced_rna_fasta(reduced_rna_fasta_string); + std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n"); + std::stringstream garbage_fasta(garbage_string); + + Sequence s; + s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet); + BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta, + Sequence::reduced_dna_alphabet), + sequence_invalid_load_error); + BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta, + Sequence::reduced_dna_alphabet), + sequence_invalid_load_error); + BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, + Sequence::reduced_dna_alphabet), + sequence_invalid_load_error); + +} + +BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced ) +{ + std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n"); + std::stringstream reduced_rna_fasta(reduced_rna_fasta_string); + std::string invalid_rna_fasta_string(">wrong\nATSSI\n"); + std::stringstream invalid_rna_fasta(invalid_rna_fasta_string); + std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n"); + std::stringstream reduced_dna_fasta(reduced_dna_fasta_string); + std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n"); + std::stringstream garbage_fasta(garbage_string); + + Sequence s; + s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet); + BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta, + Sequence::reduced_rna_alphabet), + sequence_invalid_load_error); + BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta, + Sequence::reduced_rna_alphabet), + sequence_invalid_load_error); + BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, + Sequence::reduced_rna_alphabet), + sequence_invalid_load_error); +} + +BOOST_AUTO_TEST_CASE( sequence_load_fasta_default ) +{ + std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n"); + std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string); + std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string); + std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n"); + std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string); + std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n"); + std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string); + std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string); + std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n"); + std::stringstream garbage_fasta(garbage_string); + + Sequence s; + Sequence specific; + // there's two copies of reduced_rna_fasta because i didn't feel like + // figuring out how to properly reset the read pointer in a stringstream + s.load_fasta(reduced_rna_fasta1); + specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet); + BOOST_CHECK_EQUAL(s, specific); + + s.load_fasta(reduced_dna_fasta1); + specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet); + BOOST_CHECK_EQUAL(s, specific); + + BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta), + sequence_invalid_load_error); + BOOST_CHECK_THROW(s.load_fasta(garbage_fasta), + sequence_invalid_load_error); +} + +BOOST_AUTO_TEST_CASE( sequence_reverse_complement ) +{ + std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?"); + Sequence seq(iupac_symbols, Sequence::nucleic_alphabet); + Sequence seqr = seq.rev_comp(); + + BOOST_CHECK( seq != seqr ); + BOOST_CHECK_EQUAL( seq, seqr.rev_comp() ); + BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols ); +} + +BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna ) +{ + std::string dna_str("AGCTN"); + Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet); + BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT"); + BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp()); +} + +BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna ) +{ + std::string rna_str("AGCUN"); + Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet); + BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU"); + BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp()); +} + BOOST_AUTO_TEST_CASE( annotation_load ) { string annot_data = "human\n" @@ -156,7 +296,7 @@ BOOST_AUTO_TEST_CASE( annotation_load ) ; string s(100, 'A'); s += "GCTGCTAATT"; - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); @@ -207,7 +347,7 @@ BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load) "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG" "AAAAA" "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC"; - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); seq.parse_annot(annot_data); std::list annots = seq.annotations(); BOOST_CHECK_EQUAL( annots.size(), 2); @@ -228,7 +368,7 @@ BOOST_AUTO_TEST_CASE( annotation_load_no_species_name ) ; string s(100, 'A'); s += "GCTGCTAATT"; - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); @@ -279,12 +419,12 @@ BOOST_AUTO_TEST_CASE ( sequence_size ) BOOST_AUTO_TEST_CASE( sequence_empty_equality ) { - Sequence szero(""); + Sequence szero("", Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL(szero.empty(), true); BOOST_CHECK_EQUAL(szero, szero); BOOST_CHECK_EQUAL(szero, ""); - Sequence sclear("AGCT"); + Sequence sclear("AGCT", Sequence::reduced_dna_alphabet); sclear.clear(); BOOST_CHECK_EQUAL(sclear.empty(), true); BOOST_CHECK_EQUAL(sclear, sclear); @@ -295,7 +435,7 @@ BOOST_AUTO_TEST_CASE( sequence_empty_equality ) BOOST_AUTO_TEST_CASE ( sequence_iterators ) { std::string seq_string = "AAGGCCTTNNTATA"; - Sequence s(seq_string); + Sequence s(seq_string, Sequence::reduced_dna_alphabet); const Sequence cs(s); std::string::size_type count = 0; @@ -324,7 +464,7 @@ BOOST_AUTO_TEST_CASE( sequence_motifs ) { string m("AAAA"); string bogus("AATTGGAA"); - Sequence s1("AAAAGGGGCCCCTTTT"); + Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet); list::const_iterator motif_i = s1.motifs().begin(); list::const_iterator motif_end = s1.motifs().end(); @@ -401,7 +541,7 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence ) "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); string gc("GCCCCC"); string gga("GGACACCTC"); - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); std::list query_list; std::list string_list; @@ -428,6 +568,13 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence ) } } BOOST_CHECK_EQUAL(seq.annotations().size(), count); + const std::list &a = seq.annotations(); + for (std::list::const_iterator annot_i = a.begin(); + annot_i != a.end(); + ++annot_i) + { + int count = annot_i->end - annot_i->begin ; + } } BOOST_AUTO_TEST_CASE( subseq_annotation_test ) @@ -440,7 +587,7 @@ BOOST_AUTO_TEST_CASE( subseq_annotation_test ) "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA" "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA" "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); seq.add_annotation(annot(0, 10, "0-10", "0-10")); @@ -477,7 +624,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update ) "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA" "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA" "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); // starting conditions BOOST_CHECK_EQUAL(seq.annotations().size(), 0); @@ -498,7 +645,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update ) BOOST_AUTO_TEST_CASE( out_operator ) { string s("AAGGCCTT"); - Sequence seq(s); + Sequence seq(s, Sequence::reduced_dna_alphabet); ostringstream buf; buf << s; @@ -507,7 +654,7 @@ BOOST_AUTO_TEST_CASE( out_operator ) BOOST_AUTO_TEST_CASE( get_name ) { - Sequence seq("AAGGCCTT"); + Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet); BOOST_CHECK_EQUAL( seq.get_name(), "" ); seq.set_species("hooman"); // anyone remember tradewars? @@ -519,7 +666,7 @@ BOOST_AUTO_TEST_CASE( get_name ) BOOST_AUTO_TEST_CASE( serialize_simple ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string); + Sequence seq(seq_string, Sequence::reduced_dna_alphabet); seq.set_species("ribbet"); std::ostringstream oss; // allocate/deallocate serialization components @@ -542,7 +689,7 @@ BOOST_AUTO_TEST_CASE( serialize_simple ) BOOST_AUTO_TEST_CASE( serialize_tree ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string); + Sequence seq(seq_string, Sequence::reduced_dna_alphabet); seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); @@ -572,7 +719,7 @@ BOOST_AUTO_TEST_CASE( serialize_tree ) BOOST_AUTO_TEST_CASE( serialize_xml_sequence ) { std::string seq_string = "AAGGCCTT"; - Sequence seq(seq_string); + Sequence seq(seq_string, Sequence::reduced_dna_alphabet); seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); @@ -599,7 +746,7 @@ BOOST_AUTO_TEST_CASE( serialize_xml_sequence ) BOOST_AUTO_TEST_CASE( serialize_xml_two ) { std::string seq_string = "AAGGCCTT"; - Sequence seq1(seq_string); + Sequence seq1(seq_string, Sequence::reduced_dna_alphabet); Sequence seq2(seq1); std::ostringstream oss; diff --git a/mussa_exceptions.hpp b/mussa_exceptions.hpp index 9adb529..065347a 100644 --- a/mussa_exceptions.hpp +++ b/mussa_exceptions.hpp @@ -55,7 +55,13 @@ public: sequence_load_error(msg) {}; }; - +// Empty unrecognized characters +class sequence_invalid_load_error : public sequence_load_error +{ +public: + explicit sequence_invalid_load_error(const std::string& msg) : + sequence_load_error(msg) {}; +}; //! failure running analysis class mussa_analysis_error : public mussa_error { diff --git a/qui/motif_editor/MotifElement.cpp b/qui/motif_editor/MotifElement.cpp index 5bbda0e..1804e94 100644 --- a/qui/motif_editor/MotifElement.cpp +++ b/qui/motif_editor/MotifElement.cpp @@ -1,11 +1,12 @@ #include "qui/motif_editor/MotifEditor.hpp" +#include "alg/alphabet.hpp" #include MotifElement::MotifElement() : enabled(true), color(Color(1.0,0.0,0.0,1.0)), - motif() + motif(Sequence::nucleic_alphabet) { } @@ -76,7 +77,7 @@ void MotifElement::setSequence(const Sequence& seq) //! set sequence text void MotifElement::setSequence(const std::string& seq_text) { - motif.set_sequence(seq_text); + motif.set_sequence(seq_text, Sequence::nucleic_alphabet); } QString MotifElement::getSequenceText() const diff --git a/qui/mussa_setup_dialog/MussaSetupWidget.cpp b/qui/mussa_setup_dialog/MussaSetupWidget.cpp index 55d6199..4e7b37b 100644 --- a/qui/mussa_setup_dialog/MussaSetupWidget.cpp +++ b/qui/mussa_setup_dialog/MussaSetupWidget.cpp @@ -69,10 +69,10 @@ MussaSetupWidget::MussaSetupWidget(QWidget *parent) row1Layout->addWidget(analysisNameLabel); row1Layout->addWidget(&analysisNameLineEdit); - row2Layout->addWidget(windowLabel); - row2Layout->addWidget(&windowLineEdit); row2Layout->addWidget(thresholdLabel); row2Layout->addWidget(&thresholdLineEdit); + row2Layout->addWidget(windowLabel); + row2Layout->addWidget(&windowLineEdit); row2Layout->addWidget(numOfSequencesLabel); row2Layout->addWidget(&numOfSequencesSpinBox); -- 2.30.2