)
QT4_WRAP_CPP(MOC_SOURCES ${MOC_HEADERS})
-SET(SOURCES annotation_colors.cpp
+SET(SOURCES alphabet.cpp
+ annotation_colors.cpp
color.cpp
conserved_path.cpp
flp.cpp
--- /dev/null
+#include "alg/alphabet.hpp"
+
+// some standard dna alphabets
+// Include nl (\012), and cr (\015) to make sequence parsing eol
+// convention independent.
+const Alphabet Alphabet::reduced_dna_alphabet("AaCcGgTtNn\012\015");
+const Alphabet Alphabet::reduced_rna_alphabet("AaCcGgUuNn\012\015");
+const Alphabet Alphabet::reduced_nucleic_alphabet("AaCcGgTtUuNn\012\015");
+//! this is the general iupac alphabet for nucleotides
+const Alphabet Alphabet::nucleic_alphabet(
+ "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015"
+);
+//! the protein alphabet
+const Alphabet Alphabet::protein_alphabet(
+ "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"
+);
+
+Alphabet::Alphabet(const char *a) :
+ alphabet(a)
+{
+ alphabet_set.insert(alphabet.begin(), alphabet.end());
+}
+
+void Alphabet::assign(const Alphabet& a)
+{
+ alphabet = a.alphabet;
+ alphabet_set.clear();
+ alphabet_set.insert(alphabet.begin(), alphabet.end());
+}
+
+const char *Alphabet::c_str() const
+{
+ alphabet.c_str();
+}
+
+bool Alphabet::exists(const char c) const
+{
+ return (alphabet_set.find(c) != alphabet_set.end());
+}
\ No newline at end of file
--- /dev/null
+#ifndef ALPHABET_HPP_
+#define ALPHABET_HPP_
+
+#include <boost/serialization/export.hpp>
+#include <boost/serialization/nvp.hpp>
+#include <boost/serialization/string.hpp>
+#include <boost/serialization/utility.hpp>
+#include <boost/serialization/version.hpp>
+
+#include <set>
+
+//! this is a helper class for sequence
+class Alphabet {
+friend class Sequence;
+public:
+ typedef std::string::const_iterator const_iterator;
+
+ //! return reference to the characters in our alphabet
+ const char *c_str() const;
+ //! case-insensitive test to check a character for existence in our alphabet
+ bool exists(const char) const;
+
+ // note, if you want to define an alphabet for a sequence, you probably want
+ // to update the enumeration in Sequence, and Sequence::get_sequence
+ //! The standard DNA alphabet, with unique, and unknown characters
+ static const Alphabet reduced_dna_alphabet;
+ //! The standard RNA alphabet, with unique, and unknown characters
+ static const Alphabet reduced_rna_alphabet;
+ //! The standard DNA/RNA alphabet, with unique, and unknown characters
+ static const Alphabet reduced_nucleic_alphabet;
+ //! this is the general IUPAC alphabet for nucleotides
+ static const Alphabet nucleic_alphabet;
+ //! the protein alphabet
+ static const Alphabet protein_alphabet;
+
+private:
+ //! what are allowable symbols in our alphabet
+ std::string alphabet;
+ //! internal variable to make exists() faster
+ std::set<std::string::value_type> alphabet_set;
+
+ //! some necessary string api access
+ Alphabet(const char *a);
+ //! allow sequence to copy one alphabet to another (needed when unserializing)
+ void assign(const Alphabet& a);
+ const_iterator begin() const { return alphabet.begin(); }
+ const_iterator end() const { return alphabet.end(); }
+
+ // serialization
+ friend class boost::serialization::access;
+ template<class Archive>
+ void serialize(Archive& ar, const unsigned int /*version*/) {
+ ar & BOOST_SERIALIZATION_NVP(alphabet);
+ alphabet_set.clear();
+ alphabet_set.insert(alphabet.begin(), alphabet.end());
+ }
+};
+
+#endif /*ALPHABET_*/
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
- Sequence seq(seq_string);
+ Sequence seq(seq_string, Sequence::nucleic_alphabet);
// shouldn't we have a better field than "fasta header" and speices?
seq.set_fasta_header(name);
// we need to clear the name in case the next motif doesn't have one.
void Mussa::load_motifs(std::istream &in)
{
std::string data;
- const char *alphabet = Sequence::nucleic_iupac_alphabet.c_str();
+ const char *alphabet = Alphabet::nucleic_alphabet.c_str();
string seq;
string name;
float red = 1.0;
#include "mussa_exceptions.hpp"
#include <string>
+#include <stdexcept>
#include <iostream>
#include <sstream>
+#include <set>
annot::annot()
: begin(0),
{
}
-const std::string Sequence::dna_alphabet("AaCcGgTtNn\012\015");
-const std::string Sequence::rna_alphabet("AaCcGgNnUu\012\015");
- //! this is the general iupac alphabet for nucleotides
-const std::string Sequence::nucleic_iupac_alphabet("AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015");
- //! the protein alphabet
-const std::string Sequence::protein_alphabet("AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015");
-Sequence::Sequence()
+Sequence::Sequence(alphabet_ref alphabet_)
: parent(0),
+ alphabet(alphabet_),
seq_start(0),
seq_count(0),
strand(UnknownStrand)
{
}
-Sequence::Sequence(const char *seq)
+Sequence::Sequence(const char *seq, alphabet_ref alphabet_)
: parent(0),
+ alphabet(alphabet_),
seq_start(0),
seq_count(0),
strand(UnknownStrand),
header(""),
species("")
{
- set_filtered_sequence(seq);
+ set_filtered_sequence(seq, alphabet);
}
-Sequence::Sequence(const std::string& seq)
+Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_)
: parent(0),
+ alphabet(alphabet_),
seq_start(0),
seq_count(0),
strand(UnknownStrand),
header(""),
species("")
{
- set_filtered_sequence(seq);
+ set_filtered_sequence(seq, alphabet);
}
Sequence::Sequence(const Sequence& o)
: parent(o.parent),
seq(o.seq),
+ alphabet(o.alphabet),
seq_start(o.seq_start),
seq_count(o.seq_count),
strand(o.strand),
if (this != &s) {
parent = s.parent;
seq = s.seq;
+ alphabet = s.alphabet;
seq_start = s.seq_start;
seq_count = s.seq_count;
strand = s.strand;
}
}
+void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
+{
+ load_fasta(file_path, alphabet, seq_num, start_index, end_index);
+}
+
//! load a fasta file into a sequence
-/*!
- * \param file_path the location of the fasta file in the filesystem
- * \param seq_num which sequence in the file to load
- * \param start_index starting position in the fasta sequence, 0 for beginning
- * \param end_index ending position in the fasta sequence, 0 for end
- * \return error message, empty string if no error. (gag!)
- */
-void Sequence::load_fasta(fs::path file_path, int seq_num,
- int start_index, int end_index)
+void Sequence::load_fasta(fs::path file_path, alphabet_ref a,
+ int seq_num, int start_index, int end_index)
{
fs::fstream data_file;
data_file.open(file_path, std::ios::in);
throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
} else {
try {
- load_fasta(data_file, seq_num, start_index, end_index);
+ load_fasta(data_file, a, seq_num, start_index, end_index);
} catch(sequence_empty_error e) {
// there doesn't appear to be any sequence
// catch and rethrow to include the filename
}
}
+void Sequence::load_fasta(std::iostream& file,
+ int seq_num, int start_index, int end_index)
+{
+ load_fasta(file, alphabet, seq_num, start_index, end_index);
+}
+
void
-Sequence::load_fasta(std::iostream& data_file, int seq_num,
+Sequence::load_fasta(std::iostream& data_file, alphabet_ref a,
+ int seq_num,
int start_index, int end_index)
{
std::string file_data_line;
std::string rev_comp;
std::string sequence_raw;
std::string seq_tmp; // holds sequence during basic filtering
+ const Alphabet &alpha = get_alphabet(a);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
multiplatform_getline(data_file,file_data_line);
if (file_data_line.substr(0,1) == ">")
read_seq = false;
- else sequence_raw += file_data_line;
+ else {
+ for (std::string::const_iterator line_i = file_data_line.begin();
+ line_i != file_data_line.end();
+ ++line_i)
+ {
+ if(alpha.exists(*line_i)) {
+ sequence_raw += *line_i;
+ } else {
+ throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
+ }
+ }
+ }
}
// Lastly, if subselection of the sequence was specified we keep cut out
std::string msg("The selected sequence appears to be empty");
throw sequence_empty_error(msg);
}
- set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
+ set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
} else {
std::string errormsg("There were no fasta sequences");
throw sequence_empty_file_error(errormsg);
}
}
-void Sequence::set_filtered_sequence(const std::string &old_seq,
+void Sequence::set_filtered_sequence(const std::string &in_seq,
+ alphabet_ref alphabet_,
size_type start,
size_type count,
strand_type strand_)
{
- char conversionTable[257];
-
+ alphabet = alphabet_;
if ( count == npos)
- count = old_seq.size() - start;
+ count = in_seq.size() - start;
boost::shared_ptr<seq_string> new_seq(new seq_string);
new_seq->reserve(count);
- // Make a conversion table
-
- // everything we don't specify below will become 'N'
- for(int table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = 'N';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // we want these to map to themselves - ie not to change
- conversionTable[(int)'A'] = 'A';
- conversionTable[(int)'T'] = 'T';
- conversionTable[(int)'G'] = 'G';
- conversionTable[(int)'C'] = 'C';
- // this is to upcase
- conversionTable[(int)'a'] = 'A';
- conversionTable[(int)'t'] = 'T';
- conversionTable[(int)'g'] = 'G';
- conversionTable[(int)'c'] = 'C';
-
// finally, the actual conversion loop
- for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
+ const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets
+ std::string::const_iterator seq_i = in_seq.begin()+start;
+ for(size_type i = 0; i != count; ++i, ++seq_i)
{
- new_seq->append(1, conversionTable[ (int)old_seq[seq_index+start]]);
+ if (alpha_impl.exists(*seq_i)) {
+ new_seq->append(1, *seq_i);
+ } else {
+ new_seq->append(1, 'N');
+ }
}
parent = 0;
seq = new_seq;
((spirit::ch_p('>')|spirit::str_p(">")) >>
(*(spirit::print_p))[spirit::assign_a(name)] >>
spirit::eol_p >>
- (+(spirit::chset<>(nucleic_iupac_alphabet.c_str())))[spirit::assign_a(seq)]
+ (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
)[push_back_seq(query_seqs, name, seq)]
) >>
*spirit::space_p
return new_seq;
}
-std::string
-Sequence::rev_comp() const
+std::string Sequence::create_reverse_map() const
{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = length();
- rev_comp.reserve(len);
- // make a conversion table
- // init all parts of conversion table to '~' character
- // '~' I doubt will ever appear in a sequence file (jeez, I hope)
- // and may the fleas of 1000 camels infest the genitals of any biologist (and
- // seven generations of their progeny) who decides to make it mean
- // something special!!!
- // PS - double the curse for any smartass non-biologist who tries it as well
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // add in the characters for the bases we want to convert
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
+ std::string rc_map(256, '~');
+ // if we're rna, use U instead of T
+ // we might want to add an "is_rna" to sequence at somepoint
+ char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T';
+ char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't';
+ rc_map['A'] = TU ; rc_map['a'] = tu ;
+ rc_map['T'] = 'A'; rc_map['t'] = 'a';
+ rc_map['U'] = 'A'; rc_map['u'] = 'a';
+ rc_map['G'] = 'C'; rc_map['g'] = 'c';
+ rc_map['C'] = 'G'; rc_map['c'] = 'g';
+ rc_map['M'] = 'K'; rc_map['m'] = 'k';
+ rc_map['R'] = 'Y'; rc_map['r'] = 'y';
+ rc_map['W'] = 'W'; rc_map['w'] = 'w';
+ rc_map['S'] = 'S'; rc_map['s'] = 's';
+ rc_map['Y'] = 'R'; rc_map['y'] = 'r';
+ rc_map['K'] = 'M'; rc_map['k'] = 'm';
+ rc_map['V'] = 'B'; rc_map['v'] = 'b';
+ rc_map['H'] = 'D'; rc_map['h'] = 'd';
+ rc_map['D'] = 'H'; rc_map['d'] = 'h';
+ rc_map['B'] = 'V'; rc_map['b'] = 'v';
+ rc_map['N'] = 'N'; rc_map['n'] = 'n';
+ rc_map['X'] = 'X'; rc_map['x'] = 'x';
+ rc_map['?'] = '?';
+ rc_map['.'] = '.';
+ rc_map['-'] = '-';
+ rc_map['~'] = '~'; // not really needed, but perhaps it's clearer.
+ return rc_map;
+}
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
+Sequence Sequence::rev_comp() const
+{
+ std::string rev_comp;
+ rev_comp.reserve(length());
+
+ std::string rc_map = create_reverse_map();
+
+ // reverse and convert
+ seq_string::const_reverse_iterator seq_i;
+ seq_string::const_reverse_iterator seq_end = seq->rend();
+ for(seq_i = seq->rbegin();
+ seq_i != seq_end;
+ ++seq_i)
{
- table_i = (int) at(seq_i);
- rev_comp += conversionTable[table_i];
+ rev_comp.append(1, rc_map[*seq_i]);
}
-
- return rev_comp;
+ return Sequence(rev_comp, alphabet);
}
void Sequence::set_fasta_header(std::string header_)
return "";
}
-void Sequence::set_sequence(const std::string& s)
+const Alphabet& Sequence::get_alphabet() const
+{
+ return get_alphabet(alphabet);
+}
+
+const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const
{
- set_filtered_sequence(s);
+ switch (alpha) {
+ case reduced_dna_alphabet:
+ return Alphabet::reduced_dna_alphabet;
+ case reduced_rna_alphabet:
+ return Alphabet::reduced_rna_alphabet;
+ case reduced_nucleic_alphabet:
+ return Alphabet::reduced_nucleic_alphabet;
+ case nucleic_alphabet:
+ return Alphabet::nucleic_alphabet;
+ case protein_alphabet:
+ return Alphabet::protein_alphabet;
+ default:
+ throw std::runtime_error("unrecognized alphabet type");
+ break;
+ }
+}
+
+void Sequence::set_sequence(const std::string& s, alphabet_ref a)
+{
+ set_filtered_sequence(s, a);
}
std::string Sequence::get_sequence() const
}
getline(load_file, file_data_line);
// looks like the sequence is written as a single line
- set_filtered_sequence(file_data_line);
+ set_filtered_sequence(file_data_line, reduced_dna_alphabet);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
load_file.close();
}
-std::string
-Sequence::rc_motif(std::string a_motif) const
-{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = a_motif.length();
- rev_comp.reserve(len);
-
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
- }
- // add end of std::string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // add in the characters for the bases we want to convert (IUPAC)
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
- conversionTable[(int)'M'] = 'K';
- conversionTable[(int)'R'] = 'Y';
- conversionTable[(int)'W'] = 'W';
- conversionTable[(int)'S'] = 'S';
- conversionTable[(int)'Y'] = 'R';
- conversionTable[(int)'K'] = 'M';
- conversionTable[(int)'V'] = 'B';
- conversionTable[(int)'H'] = 'D';
- conversionTable[(int)'D'] = 'H';
- conversionTable[(int)'B'] = 'V';
-
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
- {
- //std::cout << "** i = " << seq_i << " bp = " <<
- table_i = (int) a_motif[seq_i];
- rev_comp += conversionTable[table_i];
- }
-
- //std::cout << "seq: " << a_motif << std::endl;
- //std::cout << "rc: " << rev_comp << std::endl;
-
- return rev_comp;
-}
-
-std::string
-Sequence::motif_normalize(const std::string& a_motif)
-{
- std::string valid_motif;
- int seq_i, len;
-
- len = a_motif.length();
- valid_motif.reserve(len);
-
- // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
- // current nonIUPAC symbols are omitted, which is not reported atm
- for(seq_i = 0; seq_i < len; seq_i++)
- {
- if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
- valid_motif += 'A';
- else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
- valid_motif += 'T';
- else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
- valid_motif += 'C';
- else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
- valid_motif += 'N';
- else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
- valid_motif += 'M';
- else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
- valid_motif += 'R';
- else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
- valid_motif += 'W';
- else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
- valid_motif += 'S';
- else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
- valid_motif += 'Y';
- else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
- valid_motif += 'V';
- else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
- valid_motif += 'H';
- else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
- valid_motif += 'D';
- else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
- valid_motif += 'B';
- else {
- std::string msg = "Letter ";
- msg += a_motif[seq_i];
- msg += " is not a valid IUPAC symbol";
- throw motif_normalize_error(msg);
- }
- }
- //std::cout << "valid_motif is: " << valid_motif << std::endl;
- return valid_motif;
-}
void Sequence::add_motif(const Sequence& a_motif)
{
}
std::vector<int>
-Sequence::find_motif(const std::string& a_motif) const
+Sequence::find_motif(const Sequence& a_motif) const
{
std::vector<int> motif_match_starts;
- std::string norm_motif_rc;
+ Sequence norm_motif_rc;
motif_match_starts.clear();
+ // std::cout << "motif is: " << norm_motif << std::endl;
- //std::cout << "motif is: " << a_motif << std::endl;
- std::string norm_motif = motif_normalize(a_motif);
- //std::cout << "motif is: " << a_motif << std::endl;
-
- if (norm_motif.size() > 0)
+ if (a_motif.size() > 0)
{
//std::cout << "Sequence: none blank motif\n";
- motif_scan(norm_motif, &motif_match_starts);
+ motif_scan(a_motif, &motif_match_starts);
- norm_motif_rc = rc_motif(a_motif);
+ norm_motif_rc = a_motif.rev_comp();;
// make sure not to do search again if it is a palindrome
- if (norm_motif_rc != norm_motif) {
+ if (norm_motif_rc != a_motif) {
motif_scan(norm_motif_rc, &motif_match_starts);
}
}
return motif_match_starts;
}
-std::vector<int>
-Sequence::find_motif(const Sequence& a_motif) const
-{
- return find_motif(a_motif.get_sequence());
-}
-
void
-Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts) const
+Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
{
// if there's no sequence we can't scan for it?
// should this throw an exception?
if (!seq) return;
- std::string::const_iterator seq_c = seq->begin();
- std::string::size_type seq_i;
- int motif_i, motif_len;
+ std::string::size_type seq_i = 0;
+ Sequence::size_type motif_i = 0;
+ Sequence::size_type motif_len = a_motif.length();
+ Sequence::value_type motif_char;
+ Sequence::value_type seq_char;
- //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl;
- motif_len = a_motif.length();
-
- //std::cout << "motif_length: " << motif_len << std::endl;
- //std::cout << "RAAARRRRR\n";
-
- motif_i = 0;
-
- //std::cout << "motif: " << a_motif << std::endl;
-
- //std::cout << "Sequence: motif, length= " << length << std::endl;
- seq_i = 0;
- while (seq_i < length())
+ while (seq_i < seq->length())
{
- //std::cout << seq_c[seq_i];
- //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
- if (a_motif[motif_i] =='N')
+ motif_char = toupper(a_motif[motif_i]);
+ seq_char = toupper(seq->at(seq_i));
+ if (motif_char =='N')
motif_i++;
- else if (a_motif[motif_i] == seq_c[seq_i])
+ else if (motif_char == seq_char)
motif_i++;
- else if ((a_motif[motif_i] =='M') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+ else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
motif_i++;
- else if ((a_motif[motif_i] =='R') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='W') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='S') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='Y') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='K') &&
- ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='V') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='G')))
+ else if ((motif_char =='V') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[seq_i] =='H') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='H') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='D') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='D') &&
+ ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='B') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='B') &&
+ ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
-
else
{
+ // if a motif doesn't match, erase our current trial and try again
seq_i -= motif_i;
motif_i = 0;
}
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- //std::cout << "!!";
annot new_motif;
motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
// since the length of the two sequences is equal, we only need to
// test one.
for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) {
- if (*xseq_i != *yseq_i) {
+ if (toupper(*xseq_i) != toupper(*yseq_i)) {
return false;
}
}
return true;
}
+
+bool operator!=(const Sequence& x, const Sequence& y)
+{
+ return not operator==(x, y);
+}
#include <iostream>
+#include "alg/alphabet.hpp"
+
// Sequence data class
//! Attach annotation information to a sequence track
//! functions need the serialization support to be in-class.
class seq_string : public std::string
{
+public:
+ typedef std::string::iterator iterator;
+ typedef std::string::reverse_iterator reverse_iterator;
+ typedef std::string::const_iterator const_iterator;
+ typedef std::string::const_reverse_iterator const_reverse_iterator;
private:
friend class boost::serialization::access;
template<class Archive>
//ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(std::string);
ar & boost::serialization::make_nvp("bases",
boost::serialization::base_object<std::string>(*this)
- );
+ );
}
};
class Sequence
{
public:
+ typedef std::string::value_type value_type;
typedef std::string::difference_type difference_type;
typedef std::string::iterator iterator;
+ typedef std::string::reverse_iterator reverse_iterator;
typedef std::string::const_iterator const_iterator;
+ typedef std::string::const_reverse_iterator const_reverse_iterator;
typedef std::string::reference reference;
typedef std::string::const_reference const_reference;
typedef std::string::size_type size_type;
static const size_type npos = std::string::npos;
enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand };
-
- // some standard dna alphabets
- // Include nl (\012), and cr (\015) to make sequence parsing eol
- // convention independent.
-
- static const std::string dna_alphabet;
- static const std::string rna_alphabet;
- //! this is the general iupac alphabet for nucleotides
- static const std::string nucleic_iupac_alphabet;
- //! the protein alphabet
- static const std::string protein_alphabet;
-
- Sequence();
- ~Sequence();
- Sequence(const char* seq);
- Sequence(const std::string& seq);
+ enum alphabet_ref { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet,
+ nucleic_alphabet, protein_alphabet };
+
+ Sequence(alphabet_ref a = reduced_nucleic_alphabet);
+ Sequence(const char* seq, alphabet_ref a = reduced_nucleic_alphabet);
+ Sequence(const std::string& seq, alphabet_ref a = reduced_nucleic_alphabet);
Sequence(const Sequence& seq);
+ ~Sequence();
//! assignment to constant sequences
Sequence &operator=(const Sequence&);
friend std::ostream& operator<<(std::ostream&, const Sequence&);
friend bool operator<(const Sequence&, const Sequence&);
friend bool operator==(const Sequence&, const Sequence&);
+ friend bool operator!=(const Sequence&, const Sequence&);
const_reference operator[](size_type) const;
//! set sequence to a (sub)string containing nothing but AGCTN
- void set_filtered_sequence(const std::string& seq,
- size_type start=0,
- size_type count=npos,
+ void set_filtered_sequence(const std::string& seq,
+ alphabet_ref a,
+ size_type start=0,
+ size_type count=npos,
strand_type strand=UnknownStrand);
//! retrive element at specific position
//! return a subsequence, copying over any appropriate annotation
Sequence subseq(int start=0, int count = std::string::npos);
+ //! reverse a character
+ std::string create_reverse_map() const;
//! return a reverse compliment (this needs to be improved?)
- std::string rev_comp() const;
+ Sequence rev_comp() const;
//! set sequence (filtered)
- void set_sequence(const std::string &);
+ void set_sequence(const std::string &, alphabet_ref);
//! get sequence
std::string get_sequence() const;
//! set species name
std::string get_fasta_header() const;
//! get name (will return the first non-empty, of fasta_header, species)
std::string get_name() const;
+ //! return a reference to whichever alphabet we're currently representing
+ const Alphabet& get_alphabet() const;
+ //! return a reference to whichever alphabet we're currently representing
+ const Alphabet& get_alphabet(alphabet_ref) const;
+ //! load sequence from fasta file using the sequences current alphabet
+ void load_fasta(const boost::filesystem::path file_path, int seq_num=1,
+ int start_index=0, int end_index=0);
//! load sequence AGCT from fasta file
//! \throw mussa_load_error
//! \throw sequence_empty_error
//! \throw sequence_empty_file_error
- void load_fasta(const boost::filesystem::path file_path, int seq_num=1,
- int start_index=0, int end_index=0);
+ void load_fasta(const boost::filesystem::path file_path,
+ alphabet_ref a,
+ int seq_num=1,
+ int start_index=0, int end_index=0);
+ void load_fasta(std::iostream& file,
+ int seq_num=1, int start_index=0, int end_index=0);
//! load sequence from stream
//! \throw mussa_load_error
//! \throw sequence_empty_error
//! \throw sequence_empty_file_error
- void load_fasta(std::iostream& file, int seq_num=1,
- int start_index=0, int end_index=0);
+ void load_fasta(std::iostream& file,
+ alphabet_ref a,
+ int seq_num=1,
+ int start_index=0, int end_index=0);
//! load sequence annotations
//! \throws mussa_load_error
void load_annot(const boost::filesystem::path file_path, int start_index, int end_index);
const std::list<motif>& motifs() const;
//! add a motif to our list of motifs
- //! \throws motif_normalize_error if there's something wrong with a_motif
void add_motif(const Sequence& a_motif);
//! clear our list of found motifs
void clear_motifs();
//! search a sequence for a_motif
//! \throws motif_normalize_error if there's something wrong with a_motif
- std::vector<int> find_motif(const std::string& a_motif) const;
- //! search a sequence for a_motif
- //! \throws motif_normalize_error if there's something wrong with a_motif
- std::vector<int> find_motif(const Sequence& a_motif) const;
- //! convert IUPAC symbols to upperase
- //! \throws motif_normalize_error if there is an invalid symbol
- static std::string motif_normalize(const std::string& a_motif);
-
+ std::vector<int> find_motif(const Sequence& a_motif) const;
//! annotate the current sequence with other sequences
void find_sequences(std::list<Sequence>::iterator start,
std::list<Sequence>::iterator end);
Sequence *parent;
//! hold a shared pointer to our sequence string
boost::shared_ptr<seq_string> seq;
+ //! which alphabet we're using
+ alphabet_ref alphabet;
//! start offset into the sequence
size_type seq_start;
//! number of basepairs of the shared sequence we represent
//! a seperate list for motifs since we're currently not saving them
std::list<motif> motif_list;
- void motif_scan(std::string a_motif, std::vector<int> * motif_match_starts) const;
+ void motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const;
std::string rc_motif(std::string a_motif) const;
//! look for a string sequence type and and it to an annotation list
void add_string_annotation(std::string a_seq, std::string name);
void serialize(Archive& ar, const unsigned int /*version*/) {
ar & BOOST_SERIALIZATION_NVP(parent);
ar & BOOST_SERIALIZATION_NVP(seq);
+ ar & BOOST_SERIALIZATION_NVP(alphabet);
ar & BOOST_SERIALIZATION_NVP(seq_start);
ar & BOOST_SERIALIZATION_NVP(seq_count);
ar & BOOST_SERIALIZATION_NVP(strand);
INCLUDE(FindBoost)
INCLUDE(Platform)
-SET(SOURCES test_annotation_color.cpp test_color.cpp test_conserved_path.cpp
- test_flp.cpp test_glseqbrowser.cpp test_glsequence.cpp
- test_main.cpp test_mussa.cpp test_nway.cpp
- test_sequence.cpp test_sequence_location.cpp )
+SET(SOURCES
+ test_alphabet.cpp
+ test_annotation_color.cpp
+ test_color.cpp
+ test_conserved_path.cpp
+ test_flp.cpp
+ test_glseqbrowser.cpp
+ test_glsequence.cpp
+ test_main.cpp
+ test_mussa.cpp
+ test_nway.cpp
+ test_sequence.cpp
+ test_sequence_location.cpp )
GET_MUSSA_COMPILE_FLAGS(ALG_TEST_CFLAGS)
GET_MUSSA_LINK_FLAGS(ALG_TEST_LDFLAGS)
--- /dev/null
+#include <boost/test/auto_unit_test.hpp>
+
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/xml_oarchive.hpp>
+#include <boost/archive/xml_iarchive.hpp>
+
+#include "alg/alphabet.hpp"
+#include "mussa_exceptions.hpp"
+
+BOOST_AUTO_TEST_CASE( alphabet_simple )
+{
+ const Alphabet &a = Alphabet::reduced_dna_alphabet;
+ // exists is case insensitive
+ BOOST_CHECK_EQUAL( a.exists('a'), true);
+ BOOST_CHECK_EQUAL( a.exists('A'), true);
+ BOOST_CHECK_EQUAL( a.exists('Q'), false);
+ BOOST_CHECK_EQUAL( a.exists('q'), false);
+
+ BOOST_CHECK_EQUAL( a.c_str(), "AaCcGgTtNn\012\015"); // copied from alphabet.cpp
+}
BOOST_AUTO_TEST_CASE( sequence_filter )
{
const char *core_seq = "AATTGGCC";
- Sequence s1(core_seq);
+ Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL(s1, core_seq);
- Sequence s2("aattggcc");
+ Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL(s2, "AATTGGCC");
BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
+ BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
BOOST_CHECK_EQUAL(s2.size(), s2.size());
- BOOST_CHECK_EQUAL(s2.get_sequence(), core_seq);
+ BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
- Sequence s3("asdfg");
+ Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL(s3, "ANNNG");
BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
- s3.set_filtered_sequence("AAGGCCTT", 0, 2);
+ s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2);
BOOST_CHECK_EQUAL(s3, "AA");
- s3.set_filtered_sequence("AAGGCCTT", 2, 2);
+ s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
BOOST_CHECK_EQUAL( s3, "GG");
- s3.set_filtered_sequence("AAGGCCTT", 4);
+ s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
BOOST_CHECK_EQUAL( s3, "CCTT");
s3 = "AAGGFF";
BOOST_CHECK_EQUAL(s3, "AAGGNN");
}
+BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
+{
+ std::string agct("AGCT");
+ Sequence seq(agct, Sequence::nucleic_alphabet);
+ BOOST_CHECK_EQUAL(seq.size(), agct.size());
+ BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
+
+ std::string bdv("BDv");
+ Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
+ BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
+ BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
+
+}
+
+BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
+{
+ std::string agct("AGCT");
+ Sequence seq(agct);
+ BOOST_CHECK_EQUAL(seq.size(), agct.size());
+ BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
+ BOOST_CHECK_EQUAL(seq[0], agct[0]);
+ BOOST_CHECK_EQUAL(seq[1], agct[1]);
+ BOOST_CHECK_EQUAL(seq[2], agct[2]);
+ BOOST_CHECK_EQUAL(seq[3], agct[3]);
+
+ std::string bdv("BDv");
+ Sequence seq_bdv(bdv);
+ BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
+ // default alphabet only allows AGCTUN
+ BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");
+}
+
BOOST_AUTO_TEST_CASE( subseq_names )
{
- Sequence s1("AAGGCCTT");
+ Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
s1.set_species("species");
s1.set_fasta_header("a fasta header");
Sequence s2 = s1.subseq(2,2);
BOOST_CHECK_EQUAL( s1.stop(), 0 );
std::string seq_string("AAGGCCTT");
- Sequence s2(seq_string);
+ Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL( s2.start(), 0 );
BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
seq_path /= "human_mck_pro.fa";
Sequence s;
- s.load_fasta(seq_path);
+ s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
"5' flank");
}
+BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
+{
+ std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+ std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
+ std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
+ std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
+ std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
+ std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
+ std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+ std::stringstream garbage_fasta(garbage_string);
+
+ Sequence s;
+ s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
+ BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta,
+ Sequence::reduced_dna_alphabet),
+ sequence_invalid_load_error);
+ BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta,
+ Sequence::reduced_dna_alphabet),
+ sequence_invalid_load_error);
+ BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
+ Sequence::reduced_dna_alphabet),
+ sequence_invalid_load_error);
+
+}
+
+BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
+{
+ std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
+ std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
+ std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
+ std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
+ std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+ std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
+ std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+ std::stringstream garbage_fasta(garbage_string);
+
+ Sequence s;
+ s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
+ BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta,
+ Sequence::reduced_rna_alphabet),
+ sequence_invalid_load_error);
+ BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta,
+ Sequence::reduced_rna_alphabet),
+ sequence_invalid_load_error);
+ BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
+ Sequence::reduced_rna_alphabet),
+ sequence_invalid_load_error);
+}
+
+BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
+{
+ std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
+ std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
+ std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
+ std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
+ std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
+ std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+ std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
+ std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
+ std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+ std::stringstream garbage_fasta(garbage_string);
+
+ Sequence s;
+ Sequence specific;
+ // there's two copies of reduced_rna_fasta because i didn't feel like
+ // figuring out how to properly reset the read pointer in a stringstream
+ s.load_fasta(reduced_rna_fasta1);
+ specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
+ BOOST_CHECK_EQUAL(s, specific);
+
+ s.load_fasta(reduced_dna_fasta1);
+ specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
+ BOOST_CHECK_EQUAL(s, specific);
+
+ BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta),
+ sequence_invalid_load_error);
+ BOOST_CHECK_THROW(s.load_fasta(garbage_fasta),
+ sequence_invalid_load_error);
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
+{
+ std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
+ Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
+ Sequence seqr = seq.rev_comp();
+
+ BOOST_CHECK( seq != seqr );
+ BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
+ BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
+{
+ std::string dna_str("AGCTN");
+ Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
+ BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");
+ BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
+{
+ std::string rna_str("AGCUN");
+ Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
+ BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");
+ BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
+}
+
BOOST_AUTO_TEST_CASE( annotation_load )
{
string annot_data = "human\n"
;
string s(100, 'A');
s += "GCTGCTAATT";
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
//istringstream annot_stream(annot_data);
seq.parse_annot(annot_data, 0, 0);
"TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
"AAAAA"
"GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
seq.parse_annot(annot_data);
std::list<annot> annots = seq.annotations();
BOOST_CHECK_EQUAL( annots.size(), 2);
;
string s(100, 'A');
s += "GCTGCTAATT";
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
//istringstream annot_stream(annot_data);
seq.parse_annot(annot_data, 0, 0);
BOOST_AUTO_TEST_CASE( sequence_empty_equality )
{
- Sequence szero("");
+ Sequence szero("", Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL(szero.empty(), true);
BOOST_CHECK_EQUAL(szero, szero);
BOOST_CHECK_EQUAL(szero, "");
- Sequence sclear("AGCT");
+ Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
sclear.clear();
BOOST_CHECK_EQUAL(sclear.empty(), true);
BOOST_CHECK_EQUAL(sclear, sclear);
BOOST_AUTO_TEST_CASE ( sequence_iterators )
{
std::string seq_string = "AAGGCCTTNNTATA";
- Sequence s(seq_string);
+ Sequence s(seq_string, Sequence::reduced_dna_alphabet);
const Sequence cs(s);
std::string::size_type count = 0;
{
string m("AAAA");
string bogus("AATTGGAA");
- Sequence s1("AAAAGGGGCCCCTTTT");
+ Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
list<motif>::const_iterator motif_i = s1.motifs().begin();
list<motif>::const_iterator motif_end = s1.motifs().end();
"AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
string gc("GCCCCC");
string gga("GGACACCTC");
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
std::list<Sequence> query_list;
std::list<string> string_list;
}
}
BOOST_CHECK_EQUAL(seq.annotations().size(), count);
+ const std::list<annot> &a = seq.annotations();
+ for (std::list<annot>::const_iterator annot_i = a.begin();
+ annot_i != a.end();
+ ++annot_i)
+ {
+ int count = annot_i->end - annot_i->begin ;
+ }
}
BOOST_AUTO_TEST_CASE( subseq_annotation_test )
"GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
"CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
"AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
seq.add_annotation(annot(0, 10, "0-10", "0-10"));
"GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
"CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
"AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
// starting conditions
BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
BOOST_AUTO_TEST_CASE( out_operator )
{
string s("AAGGCCTT");
- Sequence seq(s);
+ Sequence seq(s, Sequence::reduced_dna_alphabet);
ostringstream buf;
buf << s;
BOOST_AUTO_TEST_CASE( get_name )
{
- Sequence seq("AAGGCCTT");
+ Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
BOOST_CHECK_EQUAL( seq.get_name(), "" );
seq.set_species("hooman"); // anyone remember tradewars?
BOOST_AUTO_TEST_CASE( serialize_simple )
{
std::string seq_string = "AAGGCCTT";
- Sequence seq(seq_string);
+ Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
seq.set_species("ribbet");
std::ostringstream oss;
// allocate/deallocate serialization components
BOOST_AUTO_TEST_CASE( serialize_tree )
{
std::string seq_string = "AAGGCCTT";
- Sequence seq(seq_string);
+ Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
seq.set_species("ribbet");
seq.add_motif("AA");
seq.add_motif("GC");
BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
{
std::string seq_string = "AAGGCCTT";
- Sequence seq(seq_string);
+ Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
seq.set_species("ribbet");
seq.add_motif("AA");
seq.add_motif("GC");
BOOST_AUTO_TEST_CASE( serialize_xml_two )
{
std::string seq_string = "AAGGCCTT";
- Sequence seq1(seq_string);
+ Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
Sequence seq2(seq1);
std::ostringstream oss;
sequence_load_error(msg) {};
};
-
+// Empty unrecognized characters
+class sequence_invalid_load_error : public sequence_load_error
+{
+public:
+ explicit sequence_invalid_load_error(const std::string& msg) :
+ sequence_load_error(msg) {};
+};
//! failure running analysis
class mussa_analysis_error : public mussa_error
{
#include "qui/motif_editor/MotifEditor.hpp"
+#include "alg/alphabet.hpp"
#include <iostream>
MotifElement::MotifElement() :
enabled(true),
color(Color(1.0,0.0,0.0,1.0)),
- motif()
+ motif(Sequence::nucleic_alphabet)
{
}
//! set sequence text
void MotifElement::setSequence(const std::string& seq_text)
{
- motif.set_sequence(seq_text);
+ motif.set_sequence(seq_text, Sequence::nucleic_alphabet);
}
QString MotifElement::getSequenceText() const
row1Layout->addWidget(analysisNameLabel);
row1Layout->addWidget(&analysisNameLineEdit);
- row2Layout->addWidget(windowLabel);
- row2Layout->addWidget(&windowLineEdit);
row2Layout->addWidget(thresholdLabel);
row2Layout->addWidget(&thresholdLineEdit);
+ row2Layout->addWidget(windowLabel);
+ row2Layout->addWidget(&windowLineEdit);
row2Layout->addWidget(numOfSequencesLabel);
row2Layout->addWidget(&numOfSequencesSpinBox);