X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=blobdiff_plain;f=alg%2Falphabet.cpp;h=32796ca662196165d27692bc3c3ecaa56093ea25;hp=56919199b6c7ccf30c05a6471b95187c6075f150;hb=75496e2c562d728af983c347527270eba360c6ee;hpb=b9755e1974201ff513c66b0fd684bde330c6fff6 diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp index 5691919..32796ca 100644 --- a/alg/alphabet.cpp +++ b/alg/alphabet.cpp @@ -1,48 +1,71 @@ #include +#include "mussa_exceptions.hpp" #include "alg/alphabet.hpp" // some standard dna alphabets // Include nl (\012), and cr (\015) to make sequence parsing eol // convention independent. const char *Alphabet::reduced_dna_cstr = "AaCcGgTtNn\012\015"; +const char *Alphabet::reduced_dna_reverse_cstr = "TtGgCcAaNn\012\015"; const char *Alphabet::reduced_rna_cstr = "AaCcGgUuNn\012\015"; +const char *Alphabet::reduced_rna_reverse_cstr = "UuGgCcAaNn\012\015"; const char *Alphabet::reduced_nucleic_cstr = "AaCcGgTtUuNn\012\015"; +const char *Alphabet::reduced_nucleic_reverse_cstr = "TtGgCcAaAaNn\012\015"; //! this is the general iupac alphabet for nucleotides +const char *Alphabet::dna_cstr = + "AaCcGgTtRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::dna_reverse_cstr = + "TtGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; +const char *Alphabet::rna_cstr = + "AaCcGgUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::rna_reverse_cstr = + "UuGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; const char *Alphabet::nucleic_cstr = - "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015"; + "AaCcGgTtUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::nucleic_reverse_cstr = + "TtGgCcAaAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; //! the protein alphabet const char *Alphabet::protein_cstr = "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"; const char *Alphabet::empty_cstr = ""; const Alphabet& Alphabet::reduced_dna_alphabet() { - static Alphabet *a = new Alphabet(reduced_dna_cstr); + static Alphabet *a = new Alphabet(reduced_dna_cstr, reduced_dna_reverse_cstr); return *a; } const Alphabet& Alphabet::reduced_rna_alphabet() { - static Alphabet *a = new Alphabet(reduced_rna_cstr); + static Alphabet *a = new Alphabet(reduced_rna_cstr, reduced_rna_reverse_cstr); return *a; } const Alphabet& Alphabet::reduced_nucleic_alphabet() { - static Alphabet *a = new Alphabet(reduced_nucleic_cstr); + static Alphabet *a = new Alphabet(reduced_nucleic_cstr, reduced_nucleic_reverse_cstr); + return *a; +} +const Alphabet& Alphabet::dna_alphabet() { + static Alphabet *a = new Alphabet(dna_cstr, dna_reverse_cstr); + return *a; +} +const Alphabet& Alphabet::rna_alphabet() { + static Alphabet *a = new Alphabet(rna_cstr, rna_reverse_cstr); return *a; } const Alphabet& Alphabet::nucleic_alphabet() { - static Alphabet *a = new Alphabet(nucleic_cstr); + static Alphabet *a = new Alphabet(nucleic_cstr, nucleic_reverse_cstr); return *a; } const Alphabet& Alphabet::protein_alphabet() { - static Alphabet *a = new Alphabet(protein_cstr); + static Alphabet *a = new Alphabet(protein_cstr, protein_cstr); return *a; } const Alphabet& Alphabet::empty_alphabet() { - static Alphabet *a = new Alphabet(empty_cstr); + static Alphabet *a = new Alphabet(empty_cstr, empty_cstr); return *a; } -Alphabet::Alphabet(const char *a) : - alphabet(a) +Alphabet::Alphabet(const char *a, const char *reverse_alphabet) : + alphabet(a), + complement_map(create_complement_map(reverse_alphabet)) { alphabet_set.insert(alphabet.begin(), alphabet.end()); } @@ -50,6 +73,7 @@ Alphabet::Alphabet(const char *a) : void Alphabet::assign(const Alphabet& a) { alphabet = a.alphabet; + complement_map = a.complement_map; alphabet_set.clear(); alphabet_set.insert(alphabet.begin(), alphabet.end()); } @@ -78,12 +102,47 @@ const Alphabet& Alphabet::get_alphabet(AlphabetRef alpha) return Alphabet::reduced_rna_alphabet(); case ::reduced_nucleic_alphabet: return Alphabet::reduced_nucleic_alphabet(); + case ::dna_alphabet: + return Alphabet::dna_alphabet(); + case ::rna_alphabet: + return Alphabet::rna_alphabet(); case ::nucleic_alphabet: return Alphabet::nucleic_alphabet(); case ::protein_alphabet: - return Alphabet::protein_alphabet(); + return Alphabet::protein_alphabet(); + case ::empty_alphabet: + return Alphabet::empty_alphabet(); default: throw std::runtime_error("unrecognized alphabet type"); break; } +} + +std::string Alphabet::create_complement_map(const std::string& ra) const +{ + if (ra.size() == 0) { + // reverse complement is undefined + throw mussa_error("no reverse complement"); + } + std::string rc_map(256,'~'); + + for(int i = 0; alphabet[i] and ra[i]; ++i) { + rc_map[ alphabet[i] ] = ra[i]; + } + return rc_map; +} + +boost::shared_ptr Alphabet::reverse_complement(const std::string &seq) const +{ + boost::shared_ptr new_seq(new std::string()); + + new_seq->reserve(seq.size()); + + for(std::string::const_reverse_iterator seq_i = seq.rbegin(); + seq_i != seq.rend(); + ++seq_i) + { + new_seq->append(1, complement_map[ *seq_i ]); + } + return new_seq; } \ No newline at end of file