X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=blobdiff_plain;f=alg%2Fsequence.cpp;h=47cca2a7dfdfb46c831491ee276f20ec5dd6f4a0;hp=91656abd029628b327e8adf3e62ff26df85b9f72;hb=f1724abab87d2e5b160620b10cb81eabf56aadeb;hpb=7d4fbcb6060a60a8ea25ca1303fcaaaf8574f24a diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 91656ab..47cca2a 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -79,9 +79,8 @@ motif::~motif() } -Sequence::Sequence(alphabet_ref alphabet_) - : seq(new SeqSpan("")), - alphabet(alphabet_), +Sequence::Sequence(AlphabetRef alphabet) + : seq(new SeqSpan("", alphabet)), strand(UnknownStrand) { } @@ -90,27 +89,24 @@ Sequence::~Sequence() { } -Sequence::Sequence(const char *seq, alphabet_ref alphabet_) - : alphabet(alphabet_), - strand(UnknownStrand), +Sequence::Sequence(const char *seq, AlphabetRef alphabet_) + : strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq, alphabet); + set_filtered_sequence(seq, alphabet_); } -Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_) - : alphabet(alphabet_), - strand(UnknownStrand), +Sequence::Sequence(const std::string& seq, AlphabetRef alphabet_) + : strand(UnknownStrand), header(""), species("") { - set_filtered_sequence(seq, alphabet); + set_filtered_sequence(seq, alphabet_); } Sequence::Sequence(const Sequence& o) : seq(o.seq), - alphabet(o.alphabet), strand(o.strand), header(o.header), species(o.species), @@ -121,7 +117,6 @@ Sequence::Sequence(const Sequence& o) Sequence::Sequence(const Sequence* o) : seq(o->seq), - alphabet(o->alphabet), strand(o->strand), header(o->header), species(o->species), @@ -130,9 +125,8 @@ Sequence::Sequence(const Sequence* o) { } -Sequence::Sequence(const SeqSpanRef& seq_ref, alphabet_ref alphabet_) +Sequence::Sequence(const SeqSpanRef& seq_ref) : seq(seq_ref), - alphabet(alphabet), strand(UnknownStrand), header(""), species("") @@ -143,7 +137,6 @@ Sequence &Sequence::operator=(const Sequence& s) { if (this != &s) { seq = s.seq; - alphabet = s.alphabet; strand = s.strand; header = s.header; species = s.species; @@ -171,11 +164,11 @@ static void multiplatform_getline(std::istream& in, std::string& line) void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index) { - load_fasta(file_path, alphabet, seq_num, start_index, end_index); + load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index); } //! load a fasta file into a sequence -void Sequence::load_fasta(fs::path file_path, alphabet_ref a, +void Sequence::load_fasta(fs::path file_path, AlphabetRef a, int seq_num, int start_index, int end_index) { fs::fstream data_file; @@ -212,11 +205,11 @@ void Sequence::load_fasta(fs::path file_path, alphabet_ref a, void Sequence::load_fasta(std::istream& file, int seq_num, int start_index, int end_index) { - load_fasta(file, alphabet, seq_num, start_index, end_index); + load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index); } void -Sequence::load_fasta(std::istream& data_file, alphabet_ref a, +Sequence::load_fasta(std::istream& data_file, AlphabetRef a, int seq_num, int start_index, int end_index) { @@ -227,7 +220,7 @@ Sequence::load_fasta(std::istream& data_file, alphabet_ref a, std::string rev_comp; std::string sequence_raw; std::string seq_tmp; // holds sequence during basic filtering - const Alphabet &alpha = get_alphabet(a); + const Alphabet &alpha = get_alphabet(); if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); @@ -288,19 +281,18 @@ Sequence::load_fasta(std::istream& data_file, alphabet_ref a, } void Sequence::set_filtered_sequence(const std::string &in_seq, - alphabet_ref alphabet_, + AlphabetRef alphabet_, size_type start, size_type count, strand_type strand_) { - alphabet = alphabet_; if ( count == npos) count = in_seq.size() - start; std::string new_seq; new_seq.reserve(count); // finally, the actual conversion loop - const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets + const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets std::string::const_iterator seq_i = in_seq.begin()+start; for(size_type i = 0; i != count; ++i, ++seq_i) { @@ -310,7 +302,7 @@ void Sequence::set_filtered_sequence(const std::string &in_seq, new_seq.append(1, 'N'); } } - SeqSpanRef new_seq_ref(new SeqSpan(new_seq)); + SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_)); seq = new_seq_ref; strand = strand_; } @@ -570,43 +562,14 @@ Sequence::subseq(size_type start, size_type count) const return new_seq; } -std::string Sequence::create_reverse_map() const -{ - std::string rc_map(256, '~'); - // if we're rna, use U instead of T - // we might want to add an "is_rna" to sequence at somepoint - char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T'; - char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't'; - rc_map['A'] = TU ; rc_map['a'] = tu ; - rc_map['T'] = 'A'; rc_map['t'] = 'a'; - rc_map['U'] = 'A'; rc_map['u'] = 'a'; - rc_map['G'] = 'C'; rc_map['g'] = 'c'; - rc_map['C'] = 'G'; rc_map['c'] = 'g'; - rc_map['M'] = 'K'; rc_map['m'] = 'k'; - rc_map['R'] = 'Y'; rc_map['r'] = 'y'; - rc_map['W'] = 'W'; rc_map['w'] = 'w'; - rc_map['S'] = 'S'; rc_map['s'] = 's'; - rc_map['Y'] = 'R'; rc_map['y'] = 'r'; - rc_map['K'] = 'M'; rc_map['k'] = 'm'; - rc_map['V'] = 'B'; rc_map['v'] = 'b'; - rc_map['H'] = 'D'; rc_map['h'] = 'd'; - rc_map['D'] = 'H'; rc_map['d'] = 'h'; - rc_map['B'] = 'V'; rc_map['b'] = 'v'; - rc_map['N'] = 'N'; rc_map['n'] = 'n'; - rc_map['X'] = 'X'; rc_map['x'] = 'x'; - rc_map['?'] = '?'; - rc_map['.'] = '.'; - rc_map['-'] = '-'; - rc_map['~'] = '~'; // not really needed, but perhaps it's clearer. - return rc_map; -} +// FIXME: This needs to be moved into SeqSpan Sequence Sequence::rev_comp() const { std::string rev_comp; rev_comp.reserve(length()); - std::string rc_map = create_reverse_map(); + std::string rc_map = seq->seq->create_reverse_complement_map(); // reverse and convert Sequence::const_reverse_iterator seq_i; @@ -617,7 +580,12 @@ Sequence Sequence::rev_comp() const { rev_comp.append(1, rc_map[*seq_i]); } - return Sequence(rev_comp, alphabet); + return Sequence(rev_comp, seq->seq->get_alphabet_ref()); +} + +const Alphabet& Sequence::get_alphabet() const +{ + return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet(); } void Sequence::set_fasta_header(std::string header_) @@ -653,31 +621,7 @@ Sequence::get_name() const return ""; } -const Alphabet& Sequence::get_alphabet() const -{ - return get_alphabet(alphabet); -} - -const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const -{ - switch (alpha) { - case reduced_dna_alphabet: - return Alphabet::reduced_dna_alphabet(); - case reduced_rna_alphabet: - return Alphabet::reduced_rna_alphabet(); - case reduced_nucleic_alphabet: - return Alphabet::reduced_nucleic_alphabet(); - case nucleic_alphabet: - return Alphabet::nucleic_alphabet(); - case protein_alphabet: - return Alphabet::protein_alphabet(); - default: - throw std::runtime_error("unrecognized alphabet type"); - break; - } -} - -void Sequence::set_sequence(const std::string& s, alphabet_ref a) +void Sequence::set_sequence(const std::string& s, AlphabetRef a) { set_filtered_sequence(s, a); }