From 75496e2c562d728af983c347527270eba360c6ee Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Sat, 24 Mar 2007 01:00:49 +0000 Subject: [PATCH] move strand into seqspan this patch moves strand storage from Sequence into SeqSpan, also when accessing sequence data from a SeqSpan that is currently on the minus side it correctly returns reverse complemented data. Unfortunately getting iterators that point to data thats a transformation of currently existing data is a giant pain and was making SeqSpan way to complicated. So I took the easy way out. SeqSpan will now cache a reverse complement of the region its pointing to and return iterators based on that. Also Alphabet can reverse complement a string, Drat, just realized that if one changes the region of SeqSpan, the rc_seq will need to be invalidated. --- alg/alphabet.cpp | 79 +++++++++++++-- alg/alphabet.hpp | 36 ++++++- alg/flp_seqcomp.cpp | 6 +- alg/motif_parser.cpp | 4 +- alg/mussa.cpp | 2 +- alg/seq.hpp | 8 +- alg/seq_span.cpp | 193 +++++++++++++++++++++++++++++++------ alg/seq_span.hpp | 39 +++++++- alg/sequence.cpp | 59 ++++-------- alg/sequence.hpp | 20 ++-- alg/test/test_alphabet.cpp | 146 ++++++++++++++++++++++++++++ alg/test/test_seq.cpp | 12 ++- alg/test/test_seq_span.cpp | 89 +++++++++++++++++ alg/test/test_sequence.cpp | 5 +- mussa_exceptions.hpp | 8 ++ 15 files changed, 599 insertions(+), 107 deletions(-) diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp index 5691919..32796ca 100644 --- a/alg/alphabet.cpp +++ b/alg/alphabet.cpp @@ -1,48 +1,71 @@ #include +#include "mussa_exceptions.hpp" #include "alg/alphabet.hpp" // some standard dna alphabets // Include nl (\012), and cr (\015) to make sequence parsing eol // convention independent. const char *Alphabet::reduced_dna_cstr = "AaCcGgTtNn\012\015"; +const char *Alphabet::reduced_dna_reverse_cstr = "TtGgCcAaNn\012\015"; const char *Alphabet::reduced_rna_cstr = "AaCcGgUuNn\012\015"; +const char *Alphabet::reduced_rna_reverse_cstr = "UuGgCcAaNn\012\015"; const char *Alphabet::reduced_nucleic_cstr = "AaCcGgTtUuNn\012\015"; +const char *Alphabet::reduced_nucleic_reverse_cstr = "TtGgCcAaAaNn\012\015"; //! this is the general iupac alphabet for nucleotides +const char *Alphabet::dna_cstr = + "AaCcGgTtRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::dna_reverse_cstr = + "TtGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; +const char *Alphabet::rna_cstr = + "AaCcGgUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::rna_reverse_cstr = + "UuGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; const char *Alphabet::nucleic_cstr = - "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015"; + "AaCcGgTtUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015"; +const char *Alphabet::nucleic_reverse_cstr = + "TtGgCcAaAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015"; //! the protein alphabet const char *Alphabet::protein_cstr = "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"; const char *Alphabet::empty_cstr = ""; const Alphabet& Alphabet::reduced_dna_alphabet() { - static Alphabet *a = new Alphabet(reduced_dna_cstr); + static Alphabet *a = new Alphabet(reduced_dna_cstr, reduced_dna_reverse_cstr); return *a; } const Alphabet& Alphabet::reduced_rna_alphabet() { - static Alphabet *a = new Alphabet(reduced_rna_cstr); + static Alphabet *a = new Alphabet(reduced_rna_cstr, reduced_rna_reverse_cstr); return *a; } const Alphabet& Alphabet::reduced_nucleic_alphabet() { - static Alphabet *a = new Alphabet(reduced_nucleic_cstr); + static Alphabet *a = new Alphabet(reduced_nucleic_cstr, reduced_nucleic_reverse_cstr); + return *a; +} +const Alphabet& Alphabet::dna_alphabet() { + static Alphabet *a = new Alphabet(dna_cstr, dna_reverse_cstr); + return *a; +} +const Alphabet& Alphabet::rna_alphabet() { + static Alphabet *a = new Alphabet(rna_cstr, rna_reverse_cstr); return *a; } const Alphabet& Alphabet::nucleic_alphabet() { - static Alphabet *a = new Alphabet(nucleic_cstr); + static Alphabet *a = new Alphabet(nucleic_cstr, nucleic_reverse_cstr); return *a; } const Alphabet& Alphabet::protein_alphabet() { - static Alphabet *a = new Alphabet(protein_cstr); + static Alphabet *a = new Alphabet(protein_cstr, protein_cstr); return *a; } const Alphabet& Alphabet::empty_alphabet() { - static Alphabet *a = new Alphabet(empty_cstr); + static Alphabet *a = new Alphabet(empty_cstr, empty_cstr); return *a; } -Alphabet::Alphabet(const char *a) : - alphabet(a) +Alphabet::Alphabet(const char *a, const char *reverse_alphabet) : + alphabet(a), + complement_map(create_complement_map(reverse_alphabet)) { alphabet_set.insert(alphabet.begin(), alphabet.end()); } @@ -50,6 +73,7 @@ Alphabet::Alphabet(const char *a) : void Alphabet::assign(const Alphabet& a) { alphabet = a.alphabet; + complement_map = a.complement_map; alphabet_set.clear(); alphabet_set.insert(alphabet.begin(), alphabet.end()); } @@ -78,12 +102,47 @@ const Alphabet& Alphabet::get_alphabet(AlphabetRef alpha) return Alphabet::reduced_rna_alphabet(); case ::reduced_nucleic_alphabet: return Alphabet::reduced_nucleic_alphabet(); + case ::dna_alphabet: + return Alphabet::dna_alphabet(); + case ::rna_alphabet: + return Alphabet::rna_alphabet(); case ::nucleic_alphabet: return Alphabet::nucleic_alphabet(); case ::protein_alphabet: - return Alphabet::protein_alphabet(); + return Alphabet::protein_alphabet(); + case ::empty_alphabet: + return Alphabet::empty_alphabet(); default: throw std::runtime_error("unrecognized alphabet type"); break; } +} + +std::string Alphabet::create_complement_map(const std::string& ra) const +{ + if (ra.size() == 0) { + // reverse complement is undefined + throw mussa_error("no reverse complement"); + } + std::string rc_map(256,'~'); + + for(int i = 0; alphabet[i] and ra[i]; ++i) { + rc_map[ alphabet[i] ] = ra[i]; + } + return rc_map; +} + +boost::shared_ptr Alphabet::reverse_complement(const std::string &seq) const +{ + boost::shared_ptr new_seq(new std::string()); + + new_seq->reserve(seq.size()); + + for(std::string::const_reverse_iterator seq_i = seq.rbegin(); + seq_i != seq.rend(); + ++seq_i) + { + new_seq->append(1, complement_map[ *seq_i ]); + } + return new_seq; } \ No newline at end of file diff --git a/alg/alphabet.hpp b/alg/alphabet.hpp index 513ea76..7652ddf 100644 --- a/alg/alphabet.hpp +++ b/alg/alphabet.hpp @@ -7,12 +7,16 @@ #include #include +#include + #include #include //! this is a helper class for sequence -enum AlphabetRef { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, - nucleic_alphabet, protein_alphabet, empty_alphabet=255 }; +enum AlphabetRef { reduced_dna_alphabet, dna_alphabet, + reduced_rna_alphabet, rna_alphabet, + reduced_nucleic_alphabet, nucleic_alphabet, + protein_alphabet, empty_alphabet=255 }; class Alphabet { friend class Sequence; @@ -28,19 +32,39 @@ public: //! return an alphabet given an AlphabetRef enumeration static const Alphabet &get_alphabet(AlphabetRef); + //! return a map to reverse complement an symbols from a nucleic alphabet + std::string create_complement_map(const std::string &) const; + //! return compelement map + std::string get_complement_map() const { return complement_map; } + + //! return a pointer to a reverse complemented string + boost::shared_ptr reverse_complement(const std::string &) const; + // note, if you want to define an alphabet for a sequence, you probably want // to update the enumeration in Sequence, and Sequence::get_sequence //! The standard DNA alphabet, with unique, and unknown characters static const char *reduced_dna_cstr; + static const char *reduced_dna_reverse_cstr; static const Alphabet &reduced_dna_alphabet(); //! The standard RNA alphabet, with unique, and unknown characters static const char *reduced_rna_cstr; + static const char *reduced_rna_reverse_cstr; static const Alphabet &reduced_rna_alphabet(); - //! The standard DNA/RNA alphabet, with unique, and unknown characters + //! The full IUPAC DNA alphabet, with unique, and unknown characters + static const char *dna_cstr; + static const char *dna_reverse_cstr; + static const Alphabet &dna_alphabet(); + //! the full IUPAC RNA alphabet + static const char *rna_cstr; + static const char *rna_reverse_cstr; + static const Alphabet &rna_alphabet(); + //! reduced (DNA/RNA) nucelic alphabet static const char *reduced_nucleic_cstr; + static const char *reduced_nucleic_reverse_cstr; static const Alphabet &reduced_nucleic_alphabet(); - //! this is the general IUPAC alphabet for nucleotides + //! reduced (DNA/RNA) nucelic alphabet static const char *nucleic_cstr; + static const char *nucleic_reverse_cstr; static const Alphabet &nucleic_alphabet(); //! the protein alphabet static const char *protein_cstr; @@ -52,11 +76,12 @@ public: private: //! what are allowable symbols in our alphabet std::string alphabet; + std::string complement_map; //! internal variable to make exists() faster std::set alphabet_set; //! some necessary string api access - Alphabet(const char *a); + Alphabet(const char *a, const char *reverse_a); //! allow sequence to copy one alphabet to another (needed when unserializing) void assign(const Alphabet& a); const_iterator begin() const { return alphabet.begin(); } @@ -67,6 +92,7 @@ private: template void serialize(Archive& ar, const unsigned int /*version*/) { ar & BOOST_SERIALIZATION_NVP(alphabet); + ar & BOOST_SERIALIZATION_NVP(complement_map); alphabet_set.clear(); alphabet_set.insert(alphabet.begin(), alphabet.end()); } diff --git a/alg/flp_seqcomp.cpp b/alg/flp_seqcomp.cpp index 1a1ea89..995a228 100644 --- a/alg/flp_seqcomp.cpp +++ b/alg/flp_seqcomp.cpp @@ -61,8 +61,10 @@ FLPs::seqcomp(const Sequence& sseq1, const Sequence& sseq2, bool is_RC) int start_i, seq1_i, seq2_i, win_i; // loop variables int matches; // number of matches in to a window int i2_offset; - const char *seq1 = sseq1.data(); - const char *seq2 = sseq2.data(); + std::string sseq1_str(sseq1.get_sequence()); + std::string sseq2_str(sseq2.get_sequence()); + const char *seq1 = sseq1_str.c_str(); + const char *seq2 = sseq2_str.c_str(); validate_sequence(sseq1); validate_sequence(sseq2); diff --git a/alg/motif_parser.cpp b/alg/motif_parser.cpp index c59c287..5ba3550 100644 --- a/alg/motif_parser.cpp +++ b/alg/motif_parser.cpp @@ -57,7 +57,7 @@ void motif_parser::push_motif::operator()( Iterator end) const { float red, green, blue, alpha; - Sequence seq(parsed->sequence, nucleic_alphabet); + Sequence seq(parsed->sequence, dna_alphabet, SeqSpan::PlusStrand); seq.set_fasta_header(parsed->name); alpha = 1.0; @@ -94,7 +94,7 @@ motif_parser::ParsedMotifs::ParsedMotifs( void motif_parser::ParsedMotifs::parse(const std::string &data) { - const char *alphabet = Alphabet::nucleic_cstr; + const char *alphabet = Alphabet::dna_cstr; // parse our string spirit::parse_info result; diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 295fd49..52d0dce 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -794,7 +794,7 @@ void Mussa::load_motifs(fs::path filename) void Mussa::load_motifs(std::istream &in) { std::string data; - const char *alphabet = Alphabet::nucleic_cstr; + const char *alphabet = Alphabet::dna_cstr; motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper); // slurp our data into a string diff --git a/alg/seq.hpp b/alg/seq.hpp index 5ac42a2..467fcaf 100644 --- a/alg/seq.hpp +++ b/alg/seq.hpp @@ -38,17 +38,21 @@ public: SeqString(AlphabetRef a=reduced_nucleic_alphabet) : std::string(), - alphabet(a) + alphabet(a), + rc_map(Alphabet::get_alphabet(a).get_complement_map()) {} SeqString(const std::string &s, AlphabetRef a=reduced_nucleic_alphabet) : std::string(s), - alphabet(a) + alphabet(a), + rc_map(Alphabet::get_alphabet(a).get_complement_map()) {} AlphabetRef get_alphabet_ref() { return alphabet; } const Alphabet& get_alphabet() const; std::string create_reverse_complement_map() const; + + const std::string rc_map; private: AlphabetRef alphabet; friend class boost::serialization::access; diff --git a/alg/seq_span.cpp b/alg/seq_span.cpp index 2870f0f..bf9a3dc 100644 --- a/alg/seq_span.cpp +++ b/alg/seq_span.cpp @@ -9,7 +9,8 @@ SeqSpan::SeqSpan(const SeqSpan &o) : seq(o.seq), seq_start(o.seq_start), seq_count(o.seq_count), - parent(o.parent) + parent(o.parent), + rc_seq(o.rc_seq) { } @@ -17,20 +18,37 @@ SeqSpan::SeqSpan(const SeqSpan *p) : seq(p->seq), seq_start(p->seq_start), seq_count(p->seq_count), - parent(p->parent) + parent(p->parent), + rc_seq(p->rc_seq) { } SeqSpan::SeqSpan(const std::string &seq_, - AlphabetRef a) - : seq(new SeqString(seq_, a)), - seq_start(0), + AlphabetRef a, + strand_type strand_) + : seq_start(0), seq_count(seq_.length()), parent() { + switch (strand_) { + case PlusStrand: + case SingleStrand: + // do nothing + seq_strand = strand_; + seq.reset(new SeqString(seq_, a)); + break; + default: + throw sequence_invalid_strand( + "Can only initialize a Plus or Single strand from a string" + ); + break; + } } -SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_) +SeqSpan::SeqSpan(const SeqSpanRef parent_, + size_type start_, + size_type count_, + strand_type strand_) : seq(parent_->seq), seq_start(parent_->seq_start + start_), parent(parent_) @@ -39,6 +57,62 @@ SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_) seq_count = parent_->seq_count; else seq_count = count_; + + // blech this strand logic is too long + switch(strand_) { + case UnknownStrand: + case PlusStrand: + case MinusStrand: + case BothStrand: + seq_strand = strand_; + break; + case SameStrand: + if (parent) { + seq_strand = parent->strand(); + } else { + throw sequence_invalid_strand("SameStrand is undefined with no parent"); + } + break; + case OppositeStrand: + if (parent) { + switch (parent->strand()) { + case PlusStrand: + seq_strand = MinusStrand; + break; + case MinusStrand: + seq_strand = PlusStrand; + break; + case UnknownStrand: + case BothStrand: + seq_strand = parent->strand(); + break; + case SingleStrand: + throw sequence_invalid_strand( + "OppositeStrand is undefined with SingleStrands" + ); + break; + default: + throw sequence_invalid_strand("Parent has an invalid strand type"); + } + } else { + throw sequence_invalid_strand("SameStrand is undefined with no parent"); + } + break; + case SingleStrand: + if (parent) { + if (parent->strand() == SingleStrand) { + seq_strand = SingleStrand; + } else { + throw sequence_invalid_strand("Can't change single strandedness"); + } + } else { + seq_strand = SingleStrand; + } + break; + default: + throw sequence_invalid_strand("unrecognized strand identifier"); + break; + } } ////// @@ -50,6 +124,7 @@ SeqSpan &SeqSpan::operator=(const SeqSpan& s) seq_start = s.seq_start; seq_count = s.seq_count; parent = s.parent; + rc_seq = s.rc_seq; } return *this; } @@ -76,17 +151,26 @@ bool operator<(const SeqSpan& a, const SeqSpan& b) } } */ -#include + bool operator==(const SeqSpan& a, const SeqSpan& b) { if (SeqSpan::isFamily(a, b)) { // can do fast comparison if (a.seq_start == b.seq_start and a.seq_count == b.seq_count) { - return true; + // unknown strands match anything + if (a.seq_strand == SeqSpan::UnknownStrand or + b.seq_strand == SeqSpan::UnknownStrand) { + return true; + } else { + // otherwise our stranded-ness needs to match + return (a.seq_strand == b.seq_strand); + } } else { return false; } - } + } + // if we're not part of the same seqspan heirarchy we're never equal + // the string equality is handled off in Sequence return false; } @@ -103,7 +187,11 @@ SeqSpan::const_reference SeqSpan::operator[](SeqSpan::size_type i) const SeqSpan::const_reference SeqSpan::at(SeqSpan::size_type i) const { if (!seq) throw std::out_of_range("empty sequence"); - return seq->at(i+seq_start); + if (seq_strand == PlusStrand) { + return seq->at(seq_start+i); + } else { + return seq->rc_map[seq->at((seq_start+seq_count-1)-i)]; + } } const char *SeqSpan::data() const @@ -116,36 +204,62 @@ const char *SeqSpan::data() const SeqSpan::const_iterator SeqSpan::begin() const { - if (seq and seq_count != 0) - return seq->begin()+seq_start; - else - return SeqSpan::const_iterator(0); + if (seq and seq_count != 0) { + if (seq_strand != MinusStrand) { + return seq->begin()+seq_start; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->begin(); + } + } + return SeqSpan::const_iterator(0); } SeqSpan::const_iterator SeqSpan::end() const { if (seq and seq_count != 0) { - return seq->begin() + seq_start + seq_count; - } else { - return SeqSpan::const_iterator(0); - } + if (seq_strand != MinusStrand) { + return seq->begin() + seq_start + seq_count; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->end(); + } + } + return SeqSpan::const_iterator(0); } SeqSpan::const_reverse_iterator SeqSpan::rbegin() const { - if (seq and seq_count != 0) - return seq->rbegin()+(seq->size()-(seq_start+seq_count)); - else - return SeqSpan::const_reverse_iterator(); + if (seq and seq_count != 0) { + if (seq_strand != MinusStrand) { + return seq->rbegin()+(seq->size()-(seq_start+seq_count)); + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->rbegin(); + } + } + return SeqSpan::const_reverse_iterator(); } SeqSpan::const_reverse_iterator SeqSpan::rend() const { if (seq and seq_count != 0) { - return rbegin() + seq_count; - } else { - return SeqSpan::const_reverse_iterator(); + if (seq_strand != MinusStrand) { + return rbegin() + seq_count; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->rend(); + } } + return SeqSpan::const_reverse_iterator(); } bool SeqSpan::empty() const @@ -160,7 +274,7 @@ SeqSpan::size_type SeqSpan::find_first_not_of( typedef std::set sequence_set; sequence_set match_set; - for(const_iterator query_item = query.begin(); + for(std::string::const_iterator query_item = query.begin(); query_item != query.end(); ++query_item) { @@ -226,18 +340,23 @@ void SeqSpan::setParentStop(SeqSpan::size_type v) setStop(parent->start() + v); } -SeqSpanRef SeqSpan::subseq(size_type start, size_type count) +SeqSpanRef SeqSpan::subseq(size_type start, size_type count, strand_type strand) { count = std::min(count, seq_count - start); - SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count)); + SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count, strand)); return new_span; } std::string SeqSpan::sequence() const { if (seq) { - return seq->substr(seq_start, seq_count); + if (seq_strand != MinusStrand) + return seq->substr(seq_start, seq_count); + else { + initialize_rc_seq(); + return *rc_seq; + } } else { return std::string(); } @@ -247,3 +366,19 @@ bool SeqSpan::isFamily(const SeqSpan& a, const SeqSpan& b) { return a.seq.get() == b.seq.get(); } + +void SeqSpan::initialize_rc_seq() const +{ + std::string new_rc_seq; + std::string rc_map(seq->get_alphabet().get_complement_map()); + + new_rc_seq.reserve(size()); + size_type i = (seq_start + seq_count); + while( i != seq_start ) { + --i; + new_rc_seq.append(1, rc_map[ seq->at(i) ]); + } + + SeqStringRef new_rc_ref(new SeqString(new_rc_seq, seq->get_alphabet_ref())); + const_cast(&rc_seq)->swap(new_rc_ref); +} diff --git a/alg/seq_span.hpp b/alg/seq_span.hpp index 8d19913..5a2da12 100644 --- a/alg/seq_span.hpp +++ b/alg/seq_span.hpp @@ -35,14 +35,34 @@ public: typedef SeqString::size_type size_type; typedef SeqString::value_type value_type; static const size_type npos = SeqString::npos; + //! Define strand types + /**! + * Unknown strand is treated as "either" strand + * Plus refers to the initially created strand + * Minus is the opposite strand + * Both is for any feature that applies to "both" strands + * (which may not actually be useful) + * Same strand is only used when creating a subsequence + * and implies the subsequence has the same orientation as the parent + * Opposite is only used for creating a subsequence + * and implies the subsequence has the opposite orientation as the parent + * Single indicates that this is single stranded and there can't be + * an opposite strand. + */ + enum strand_type { UnknownStrand, MinusStrand, PlusStrand, + BothStrand, SameStrand, OppositeStrand, SingleStrand }; public: SeqSpan(const SeqSpan &); SeqSpan(const SeqSpan *); explicit SeqSpan(const std::string &, - AlphabetRef = reduced_nucleic_alphabet + AlphabetRef a = dna_alphabet, + strand_type strand=PlusStrand ); - SeqSpan(const SeqSpanRef, size_type start=0, size_type count=npos); + SeqSpan(const SeqSpanRef, + size_type start=0, + size_type count=npos, + strand_type strand=SameStrand); //! assignment SeqSpan& operator=(const SeqSpan&); @@ -87,6 +107,7 @@ public: size_type stop() const { return seq_start + seq_count; } //! set one past the last position relative to the root sequence. void setStop(size_type); + strand_type strand() const { return seq_strand; } //! get start position relative to the parent sequence size_type parentStart() const; @@ -100,12 +121,17 @@ public: //! return a subsequence, copying over any appropriate annotation - SeqSpanRef subseq(size_type start=0, size_type count = std::string::npos); + SeqSpanRef subseq(size_type start=0, + size_type count = std::string::npos, + strand_type = PlusStrand); //! get sequence std::string sequence() const; //! are both sequences derived from the same sequence tree? static bool isFamily(const SeqSpan& a, const SeqSpan& b); + //! fill in our rc_seq variable + void initialize_rc_seq() const; + friend class Sequence; private: //! do not statically initialize, only create with new @@ -117,11 +143,15 @@ protected: size_type seq_start; //! how big we ware size_type seq_count; - // Do I need to track the strand here? + //! strand orientation + strand_type seq_strand; //! keep a reference to who our parent span is SeqSpanRef parent; + //! hold a reverse complement version of our sequence if needed + SeqStringRef rc_seq; + // boost::serialization support friend class boost::serialization::access; template @@ -129,6 +159,7 @@ protected: ar & BOOST_SERIALIZATION_NVP(seq); ar & BOOST_SERIALIZATION_NVP(seq_start); ar & BOOST_SERIALIZATION_NVP(seq_count); + ar & BOOST_SERIALIZATION_NVP(seq_strand); ar & BOOST_SERIALIZATION_NVP(parent); } }; diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 47cca2a..68fa014 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -80,8 +80,7 @@ motif::~motif() Sequence::Sequence(AlphabetRef alphabet) - : seq(new SeqSpan("", alphabet)), - strand(UnknownStrand) + : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)) { } @@ -89,25 +88,24 @@ Sequence::~Sequence() { } -Sequence::Sequence(const char *seq, AlphabetRef alphabet_) - : strand(UnknownStrand), - header(""), +Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_) + : header(""), species("") { - set_filtered_sequence(seq, alphabet_); + set_filtered_sequence(seq, alphabet_, 0, npos, strand_); } -Sequence::Sequence(const std::string& seq, AlphabetRef alphabet_) - : strand(UnknownStrand), - header(""), +Sequence::Sequence(const std::string& seq, + AlphabetRef alphabet_, + SeqSpan::strand_type strand_) + : header(""), species("") { - set_filtered_sequence(seq, alphabet_); + set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_); } Sequence::Sequence(const Sequence& o) : seq(o.seq), - strand(o.strand), header(o.header), species(o.species), annots(o.annots), @@ -117,7 +115,6 @@ Sequence::Sequence(const Sequence& o) Sequence::Sequence(const Sequence* o) : seq(o->seq), - strand(o->strand), header(o->header), species(o->species), annots(o->annots), @@ -127,7 +124,6 @@ Sequence::Sequence(const Sequence* o) Sequence::Sequence(const SeqSpanRef& seq_ref) : seq(seq_ref), - strand(UnknownStrand), header(""), species("") { @@ -137,7 +133,6 @@ Sequence &Sequence::operator=(const Sequence& s) { if (this != &s) { seq = s.seq; - strand = s.strand; header = s.header; species = s.species; annots = s.annots; @@ -220,7 +215,7 @@ Sequence::load_fasta(std::istream& data_file, AlphabetRef a, std::string rev_comp; std::string sequence_raw; std::string seq_tmp; // holds sequence during basic filtering - const Alphabet &alpha = get_alphabet(); + const Alphabet &alpha = Alphabet::get_alphabet(a); if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); @@ -273,7 +268,7 @@ Sequence::load_fasta(std::istream& data_file, AlphabetRef a, std::string msg("The selected sequence appears to be empty"); throw sequence_empty_error(msg); } - set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index); + set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand); } else { std::string errormsg("There were no fasta sequences"); throw sequence_empty_file_error(errormsg); @@ -284,7 +279,7 @@ void Sequence::set_filtered_sequence(const std::string &in_seq, AlphabetRef alphabet_, size_type start, size_type count, - strand_type strand_) + SeqSpan::strand_type strand_) { if ( count == npos) count = in_seq.size() - start; @@ -302,9 +297,8 @@ void Sequence::set_filtered_sequence(const std::string &in_seq, new_seq.append(1, 'N'); } } - SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_)); + SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_)); seq = new_seq_ref; - strand = strand_; } void @@ -547,15 +541,16 @@ void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count } Sequence -Sequence::subseq(size_type start, size_type count) const +Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const { + // FIXME: should i really allow a subsequence of an empty sequence? if (!seq) { Sequence new_seq; return new_seq; } Sequence new_seq = *this; - new_seq.seq = seq->subseq(start, count); + new_seq.seq = seq->subseq(start, count, strand); copy_children(new_seq, start, count); @@ -566,21 +561,8 @@ Sequence::subseq(size_type start, size_type count) const // FIXME: This needs to be moved into SeqSpan Sequence Sequence::rev_comp() const { - std::string rev_comp; - rev_comp.reserve(length()); - - std::string rc_map = seq->seq->create_reverse_complement_map(); - - // reverse and convert - Sequence::const_reverse_iterator seq_i; - Sequence::const_reverse_iterator seq_end = rend(); - for(seq_i = rbegin(); - seq_i != seq_end; - ++seq_i) - { - rev_comp.append(1, rc_map[*seq_i]); - } - return Sequence(rev_comp, seq->seq->get_alphabet_ref()); + // a reverse complement is the whole opposite strand + return subseq(0, npos, SeqSpan::OppositeStrand); } const Alphabet& Sequence::get_alphabet() const @@ -623,7 +605,7 @@ Sequence::get_name() const void Sequence::set_sequence(const std::string& s, AlphabetRef a) { - set_filtered_sequence(s, a); + set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand); } std::string Sequence::get_sequence() const @@ -640,7 +622,6 @@ void Sequence::clear() { seq.reset(); - strand = UnknownStrand; header.clear(); species.clear(); annots.clear(); @@ -696,7 +677,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) } getline(load_file, file_data_line); // looks like the sequence is written as a single line - set_filtered_sequence(file_data_line, reduced_dna_alphabet); + set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand); getline(load_file, file_data_line); getline(load_file, file_data_line); if (file_data_line == "") diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 4b93ef1..80aa49f 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -103,13 +103,14 @@ public: typedef SeqString::const_reference const_reference; typedef SeqString::size_type size_type; static const size_type npos = SeqString::npos; - enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand }; - Sequence(AlphabetRef a = reduced_nucleic_alphabet); + Sequence(AlphabetRef a = reduced_dna_alphabet); Sequence(const char* seq, - AlphabetRef a = reduced_nucleic_alphabet); + AlphabetRef a = reduced_dna_alphabet, + SeqSpan::strand_type strand = SeqSpan::PlusStrand); Sequence(const std::string& seq, - AlphabetRef a = reduced_nucleic_alphabet); + AlphabetRef a = reduced_dna_alphabet, + SeqSpan::strand_type strand = SeqSpan::PlusStrand); Sequence(const Sequence& seq); Sequence(const Sequence *); Sequence(const SeqSpanRef&); @@ -125,10 +126,10 @@ public: //! set sequence to a (sub)string containing nothing but AGCTN void set_filtered_sequence(const std::string& seq, - AlphabetRef a, + AlphabetRef a=reduced_dna_alphabet, size_type start=0, size_type count=npos, - strand_type strand=UnknownStrand); + SeqSpan::strand_type strand=SeqSpan::PlusStrand); //! retrive element at specific position const_reference at(size_type i) const { return seq->at(i); } @@ -159,7 +160,9 @@ public: size_type stop() const { return seq->parentStop(); } //! return a subsequence, copying over any appropriate annotation - Sequence subseq(size_type start=0, size_type count = npos) const; + Sequence subseq(size_type start=0, + size_type count = npos, + SeqSpan::strand_type strand = SeqSpan::SameStrand) const; //! reverse a character std::string create_reverse_map() const; //! return a reverse compliment (this needs to be improved?) @@ -234,8 +237,6 @@ public: protected: SeqSpanRef seq; - //! strand orientation - strand_type strand; //! fasta header std::string header; //! species name @@ -259,7 +260,6 @@ protected: template void serialize(Archive& ar, const unsigned int /*version*/) { ar & BOOST_SERIALIZATION_NVP(seq); - ar & BOOST_SERIALIZATION_NVP(strand); ar & BOOST_SERIALIZATION_NVP(header); ar & BOOST_SERIALIZATION_NVP(species); ar & BOOST_SERIALIZATION_NVP(annots); diff --git a/alg/test/test_alphabet.cpp b/alg/test/test_alphabet.cpp index 5f7d9f3..854c1b3 100644 --- a/alg/test/test_alphabet.cpp +++ b/alg/test/test_alphabet.cpp @@ -26,4 +26,150 @@ BOOST_AUTO_TEST_CASE( alphabet_equality) { Alphabet a(Alphabet::reduced_dna_alphabet()); BOOST_CHECK_EQUAL( a, Alphabet::reduced_dna_alphabet() ); +} + +/* +BOOST_AUTO_TEST_CASE( alphabet_rc_invalid ) +{ + Alphabet a1 = Alphabet::get_alphabet(empty_alphabet); + BOOST_CHECK_THROW(a1.get_complement_map(), mussa_error); + + Alphabet a2 = Alphabet::get_alphabet(protein_alphabet); + BOOST_CHECK_THROW(a2.create_complement_map(), mussa_error); +} +*/ + +BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_dna) +{ + Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet); + + std::string rc_map = a.get_complement_map(); + BOOST_CHECK_EQUAL(rc_map['A'], 'T'); + BOOST_CHECK_EQUAL(rc_map['a'], 't'); + BOOST_CHECK_EQUAL(rc_map['T'], 'A'); + BOOST_CHECK_EQUAL(rc_map['t'], 'a'); + BOOST_CHECK_EQUAL(rc_map['G'], 'C'); + BOOST_CHECK_EQUAL(rc_map['g'], 'c'); + BOOST_CHECK_EQUAL(rc_map['C'], 'G'); + BOOST_CHECK_EQUAL(rc_map['c'], 'g'); + BOOST_CHECK_EQUAL(rc_map['U'], '~'); + BOOST_CHECK_EQUAL(rc_map['u'], '~'); + BOOST_CHECK_EQUAL(rc_map['Q'], '~'); + BOOST_CHECK_EQUAL(rc_map['*'], '~'); +} + +BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_rna) +{ + Alphabet a = Alphabet::get_alphabet(reduced_rna_alphabet); + + std::string rc_map = a.get_complement_map(); + BOOST_CHECK_EQUAL(rc_map['A'], 'U'); + BOOST_CHECK_EQUAL(rc_map['a'], 'u'); + BOOST_CHECK_EQUAL(rc_map['G'], 'C'); + BOOST_CHECK_EQUAL(rc_map['g'], 'c'); + BOOST_CHECK_EQUAL(rc_map['C'], 'G'); + BOOST_CHECK_EQUAL(rc_map['c'], 'g'); + BOOST_CHECK_EQUAL(rc_map['T'], '~'); + BOOST_CHECK_EQUAL(rc_map['t'], '~'); + BOOST_CHECK_EQUAL(rc_map['U'], 'A'); + BOOST_CHECK_EQUAL(rc_map['u'], 'a'); + BOOST_CHECK_EQUAL(rc_map['Q'], '~'); + BOOST_CHECK_EQUAL(rc_map['*'], '~'); +} + +BOOST_AUTO_TEST_CASE( alphabet_rc_dna) +{ + Alphabet a = Alphabet::get_alphabet(dna_alphabet); + + std::string rc_map = a.get_complement_map(); + BOOST_CHECK_EQUAL(rc_map['A'], 'T'); + BOOST_CHECK_EQUAL(rc_map['a'], 't'); + BOOST_CHECK_EQUAL(rc_map['C'], 'G'); + BOOST_CHECK_EQUAL(rc_map['c'], 'g'); + BOOST_CHECK_EQUAL(rc_map['G'], 'C'); + BOOST_CHECK_EQUAL(rc_map['g'], 'c'); + BOOST_CHECK_EQUAL(rc_map['T'], 'A'); + BOOST_CHECK_EQUAL(rc_map['t'], 'a'); + BOOST_CHECK_EQUAL(rc_map['R'], 'Y'); + BOOST_CHECK_EQUAL(rc_map['r'], 'y'); + BOOST_CHECK_EQUAL(rc_map['Y'], 'R'); + BOOST_CHECK_EQUAL(rc_map['y'], 'r'); + BOOST_CHECK_EQUAL(rc_map['S'], 'S'); + BOOST_CHECK_EQUAL(rc_map['s'], 's'); + BOOST_CHECK_EQUAL(rc_map['W'], 'W'); + BOOST_CHECK_EQUAL(rc_map['w'], 'w'); + BOOST_CHECK_EQUAL(rc_map['K'], 'M'); + BOOST_CHECK_EQUAL(rc_map['k'], 'm'); + BOOST_CHECK_EQUAL(rc_map['M'], 'K'); + BOOST_CHECK_EQUAL(rc_map['m'], 'k'); + BOOST_CHECK_EQUAL(rc_map['B'], 'V'); + BOOST_CHECK_EQUAL(rc_map['b'], 'v'); + BOOST_CHECK_EQUAL(rc_map['V'], 'B'); + BOOST_CHECK_EQUAL(rc_map['v'], 'b'); + BOOST_CHECK_EQUAL(rc_map['D'], 'H'); + BOOST_CHECK_EQUAL(rc_map['d'], 'h'); + BOOST_CHECK_EQUAL(rc_map['H'], 'D'); + BOOST_CHECK_EQUAL(rc_map['h'], 'd'); + BOOST_CHECK_EQUAL(rc_map['N'], 'N'); + BOOST_CHECK_EQUAL(rc_map['n'], 'n'); + + BOOST_CHECK_EQUAL(rc_map['U'], '~'); + BOOST_CHECK_EQUAL(rc_map['u'], '~'); + BOOST_CHECK_EQUAL(rc_map['Q'], '~'); + BOOST_CHECK_EQUAL(rc_map['*'], '~'); +} + +BOOST_AUTO_TEST_CASE( alphabet_rc_rna) +{ + Alphabet a = Alphabet::get_alphabet(rna_alphabet); + + std::string rc_map = a.get_complement_map(); + BOOST_CHECK_EQUAL(rc_map['A'], 'U'); + BOOST_CHECK_EQUAL(rc_map['a'], 'u'); + BOOST_CHECK_EQUAL(rc_map['C'], 'G'); + BOOST_CHECK_EQUAL(rc_map['c'], 'g'); + BOOST_CHECK_EQUAL(rc_map['G'], 'C'); + BOOST_CHECK_EQUAL(rc_map['g'], 'c'); + BOOST_CHECK_EQUAL(rc_map['U'], 'A'); + BOOST_CHECK_EQUAL(rc_map['u'], 'a'); + BOOST_CHECK_EQUAL(rc_map['R'], 'Y'); + BOOST_CHECK_EQUAL(rc_map['r'], 'y'); + BOOST_CHECK_EQUAL(rc_map['Y'], 'R'); + BOOST_CHECK_EQUAL(rc_map['y'], 'r'); + BOOST_CHECK_EQUAL(rc_map['S'], 'S'); + BOOST_CHECK_EQUAL(rc_map['s'], 's'); + BOOST_CHECK_EQUAL(rc_map['W'], 'W'); + BOOST_CHECK_EQUAL(rc_map['w'], 'w'); + BOOST_CHECK_EQUAL(rc_map['K'], 'M'); + BOOST_CHECK_EQUAL(rc_map['k'], 'm'); + BOOST_CHECK_EQUAL(rc_map['M'], 'K'); + BOOST_CHECK_EQUAL(rc_map['m'], 'k'); + BOOST_CHECK_EQUAL(rc_map['B'], 'V'); + BOOST_CHECK_EQUAL(rc_map['b'], 'v'); + BOOST_CHECK_EQUAL(rc_map['V'], 'B'); + BOOST_CHECK_EQUAL(rc_map['v'], 'b'); + BOOST_CHECK_EQUAL(rc_map['D'], 'H'); + BOOST_CHECK_EQUAL(rc_map['d'], 'h'); + BOOST_CHECK_EQUAL(rc_map['H'], 'D'); + BOOST_CHECK_EQUAL(rc_map['h'], 'd'); + BOOST_CHECK_EQUAL(rc_map['N'], 'N'); + BOOST_CHECK_EQUAL(rc_map['n'], 'n'); + + BOOST_CHECK_EQUAL(rc_map['T'], '~'); + BOOST_CHECK_EQUAL(rc_map['t'], '~'); + BOOST_CHECK_EQUAL(rc_map['Q'], '~'); + BOOST_CHECK_EQUAL(rc_map['*'], '~'); +} + + +//enum AlphabetRef { reduced_nucleic_alphabet, nucleic_alphabet, }; +BOOST_AUTO_TEST_CASE( alphabet_reverse_complement ) +{ + Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet); + std::string seq("AAAAGCT"); + std::string known_rc_seq("AGCTTTT"); + boost::shared_ptr new_seq(a.reverse_complement(seq)); + + BOOST_CHECK_EQUAL(*new_seq, known_rc_seq); + } \ No newline at end of file diff --git a/alg/test/test_seq.cpp b/alg/test/test_seq.cpp index a016cb2..767bc5d 100644 --- a/alg/test/test_seq.cpp +++ b/alg/test/test_seq.cpp @@ -21,11 +21,21 @@ BOOST_AUTO_TEST_CASE( seqstring_string ) } // such an exciting unit test, making sure that a=b; a==b -BOOST_AUTO_TEST_CASE( seqstring_string_alphabet ) +BOOST_AUTO_TEST_CASE( seqstring_string_nucleic_alphabet ) { SeqString s("AGCT", nucleic_alphabet); BOOST_CHECK_EQUAL(s.get_alphabet_ref(), nucleic_alphabet); BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::nucleic_alphabet()); BOOST_CHECK_EQUAL(s.size(), 4); +} + +// such an exciting unit test, making sure that a=b; a==b +BOOST_AUTO_TEST_CASE( seqstring_string_dna_alphabet ) +{ + SeqString s("AGCT", dna_alphabet); + + BOOST_CHECK_EQUAL(s.get_alphabet_ref(), dna_alphabet); + BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::dna_alphabet()); + BOOST_CHECK_EQUAL(s.size(), 4); } \ No newline at end of file diff --git a/alg/test/test_seq_span.cpp b/alg/test/test_seq_span.cpp index 90e3d00..4989b6c 100644 --- a/alg/test/test_seq_span.cpp +++ b/alg/test/test_seq_span.cpp @@ -12,6 +12,7 @@ BOOST_AUTO_TEST_CASE( seqspan_from_string ) SeqSpanRef span1(new SeqSpan(str1)); BOOST_CHECK_EQUAL(span1->length(), str1.length()); BOOST_CHECK_EQUAL(span1->sequence(), str1); + BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand); } BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet ) @@ -23,6 +24,39 @@ BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet ) BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet()); } +BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_plusstrand ) +{ + std::string str1("AAGGCCUU"); + SeqSpanRef span1(new SeqSpan(str1, reduced_rna_alphabet, SeqSpan::PlusStrand)); + BOOST_CHECK_EQUAL(span1->length(), str1.length()); + BOOST_CHECK_EQUAL(span1->sequence(), str1); + BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet()); + BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand); +} + +BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_singlestrand ) +{ + std::string str1("AAAAGCT"); + SeqSpanRef span1(new SeqSpan(str1, reduced_dna_alphabet, SeqSpan::SingleStrand)); + BOOST_CHECK_EQUAL(span1->length(), str1.length()); + BOOST_CHECK_EQUAL(span1->sequence(), str1); + BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_dna_alphabet()); + // we always store strands as Plus + BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::SingleStrand); + BOOST_CHECK_EQUAL(span1->sequence(), "AAAAGCT"); + BOOST_CHECK_THROW(span1->subseq(0,2,SeqSpan::OppositeStrand), sequence_invalid_strand); +} + +BOOST_AUTO_TEST_CASE( seqspan_from_string_with_invalidstrand ) +{ + std::string s("AAAAGCT"); + BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::UnknownStrand), sequence_invalid_strand); + BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand); + BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::SameStrand), sequence_invalid_strand); + BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::OppositeStrand), sequence_invalid_strand); + BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand); +} + BOOST_AUTO_TEST_CASE( seqspan_from_seqspan ) { std::string str1("AAGGCCTT"); @@ -78,6 +112,13 @@ BOOST_AUTO_TEST_CASE( seqspan_at ) BOOST_CHECK_EQUAL( str1[2], seq2->at(0) ); BOOST_CHECK_EQUAL( (*seq1)[0], seq1->at(0) ); BOOST_CHECK_EQUAL( (*seq1)[2], (*seq2)[0] ); + + SeqSpanRef seq3 = seq1->subseq(0, 4, SeqSpan::OppositeStrand); + BOOST_CHECK_EQUAL( seq3->at(0), 'C'); + BOOST_CHECK_EQUAL( seq3->at(1), 'C'); + BOOST_CHECK_EQUAL( seq3->at(2), 'T'); + BOOST_CHECK_EQUAL( seq3->at(3), 'T'); + } BOOST_AUTO_TEST_CASE( seqspan_data ) @@ -111,6 +152,21 @@ BOOST_AUTO_TEST_CASE( seqspan_begin_end ) } } +BOOST_AUTO_TEST_CASE( seqspan_subseq_reverse_begin_end ) +{ + std::string str1("AAAACCTT"); + std::string str1rc("AAGGTTTT"); + SeqSpanRef seq1(new SeqSpan(str1)); + SeqSpanRef seq2(new SeqSpan(seq1, 0, SeqSpan::npos, SeqSpan::OppositeStrand )); + + + std::string::const_iterator str1rc_i = str1rc.begin(); + SeqSpan::const_iterator seq2_i = seq2->begin(); + for(; not ((str1rc_i == str1rc.end()) or (seq2_i == seq2->end())); ++str1rc_i, ++seq2_i) { + BOOST_CHECK_EQUAL( *str1rc_i, *seq2_i ); + } +} + BOOST_AUTO_TEST_CASE( seqspan_rbegin_rend ) { std::string str1("AAGGCCTT"); @@ -289,4 +345,37 @@ BOOST_AUTO_TEST_CASE( seqspan_stop_past_end ) s3->setStop(8); BOOST_CHECK_EQUAL( s3->size(), 4); +} + +BOOST_AUTO_TEST_CASE( seqspan_strand_sameother ) +{ + SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG")); + BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand); + + SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::SameStrand); + BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA"); + BOOST_CHECK_EQUAL(seq2->strand(), SeqSpan::PlusStrand); + SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::OppositeStrand); + BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT"); + BOOST_CHECK_EQUAL(seq3->strand(), SeqSpan::MinusStrand); + + // opposite of a plus strand should be minus + SeqSpanRef seq4 = seq2->subseq(0,4,SeqSpan::OppositeStrand); + BOOST_CHECK_EQUAL(seq4->sequence(), "TTTT"); + BOOST_CHECK_EQUAL(seq4->strand(), SeqSpan::MinusStrand); + // opposite of a minus strand should be plus + SeqSpanRef seq5 = seq3->subseq(0,4,SeqSpan::OppositeStrand); + BOOST_CHECK_EQUAL(seq5->sequence(), "AAAA"); + BOOST_CHECK_EQUAL(seq5->strand(), SeqSpan::PlusStrand); +} + +BOOST_AUTO_TEST_CASE( seqspan_strand_plusminus ) +{ + SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG")); + BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand); + + SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::PlusStrand); + BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA"); + SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::MinusStrand); + BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT"); } \ No newline at end of file diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 6da21f1..cf417d9 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -374,8 +374,9 @@ BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna ) { std::string rna_str("AGCUN"); Sequence rna_seq(rna_str, reduced_rna_alphabet); - BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU"); - BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp()); + BOOST_CHECK_EQUAL(rna_seq.rev_comp().get_sequence(), "NAGCU"); + BOOST_CHECK_EQUAL(rna_seq.get_sequence(), + rna_seq.rev_comp().rev_comp().get_sequence()); } BOOST_AUTO_TEST_CASE( sequence_reverse_complement_subseq ) diff --git a/mussa_exceptions.hpp b/mussa_exceptions.hpp index 3c44e60..1c577f4 100644 --- a/mussa_exceptions.hpp +++ b/mussa_exceptions.hpp @@ -71,6 +71,14 @@ public: mussa_error(msg) {}; }; +//! Invalid strand identifier +class sequence_invalid_strand : public mussa_error +{ +public: + explicit sequence_invalid_strand(const std::string& msg) : + mussa_error(msg) {}; +}; + //! Error loading sequence annotation class annotation_load_error : public sequence_load_error { -- 2.30.2