X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=blobdiff_plain;f=alg%2Fseq_span.cpp;h=bf9a3dc3664bb72d40a613a28521795db1265402;hp=2870f0f077f112f57cc2c60857926e2974af1f4b;hb=75496e2c562d728af983c347527270eba360c6ee;hpb=b9755e1974201ff513c66b0fd684bde330c6fff6 diff --git a/alg/seq_span.cpp b/alg/seq_span.cpp index 2870f0f..bf9a3dc 100644 --- a/alg/seq_span.cpp +++ b/alg/seq_span.cpp @@ -9,7 +9,8 @@ SeqSpan::SeqSpan(const SeqSpan &o) : seq(o.seq), seq_start(o.seq_start), seq_count(o.seq_count), - parent(o.parent) + parent(o.parent), + rc_seq(o.rc_seq) { } @@ -17,20 +18,37 @@ SeqSpan::SeqSpan(const SeqSpan *p) : seq(p->seq), seq_start(p->seq_start), seq_count(p->seq_count), - parent(p->parent) + parent(p->parent), + rc_seq(p->rc_seq) { } SeqSpan::SeqSpan(const std::string &seq_, - AlphabetRef a) - : seq(new SeqString(seq_, a)), - seq_start(0), + AlphabetRef a, + strand_type strand_) + : seq_start(0), seq_count(seq_.length()), parent() { + switch (strand_) { + case PlusStrand: + case SingleStrand: + // do nothing + seq_strand = strand_; + seq.reset(new SeqString(seq_, a)); + break; + default: + throw sequence_invalid_strand( + "Can only initialize a Plus or Single strand from a string" + ); + break; + } } -SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_) +SeqSpan::SeqSpan(const SeqSpanRef parent_, + size_type start_, + size_type count_, + strand_type strand_) : seq(parent_->seq), seq_start(parent_->seq_start + start_), parent(parent_) @@ -39,6 +57,62 @@ SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_) seq_count = parent_->seq_count; else seq_count = count_; + + // blech this strand logic is too long + switch(strand_) { + case UnknownStrand: + case PlusStrand: + case MinusStrand: + case BothStrand: + seq_strand = strand_; + break; + case SameStrand: + if (parent) { + seq_strand = parent->strand(); + } else { + throw sequence_invalid_strand("SameStrand is undefined with no parent"); + } + break; + case OppositeStrand: + if (parent) { + switch (parent->strand()) { + case PlusStrand: + seq_strand = MinusStrand; + break; + case MinusStrand: + seq_strand = PlusStrand; + break; + case UnknownStrand: + case BothStrand: + seq_strand = parent->strand(); + break; + case SingleStrand: + throw sequence_invalid_strand( + "OppositeStrand is undefined with SingleStrands" + ); + break; + default: + throw sequence_invalid_strand("Parent has an invalid strand type"); + } + } else { + throw sequence_invalid_strand("SameStrand is undefined with no parent"); + } + break; + case SingleStrand: + if (parent) { + if (parent->strand() == SingleStrand) { + seq_strand = SingleStrand; + } else { + throw sequence_invalid_strand("Can't change single strandedness"); + } + } else { + seq_strand = SingleStrand; + } + break; + default: + throw sequence_invalid_strand("unrecognized strand identifier"); + break; + } } ////// @@ -50,6 +124,7 @@ SeqSpan &SeqSpan::operator=(const SeqSpan& s) seq_start = s.seq_start; seq_count = s.seq_count; parent = s.parent; + rc_seq = s.rc_seq; } return *this; } @@ -76,17 +151,26 @@ bool operator<(const SeqSpan& a, const SeqSpan& b) } } */ -#include + bool operator==(const SeqSpan& a, const SeqSpan& b) { if (SeqSpan::isFamily(a, b)) { // can do fast comparison if (a.seq_start == b.seq_start and a.seq_count == b.seq_count) { - return true; + // unknown strands match anything + if (a.seq_strand == SeqSpan::UnknownStrand or + b.seq_strand == SeqSpan::UnknownStrand) { + return true; + } else { + // otherwise our stranded-ness needs to match + return (a.seq_strand == b.seq_strand); + } } else { return false; } - } + } + // if we're not part of the same seqspan heirarchy we're never equal + // the string equality is handled off in Sequence return false; } @@ -103,7 +187,11 @@ SeqSpan::const_reference SeqSpan::operator[](SeqSpan::size_type i) const SeqSpan::const_reference SeqSpan::at(SeqSpan::size_type i) const { if (!seq) throw std::out_of_range("empty sequence"); - return seq->at(i+seq_start); + if (seq_strand == PlusStrand) { + return seq->at(seq_start+i); + } else { + return seq->rc_map[seq->at((seq_start+seq_count-1)-i)]; + } } const char *SeqSpan::data() const @@ -116,36 +204,62 @@ const char *SeqSpan::data() const SeqSpan::const_iterator SeqSpan::begin() const { - if (seq and seq_count != 0) - return seq->begin()+seq_start; - else - return SeqSpan::const_iterator(0); + if (seq and seq_count != 0) { + if (seq_strand != MinusStrand) { + return seq->begin()+seq_start; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->begin(); + } + } + return SeqSpan::const_iterator(0); } SeqSpan::const_iterator SeqSpan::end() const { if (seq and seq_count != 0) { - return seq->begin() + seq_start + seq_count; - } else { - return SeqSpan::const_iterator(0); - } + if (seq_strand != MinusStrand) { + return seq->begin() + seq_start + seq_count; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->end(); + } + } + return SeqSpan::const_iterator(0); } SeqSpan::const_reverse_iterator SeqSpan::rbegin() const { - if (seq and seq_count != 0) - return seq->rbegin()+(seq->size()-(seq_start+seq_count)); - else - return SeqSpan::const_reverse_iterator(); + if (seq and seq_count != 0) { + if (seq_strand != MinusStrand) { + return seq->rbegin()+(seq->size()-(seq_start+seq_count)); + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->rbegin(); + } + } + return SeqSpan::const_reverse_iterator(); } SeqSpan::const_reverse_iterator SeqSpan::rend() const { if (seq and seq_count != 0) { - return rbegin() + seq_count; - } else { - return SeqSpan::const_reverse_iterator(); + if (seq_strand != MinusStrand) { + return rbegin() + seq_count; + } else { + if (not rc_seq) { + initialize_rc_seq(); + } + return rc_seq->rend(); + } } + return SeqSpan::const_reverse_iterator(); } bool SeqSpan::empty() const @@ -160,7 +274,7 @@ SeqSpan::size_type SeqSpan::find_first_not_of( typedef std::set sequence_set; sequence_set match_set; - for(const_iterator query_item = query.begin(); + for(std::string::const_iterator query_item = query.begin(); query_item != query.end(); ++query_item) { @@ -226,18 +340,23 @@ void SeqSpan::setParentStop(SeqSpan::size_type v) setStop(parent->start() + v); } -SeqSpanRef SeqSpan::subseq(size_type start, size_type count) +SeqSpanRef SeqSpan::subseq(size_type start, size_type count, strand_type strand) { count = std::min(count, seq_count - start); - SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count)); + SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count, strand)); return new_span; } std::string SeqSpan::sequence() const { if (seq) { - return seq->substr(seq_start, seq_count); + if (seq_strand != MinusStrand) + return seq->substr(seq_start, seq_count); + else { + initialize_rc_seq(); + return *rc_seq; + } } else { return std::string(); } @@ -247,3 +366,19 @@ bool SeqSpan::isFamily(const SeqSpan& a, const SeqSpan& b) { return a.seq.get() == b.seq.get(); } + +void SeqSpan::initialize_rc_seq() const +{ + std::string new_rc_seq; + std::string rc_map(seq->get_alphabet().get_complement_map()); + + new_rc_seq.reserve(size()); + size_type i = (seq_start + seq_count); + while( i != seq_start ) { + --i; + new_rc_seq.append(1, rc_map[ seq->at(i) ]); + } + + SeqStringRef new_rc_ref(new SeqString(new_rc_seq, seq->get_alphabet_ref())); + const_cast(&rc_seq)->swap(new_rc_ref); +}