#include <stdexcept>
+#include "mussa_exceptions.hpp"
#include "alg/alphabet.hpp"
// some standard dna alphabets
// Include nl (\012), and cr (\015) to make sequence parsing eol
// convention independent.
const char *Alphabet::reduced_dna_cstr = "AaCcGgTtNn\012\015";
+const char *Alphabet::reduced_dna_reverse_cstr = "TtGgCcAaNn\012\015";
const char *Alphabet::reduced_rna_cstr = "AaCcGgUuNn\012\015";
+const char *Alphabet::reduced_rna_reverse_cstr = "UuGgCcAaNn\012\015";
const char *Alphabet::reduced_nucleic_cstr = "AaCcGgTtUuNn\012\015";
+const char *Alphabet::reduced_nucleic_reverse_cstr = "TtGgCcAaAaNn\012\015";
//! this is the general iupac alphabet for nucleotides
+const char *Alphabet::dna_cstr =
+ "AaCcGgTtRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::dna_reverse_cstr =
+ "TtGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
+const char *Alphabet::rna_cstr =
+ "AaCcGgUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::rna_reverse_cstr =
+ "UuGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
const char *Alphabet::nucleic_cstr =
- "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015";
+ "AaCcGgTtUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::nucleic_reverse_cstr =
+ "TtGgCcAaAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
//! the protein alphabet
const char *Alphabet::protein_cstr =
"AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015";
const char *Alphabet::empty_cstr = "";
const Alphabet& Alphabet::reduced_dna_alphabet() {
- static Alphabet *a = new Alphabet(reduced_dna_cstr);
+ static Alphabet *a = new Alphabet(reduced_dna_cstr, reduced_dna_reverse_cstr);
return *a;
}
const Alphabet& Alphabet::reduced_rna_alphabet() {
- static Alphabet *a = new Alphabet(reduced_rna_cstr);
+ static Alphabet *a = new Alphabet(reduced_rna_cstr, reduced_rna_reverse_cstr);
return *a;
}
const Alphabet& Alphabet::reduced_nucleic_alphabet() {
- static Alphabet *a = new Alphabet(reduced_nucleic_cstr);
+ static Alphabet *a = new Alphabet(reduced_nucleic_cstr, reduced_nucleic_reverse_cstr);
+ return *a;
+}
+const Alphabet& Alphabet::dna_alphabet() {
+ static Alphabet *a = new Alphabet(dna_cstr, dna_reverse_cstr);
+ return *a;
+}
+const Alphabet& Alphabet::rna_alphabet() {
+ static Alphabet *a = new Alphabet(rna_cstr, rna_reverse_cstr);
return *a;
}
const Alphabet& Alphabet::nucleic_alphabet() {
- static Alphabet *a = new Alphabet(nucleic_cstr);
+ static Alphabet *a = new Alphabet(nucleic_cstr, nucleic_reverse_cstr);
return *a;
}
const Alphabet& Alphabet::protein_alphabet() {
- static Alphabet *a = new Alphabet(protein_cstr);
+ static Alphabet *a = new Alphabet(protein_cstr, protein_cstr);
return *a;
}
const Alphabet& Alphabet::empty_alphabet() {
- static Alphabet *a = new Alphabet(empty_cstr);
+ static Alphabet *a = new Alphabet(empty_cstr, empty_cstr);
return *a;
}
-Alphabet::Alphabet(const char *a) :
- alphabet(a)
+Alphabet::Alphabet(const char *a, const char *reverse_alphabet) :
+ alphabet(a),
+ complement_map(create_complement_map(reverse_alphabet))
{
alphabet_set.insert(alphabet.begin(), alphabet.end());
}
void Alphabet::assign(const Alphabet& a)
{
alphabet = a.alphabet;
+ complement_map = a.complement_map;
alphabet_set.clear();
alphabet_set.insert(alphabet.begin(), alphabet.end());
}
return Alphabet::reduced_rna_alphabet();
case ::reduced_nucleic_alphabet:
return Alphabet::reduced_nucleic_alphabet();
+ case ::dna_alphabet:
+ return Alphabet::dna_alphabet();
+ case ::rna_alphabet:
+ return Alphabet::rna_alphabet();
case ::nucleic_alphabet:
return Alphabet::nucleic_alphabet();
case ::protein_alphabet:
- return Alphabet::protein_alphabet();
+ return Alphabet::protein_alphabet();
+ case ::empty_alphabet:
+ return Alphabet::empty_alphabet();
default:
throw std::runtime_error("unrecognized alphabet type");
break;
}
+}
+
+std::string Alphabet::create_complement_map(const std::string& ra) const
+{
+ if (ra.size() == 0) {
+ // reverse complement is undefined
+ throw mussa_error("no reverse complement");
+ }
+ std::string rc_map(256,'~');
+
+ for(int i = 0; alphabet[i] and ra[i]; ++i) {
+ rc_map[ alphabet[i] ] = ra[i];
+ }
+ return rc_map;
+}
+
+boost::shared_ptr<std::string> Alphabet::reverse_complement(const std::string &seq) const
+{
+ boost::shared_ptr<std::string> new_seq(new std::string());
+
+ new_seq->reserve(seq.size());
+
+ for(std::string::const_reverse_iterator seq_i = seq.rbegin();
+ seq_i != seq.rend();
+ ++seq_i)
+ {
+ new_seq->append(1, complement_map[ *seq_i ]);
+ }
+ return new_seq;
}
\ No newline at end of file
#include <boost/serialization/utility.hpp>
#include <boost/serialization/version.hpp>
+#include <boost/shared_ptr.hpp>
+
#include <set>
#include <ostream>
//! this is a helper class for sequence
-enum AlphabetRef { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet,
- nucleic_alphabet, protein_alphabet, empty_alphabet=255 };
+enum AlphabetRef { reduced_dna_alphabet, dna_alphabet,
+ reduced_rna_alphabet, rna_alphabet,
+ reduced_nucleic_alphabet, nucleic_alphabet,
+ protein_alphabet, empty_alphabet=255 };
class Alphabet {
friend class Sequence;
//! return an alphabet given an AlphabetRef enumeration
static const Alphabet &get_alphabet(AlphabetRef);
+ //! return a map to reverse complement an symbols from a nucleic alphabet
+ std::string create_complement_map(const std::string &) const;
+ //! return compelement map
+ std::string get_complement_map() const { return complement_map; }
+
+ //! return a pointer to a reverse complemented string
+ boost::shared_ptr<std::string> reverse_complement(const std::string &) const;
+
// note, if you want to define an alphabet for a sequence, you probably want
// to update the enumeration in Sequence, and Sequence::get_sequence
//! The standard DNA alphabet, with unique, and unknown characters
static const char *reduced_dna_cstr;
+ static const char *reduced_dna_reverse_cstr;
static const Alphabet &reduced_dna_alphabet();
//! The standard RNA alphabet, with unique, and unknown characters
static const char *reduced_rna_cstr;
+ static const char *reduced_rna_reverse_cstr;
static const Alphabet &reduced_rna_alphabet();
- //! The standard DNA/RNA alphabet, with unique, and unknown characters
+ //! The full IUPAC DNA alphabet, with unique, and unknown characters
+ static const char *dna_cstr;
+ static const char *dna_reverse_cstr;
+ static const Alphabet &dna_alphabet();
+ //! the full IUPAC RNA alphabet
+ static const char *rna_cstr;
+ static const char *rna_reverse_cstr;
+ static const Alphabet &rna_alphabet();
+ //! reduced (DNA/RNA) nucelic alphabet
static const char *reduced_nucleic_cstr;
+ static const char *reduced_nucleic_reverse_cstr;
static const Alphabet &reduced_nucleic_alphabet();
- //! this is the general IUPAC alphabet for nucleotides
+ //! reduced (DNA/RNA) nucelic alphabet
static const char *nucleic_cstr;
+ static const char *nucleic_reverse_cstr;
static const Alphabet &nucleic_alphabet();
//! the protein alphabet
static const char *protein_cstr;
private:
//! what are allowable symbols in our alphabet
std::string alphabet;
+ std::string complement_map;
//! internal variable to make exists() faster
std::set<std::string::value_type> alphabet_set;
//! some necessary string api access
- Alphabet(const char *a);
+ Alphabet(const char *a, const char *reverse_a);
//! allow sequence to copy one alphabet to another (needed when unserializing)
void assign(const Alphabet& a);
const_iterator begin() const { return alphabet.begin(); }
template<class Archive>
void serialize(Archive& ar, const unsigned int /*version*/) {
ar & BOOST_SERIALIZATION_NVP(alphabet);
+ ar & BOOST_SERIALIZATION_NVP(complement_map);
alphabet_set.clear();
alphabet_set.insert(alphabet.begin(), alphabet.end());
}
int start_i, seq1_i, seq2_i, win_i; // loop variables
int matches; // number of matches in to a window
int i2_offset;
- const char *seq1 = sseq1.data();
- const char *seq2 = sseq2.data();
+ std::string sseq1_str(sseq1.get_sequence());
+ std::string sseq2_str(sseq2.get_sequence());
+ const char *seq1 = sseq1_str.c_str();
+ const char *seq2 = sseq2_str.c_str();
validate_sequence(sseq1);
validate_sequence(sseq2);
Iterator end) const
{
float red, green, blue, alpha;
- Sequence seq(parsed->sequence, nucleic_alphabet);
+ Sequence seq(parsed->sequence, dna_alphabet, SeqSpan::PlusStrand);
seq.set_fasta_header(parsed->name);
alpha = 1.0;
void motif_parser::ParsedMotifs::parse(const std::string &data)
{
- const char *alphabet = Alphabet::nucleic_cstr;
+ const char *alphabet = Alphabet::dna_cstr;
// parse our string
spirit::parse_info<std::string::const_iterator> result;
void Mussa::load_motifs(std::istream &in)
{
std::string data;
- const char *alphabet = Alphabet::nucleic_cstr;
+ const char *alphabet = Alphabet::dna_cstr;
motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper);
// slurp our data into a string
SeqString(AlphabetRef a=reduced_nucleic_alphabet) :
std::string(),
- alphabet(a)
+ alphabet(a),
+ rc_map(Alphabet::get_alphabet(a).get_complement_map())
{}
SeqString(const std::string &s, AlphabetRef a=reduced_nucleic_alphabet) :
std::string(s),
- alphabet(a)
+ alphabet(a),
+ rc_map(Alphabet::get_alphabet(a).get_complement_map())
{}
AlphabetRef get_alphabet_ref() { return alphabet; }
const Alphabet& get_alphabet() const;
std::string create_reverse_complement_map() const;
+
+ const std::string rc_map;
private:
AlphabetRef alphabet;
friend class boost::serialization::access;
: seq(o.seq),
seq_start(o.seq_start),
seq_count(o.seq_count),
- parent(o.parent)
+ parent(o.parent),
+ rc_seq(o.rc_seq)
{
}
: seq(p->seq),
seq_start(p->seq_start),
seq_count(p->seq_count),
- parent(p->parent)
+ parent(p->parent),
+ rc_seq(p->rc_seq)
{
}
SeqSpan::SeqSpan(const std::string &seq_,
- AlphabetRef a)
- : seq(new SeqString(seq_, a)),
- seq_start(0),
+ AlphabetRef a,
+ strand_type strand_)
+ : seq_start(0),
seq_count(seq_.length()),
parent()
{
+ switch (strand_) {
+ case PlusStrand:
+ case SingleStrand:
+ // do nothing
+ seq_strand = strand_;
+ seq.reset(new SeqString(seq_, a));
+ break;
+ default:
+ throw sequence_invalid_strand(
+ "Can only initialize a Plus or Single strand from a string"
+ );
+ break;
+ }
}
-SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_)
+SeqSpan::SeqSpan(const SeqSpanRef parent_,
+ size_type start_,
+ size_type count_,
+ strand_type strand_)
: seq(parent_->seq),
seq_start(parent_->seq_start + start_),
parent(parent_)
seq_count = parent_->seq_count;
else
seq_count = count_;
+
+ // blech this strand logic is too long
+ switch(strand_) {
+ case UnknownStrand:
+ case PlusStrand:
+ case MinusStrand:
+ case BothStrand:
+ seq_strand = strand_;
+ break;
+ case SameStrand:
+ if (parent) {
+ seq_strand = parent->strand();
+ } else {
+ throw sequence_invalid_strand("SameStrand is undefined with no parent");
+ }
+ break;
+ case OppositeStrand:
+ if (parent) {
+ switch (parent->strand()) {
+ case PlusStrand:
+ seq_strand = MinusStrand;
+ break;
+ case MinusStrand:
+ seq_strand = PlusStrand;
+ break;
+ case UnknownStrand:
+ case BothStrand:
+ seq_strand = parent->strand();
+ break;
+ case SingleStrand:
+ throw sequence_invalid_strand(
+ "OppositeStrand is undefined with SingleStrands"
+ );
+ break;
+ default:
+ throw sequence_invalid_strand("Parent has an invalid strand type");
+ }
+ } else {
+ throw sequence_invalid_strand("SameStrand is undefined with no parent");
+ }
+ break;
+ case SingleStrand:
+ if (parent) {
+ if (parent->strand() == SingleStrand) {
+ seq_strand = SingleStrand;
+ } else {
+ throw sequence_invalid_strand("Can't change single strandedness");
+ }
+ } else {
+ seq_strand = SingleStrand;
+ }
+ break;
+ default:
+ throw sequence_invalid_strand("unrecognized strand identifier");
+ break;
+ }
}
//////
seq_start = s.seq_start;
seq_count = s.seq_count;
parent = s.parent;
+ rc_seq = s.rc_seq;
}
return *this;
}
}
}
*/
-#include <iostream>
+
bool operator==(const SeqSpan& a, const SeqSpan& b)
{
if (SeqSpan::isFamily(a, b)) {
// can do fast comparison
if (a.seq_start == b.seq_start and a.seq_count == b.seq_count) {
- return true;
+ // unknown strands match anything
+ if (a.seq_strand == SeqSpan::UnknownStrand or
+ b.seq_strand == SeqSpan::UnknownStrand) {
+ return true;
+ } else {
+ // otherwise our stranded-ness needs to match
+ return (a.seq_strand == b.seq_strand);
+ }
} else {
return false;
}
- }
+ }
+ // if we're not part of the same seqspan heirarchy we're never equal
+ // the string equality is handled off in Sequence
return false;
}
SeqSpan::const_reference SeqSpan::at(SeqSpan::size_type i) const
{
if (!seq) throw std::out_of_range("empty sequence");
- return seq->at(i+seq_start);
+ if (seq_strand == PlusStrand) {
+ return seq->at(seq_start+i);
+ } else {
+ return seq->rc_map[seq->at((seq_start+seq_count-1)-i)];
+ }
}
const char *SeqSpan::data() const
SeqSpan::const_iterator SeqSpan::begin() const
{
- if (seq and seq_count != 0)
- return seq->begin()+seq_start;
- else
- return SeqSpan::const_iterator(0);
+ if (seq and seq_count != 0) {
+ if (seq_strand != MinusStrand) {
+ return seq->begin()+seq_start;
+ } else {
+ if (not rc_seq) {
+ initialize_rc_seq();
+ }
+ return rc_seq->begin();
+ }
+ }
+ return SeqSpan::const_iterator(0);
}
SeqSpan::const_iterator SeqSpan::end() const
{
if (seq and seq_count != 0) {
- return seq->begin() + seq_start + seq_count;
- } else {
- return SeqSpan::const_iterator(0);
- }
+ if (seq_strand != MinusStrand) {
+ return seq->begin() + seq_start + seq_count;
+ } else {
+ if (not rc_seq) {
+ initialize_rc_seq();
+ }
+ return rc_seq->end();
+ }
+ }
+ return SeqSpan::const_iterator(0);
}
SeqSpan::const_reverse_iterator SeqSpan::rbegin() const
{
- if (seq and seq_count != 0)
- return seq->rbegin()+(seq->size()-(seq_start+seq_count));
- else
- return SeqSpan::const_reverse_iterator();
+ if (seq and seq_count != 0) {
+ if (seq_strand != MinusStrand) {
+ return seq->rbegin()+(seq->size()-(seq_start+seq_count));
+ } else {
+ if (not rc_seq) {
+ initialize_rc_seq();
+ }
+ return rc_seq->rbegin();
+ }
+ }
+ return SeqSpan::const_reverse_iterator();
}
SeqSpan::const_reverse_iterator SeqSpan::rend() const
{
if (seq and seq_count != 0) {
- return rbegin() + seq_count;
- } else {
- return SeqSpan::const_reverse_iterator();
+ if (seq_strand != MinusStrand) {
+ return rbegin() + seq_count;
+ } else {
+ if (not rc_seq) {
+ initialize_rc_seq();
+ }
+ return rc_seq->rend();
+ }
}
+ return SeqSpan::const_reverse_iterator();
}
bool SeqSpan::empty() const
typedef std::set<std::string::value_type> sequence_set;
sequence_set match_set;
- for(const_iterator query_item = query.begin();
+ for(std::string::const_iterator query_item = query.begin();
query_item != query.end();
++query_item)
{
setStop(parent->start() + v);
}
-SeqSpanRef SeqSpan::subseq(size_type start, size_type count)
+SeqSpanRef SeqSpan::subseq(size_type start, size_type count, strand_type strand)
{
count = std::min<size_type>(count, seq_count - start);
- SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count));
+ SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count, strand));
return new_span;
}
std::string SeqSpan::sequence() const
{
if (seq) {
- return seq->substr(seq_start, seq_count);
+ if (seq_strand != MinusStrand)
+ return seq->substr(seq_start, seq_count);
+ else {
+ initialize_rc_seq();
+ return *rc_seq;
+ }
} else {
return std::string();
}
{
return a.seq.get() == b.seq.get();
}
+
+void SeqSpan::initialize_rc_seq() const
+{
+ std::string new_rc_seq;
+ std::string rc_map(seq->get_alphabet().get_complement_map());
+
+ new_rc_seq.reserve(size());
+ size_type i = (seq_start + seq_count);
+ while( i != seq_start ) {
+ --i;
+ new_rc_seq.append(1, rc_map[ seq->at(i) ]);
+ }
+
+ SeqStringRef new_rc_ref(new SeqString(new_rc_seq, seq->get_alphabet_ref()));
+ const_cast<SeqStringRef *>(&rc_seq)->swap(new_rc_ref);
+}
typedef SeqString::size_type size_type;
typedef SeqString::value_type value_type;
static const size_type npos = SeqString::npos;
+ //! Define strand types
+ /**!
+ * Unknown strand is treated as "either" strand
+ * Plus refers to the initially created strand
+ * Minus is the opposite strand
+ * Both is for any feature that applies to "both" strands
+ * (which may not actually be useful)
+ * Same strand is only used when creating a subsequence
+ * and implies the subsequence has the same orientation as the parent
+ * Opposite is only used for creating a subsequence
+ * and implies the subsequence has the opposite orientation as the parent
+ * Single indicates that this is single stranded and there can't be
+ * an opposite strand.
+ */
+ enum strand_type { UnknownStrand, MinusStrand, PlusStrand,
+ BothStrand, SameStrand, OppositeStrand, SingleStrand };
public:
SeqSpan(const SeqSpan &);
SeqSpan(const SeqSpan *);
explicit SeqSpan(const std::string &,
- AlphabetRef = reduced_nucleic_alphabet
+ AlphabetRef a = dna_alphabet,
+ strand_type strand=PlusStrand
);
- SeqSpan(const SeqSpanRef, size_type start=0, size_type count=npos);
+ SeqSpan(const SeqSpanRef,
+ size_type start=0,
+ size_type count=npos,
+ strand_type strand=SameStrand);
//! assignment
SeqSpan& operator=(const SeqSpan&);
size_type stop() const { return seq_start + seq_count; }
//! set one past the last position relative to the root sequence.
void setStop(size_type);
+ strand_type strand() const { return seq_strand; }
//! get start position relative to the parent sequence
size_type parentStart() const;
//! return a subsequence, copying over any appropriate annotation
- SeqSpanRef subseq(size_type start=0, size_type count = std::string::npos);
+ SeqSpanRef subseq(size_type start=0,
+ size_type count = std::string::npos,
+ strand_type = PlusStrand);
//! get sequence
std::string sequence() const;
//! are both sequences derived from the same sequence tree?
static bool isFamily(const SeqSpan& a, const SeqSpan& b);
+ //! fill in our rc_seq variable
+ void initialize_rc_seq() const;
+
friend class Sequence;
private:
//! do not statically initialize, only create with new
size_type seq_start;
//! how big we ware
size_type seq_count;
- // Do I need to track the strand here?
+ //! strand orientation
+ strand_type seq_strand;
//! keep a reference to who our parent span is
SeqSpanRef parent;
+ //! hold a reverse complement version of our sequence if needed
+ SeqStringRef rc_seq;
+
// boost::serialization support
friend class boost::serialization::access;
template<class Archive>
ar & BOOST_SERIALIZATION_NVP(seq);
ar & BOOST_SERIALIZATION_NVP(seq_start);
ar & BOOST_SERIALIZATION_NVP(seq_count);
+ ar & BOOST_SERIALIZATION_NVP(seq_strand);
ar & BOOST_SERIALIZATION_NVP(parent);
}
};
Sequence::Sequence(AlphabetRef alphabet)
- : seq(new SeqSpan("", alphabet)),
- strand(UnknownStrand)
+ : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand))
{
}
{
}
-Sequence::Sequence(const char *seq, AlphabetRef alphabet_)
- : strand(UnknownStrand),
- header(""),
+Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
+ : header(""),
species("")
{
- set_filtered_sequence(seq, alphabet_);
+ set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
}
-Sequence::Sequence(const std::string& seq, AlphabetRef alphabet_)
- : strand(UnknownStrand),
- header(""),
+Sequence::Sequence(const std::string& seq,
+ AlphabetRef alphabet_,
+ SeqSpan::strand_type strand_)
+ : header(""),
species("")
{
- set_filtered_sequence(seq, alphabet_);
+ set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
}
Sequence::Sequence(const Sequence& o)
: seq(o.seq),
- strand(o.strand),
header(o.header),
species(o.species),
annots(o.annots),
Sequence::Sequence(const Sequence* o)
: seq(o->seq),
- strand(o->strand),
header(o->header),
species(o->species),
annots(o->annots),
Sequence::Sequence(const SeqSpanRef& seq_ref)
: seq(seq_ref),
- strand(UnknownStrand),
header(""),
species("")
{
{
if (this != &s) {
seq = s.seq;
- strand = s.strand;
header = s.header;
species = s.species;
annots = s.annots;
std::string rev_comp;
std::string sequence_raw;
std::string seq_tmp; // holds sequence during basic filtering
- const Alphabet &alpha = get_alphabet();
+ const Alphabet &alpha = Alphabet::get_alphabet(a);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
std::string msg("The selected sequence appears to be empty");
throw sequence_empty_error(msg);
}
- set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
+ set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
} else {
std::string errormsg("There were no fasta sequences");
throw sequence_empty_file_error(errormsg);
AlphabetRef alphabet_,
size_type start,
size_type count,
- strand_type strand_)
+ SeqSpan::strand_type strand_)
{
if ( count == npos)
count = in_seq.size() - start;
new_seq.append(1, 'N');
}
}
- SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_));
+ SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
seq = new_seq_ref;
- strand = strand_;
}
void
}
Sequence
-Sequence::subseq(size_type start, size_type count) const
+Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
{
+ // FIXME: should i really allow a subsequence of an empty sequence?
if (!seq) {
Sequence new_seq;
return new_seq;
}
Sequence new_seq = *this;
- new_seq.seq = seq->subseq(start, count);
+ new_seq.seq = seq->subseq(start, count, strand);
copy_children(new_seq, start, count);
// FIXME: This needs to be moved into SeqSpan
Sequence Sequence::rev_comp() const
{
- std::string rev_comp;
- rev_comp.reserve(length());
-
- std::string rc_map = seq->seq->create_reverse_complement_map();
-
- // reverse and convert
- Sequence::const_reverse_iterator seq_i;
- Sequence::const_reverse_iterator seq_end = rend();
- for(seq_i = rbegin();
- seq_i != seq_end;
- ++seq_i)
- {
- rev_comp.append(1, rc_map[*seq_i]);
- }
- return Sequence(rev_comp, seq->seq->get_alphabet_ref());
+ // a reverse complement is the whole opposite strand
+ return subseq(0, npos, SeqSpan::OppositeStrand);
}
const Alphabet& Sequence::get_alphabet() const
void Sequence::set_sequence(const std::string& s, AlphabetRef a)
{
- set_filtered_sequence(s, a);
+ set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
}
std::string Sequence::get_sequence() const
Sequence::clear()
{
seq.reset();
- strand = UnknownStrand;
header.clear();
species.clear();
annots.clear();
}
getline(load_file, file_data_line);
// looks like the sequence is written as a single line
- set_filtered_sequence(file_data_line, reduced_dna_alphabet);
+ set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
typedef SeqString::const_reference const_reference;
typedef SeqString::size_type size_type;
static const size_type npos = SeqString::npos;
- enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand };
- Sequence(AlphabetRef a = reduced_nucleic_alphabet);
+ Sequence(AlphabetRef a = reduced_dna_alphabet);
Sequence(const char* seq,
- AlphabetRef a = reduced_nucleic_alphabet);
+ AlphabetRef a = reduced_dna_alphabet,
+ SeqSpan::strand_type strand = SeqSpan::PlusStrand);
Sequence(const std::string& seq,
- AlphabetRef a = reduced_nucleic_alphabet);
+ AlphabetRef a = reduced_dna_alphabet,
+ SeqSpan::strand_type strand = SeqSpan::PlusStrand);
Sequence(const Sequence& seq);
Sequence(const Sequence *);
Sequence(const SeqSpanRef&);
//! set sequence to a (sub)string containing nothing but AGCTN
void set_filtered_sequence(const std::string& seq,
- AlphabetRef a,
+ AlphabetRef a=reduced_dna_alphabet,
size_type start=0,
size_type count=npos,
- strand_type strand=UnknownStrand);
+ SeqSpan::strand_type strand=SeqSpan::PlusStrand);
//! retrive element at specific position
const_reference at(size_type i) const { return seq->at(i); }
size_type stop() const { return seq->parentStop(); }
//! return a subsequence, copying over any appropriate annotation
- Sequence subseq(size_type start=0, size_type count = npos) const;
+ Sequence subseq(size_type start=0,
+ size_type count = npos,
+ SeqSpan::strand_type strand = SeqSpan::SameStrand) const;
//! reverse a character
std::string create_reverse_map() const;
//! return a reverse compliment (this needs to be improved?)
protected:
SeqSpanRef seq;
- //! strand orientation
- strand_type strand;
//! fasta header
std::string header;
//! species name
template<class Archive>
void serialize(Archive& ar, const unsigned int /*version*/) {
ar & BOOST_SERIALIZATION_NVP(seq);
- ar & BOOST_SERIALIZATION_NVP(strand);
ar & BOOST_SERIALIZATION_NVP(header);
ar & BOOST_SERIALIZATION_NVP(species);
ar & BOOST_SERIALIZATION_NVP(annots);
{
Alphabet a(Alphabet::reduced_dna_alphabet());
BOOST_CHECK_EQUAL( a, Alphabet::reduced_dna_alphabet() );
+}
+
+/*
+BOOST_AUTO_TEST_CASE( alphabet_rc_invalid )
+{
+ Alphabet a1 = Alphabet::get_alphabet(empty_alphabet);
+ BOOST_CHECK_THROW(a1.get_complement_map(), mussa_error);
+
+ Alphabet a2 = Alphabet::get_alphabet(protein_alphabet);
+ BOOST_CHECK_THROW(a2.create_complement_map(), mussa_error);
+}
+*/
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_dna)
+{
+ Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet);
+
+ std::string rc_map = a.get_complement_map();
+ BOOST_CHECK_EQUAL(rc_map['A'], 'T');
+ BOOST_CHECK_EQUAL(rc_map['a'], 't');
+ BOOST_CHECK_EQUAL(rc_map['T'], 'A');
+ BOOST_CHECK_EQUAL(rc_map['t'], 'a');
+ BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+ BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+ BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+ BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+ BOOST_CHECK_EQUAL(rc_map['U'], '~');
+ BOOST_CHECK_EQUAL(rc_map['u'], '~');
+ BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+ BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_rna)
+{
+ Alphabet a = Alphabet::get_alphabet(reduced_rna_alphabet);
+
+ std::string rc_map = a.get_complement_map();
+ BOOST_CHECK_EQUAL(rc_map['A'], 'U');
+ BOOST_CHECK_EQUAL(rc_map['a'], 'u');
+ BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+ BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+ BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+ BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+ BOOST_CHECK_EQUAL(rc_map['T'], '~');
+ BOOST_CHECK_EQUAL(rc_map['t'], '~');
+ BOOST_CHECK_EQUAL(rc_map['U'], 'A');
+ BOOST_CHECK_EQUAL(rc_map['u'], 'a');
+ BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+ BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_dna)
+{
+ Alphabet a = Alphabet::get_alphabet(dna_alphabet);
+
+ std::string rc_map = a.get_complement_map();
+ BOOST_CHECK_EQUAL(rc_map['A'], 'T');
+ BOOST_CHECK_EQUAL(rc_map['a'], 't');
+ BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+ BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+ BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+ BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+ BOOST_CHECK_EQUAL(rc_map['T'], 'A');
+ BOOST_CHECK_EQUAL(rc_map['t'], 'a');
+ BOOST_CHECK_EQUAL(rc_map['R'], 'Y');
+ BOOST_CHECK_EQUAL(rc_map['r'], 'y');
+ BOOST_CHECK_EQUAL(rc_map['Y'], 'R');
+ BOOST_CHECK_EQUAL(rc_map['y'], 'r');
+ BOOST_CHECK_EQUAL(rc_map['S'], 'S');
+ BOOST_CHECK_EQUAL(rc_map['s'], 's');
+ BOOST_CHECK_EQUAL(rc_map['W'], 'W');
+ BOOST_CHECK_EQUAL(rc_map['w'], 'w');
+ BOOST_CHECK_EQUAL(rc_map['K'], 'M');
+ BOOST_CHECK_EQUAL(rc_map['k'], 'm');
+ BOOST_CHECK_EQUAL(rc_map['M'], 'K');
+ BOOST_CHECK_EQUAL(rc_map['m'], 'k');
+ BOOST_CHECK_EQUAL(rc_map['B'], 'V');
+ BOOST_CHECK_EQUAL(rc_map['b'], 'v');
+ BOOST_CHECK_EQUAL(rc_map['V'], 'B');
+ BOOST_CHECK_EQUAL(rc_map['v'], 'b');
+ BOOST_CHECK_EQUAL(rc_map['D'], 'H');
+ BOOST_CHECK_EQUAL(rc_map['d'], 'h');
+ BOOST_CHECK_EQUAL(rc_map['H'], 'D');
+ BOOST_CHECK_EQUAL(rc_map['h'], 'd');
+ BOOST_CHECK_EQUAL(rc_map['N'], 'N');
+ BOOST_CHECK_EQUAL(rc_map['n'], 'n');
+
+ BOOST_CHECK_EQUAL(rc_map['U'], '~');
+ BOOST_CHECK_EQUAL(rc_map['u'], '~');
+ BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+ BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_rna)
+{
+ Alphabet a = Alphabet::get_alphabet(rna_alphabet);
+
+ std::string rc_map = a.get_complement_map();
+ BOOST_CHECK_EQUAL(rc_map['A'], 'U');
+ BOOST_CHECK_EQUAL(rc_map['a'], 'u');
+ BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+ BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+ BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+ BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+ BOOST_CHECK_EQUAL(rc_map['U'], 'A');
+ BOOST_CHECK_EQUAL(rc_map['u'], 'a');
+ BOOST_CHECK_EQUAL(rc_map['R'], 'Y');
+ BOOST_CHECK_EQUAL(rc_map['r'], 'y');
+ BOOST_CHECK_EQUAL(rc_map['Y'], 'R');
+ BOOST_CHECK_EQUAL(rc_map['y'], 'r');
+ BOOST_CHECK_EQUAL(rc_map['S'], 'S');
+ BOOST_CHECK_EQUAL(rc_map['s'], 's');
+ BOOST_CHECK_EQUAL(rc_map['W'], 'W');
+ BOOST_CHECK_EQUAL(rc_map['w'], 'w');
+ BOOST_CHECK_EQUAL(rc_map['K'], 'M');
+ BOOST_CHECK_EQUAL(rc_map['k'], 'm');
+ BOOST_CHECK_EQUAL(rc_map['M'], 'K');
+ BOOST_CHECK_EQUAL(rc_map['m'], 'k');
+ BOOST_CHECK_EQUAL(rc_map['B'], 'V');
+ BOOST_CHECK_EQUAL(rc_map['b'], 'v');
+ BOOST_CHECK_EQUAL(rc_map['V'], 'B');
+ BOOST_CHECK_EQUAL(rc_map['v'], 'b');
+ BOOST_CHECK_EQUAL(rc_map['D'], 'H');
+ BOOST_CHECK_EQUAL(rc_map['d'], 'h');
+ BOOST_CHECK_EQUAL(rc_map['H'], 'D');
+ BOOST_CHECK_EQUAL(rc_map['h'], 'd');
+ BOOST_CHECK_EQUAL(rc_map['N'], 'N');
+ BOOST_CHECK_EQUAL(rc_map['n'], 'n');
+
+ BOOST_CHECK_EQUAL(rc_map['T'], '~');
+ BOOST_CHECK_EQUAL(rc_map['t'], '~');
+ BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+ BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+
+//enum AlphabetRef { reduced_nucleic_alphabet, nucleic_alphabet, };
+BOOST_AUTO_TEST_CASE( alphabet_reverse_complement )
+{
+ Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet);
+ std::string seq("AAAAGCT");
+ std::string known_rc_seq("AGCTTTT");
+ boost::shared_ptr<std::string> new_seq(a.reverse_complement(seq));
+
+ BOOST_CHECK_EQUAL(*new_seq, known_rc_seq);
+
}
\ No newline at end of file
}
// such an exciting unit test, making sure that a=b; a==b
-BOOST_AUTO_TEST_CASE( seqstring_string_alphabet )
+BOOST_AUTO_TEST_CASE( seqstring_string_nucleic_alphabet )
{
SeqString s("AGCT", nucleic_alphabet);
BOOST_CHECK_EQUAL(s.get_alphabet_ref(), nucleic_alphabet);
BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::nucleic_alphabet());
BOOST_CHECK_EQUAL(s.size(), 4);
+}
+
+// such an exciting unit test, making sure that a=b; a==b
+BOOST_AUTO_TEST_CASE( seqstring_string_dna_alphabet )
+{
+ SeqString s("AGCT", dna_alphabet);
+
+ BOOST_CHECK_EQUAL(s.get_alphabet_ref(), dna_alphabet);
+ BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::dna_alphabet());
+ BOOST_CHECK_EQUAL(s.size(), 4);
}
\ No newline at end of file
SeqSpanRef span1(new SeqSpan(str1));
BOOST_CHECK_EQUAL(span1->length(), str1.length());
BOOST_CHECK_EQUAL(span1->sequence(), str1);
+ BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand);
}
BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet )
BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet());
}
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_plusstrand )
+{
+ std::string str1("AAGGCCUU");
+ SeqSpanRef span1(new SeqSpan(str1, reduced_rna_alphabet, SeqSpan::PlusStrand));
+ BOOST_CHECK_EQUAL(span1->length(), str1.length());
+ BOOST_CHECK_EQUAL(span1->sequence(), str1);
+ BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet());
+ BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_singlestrand )
+{
+ std::string str1("AAAAGCT");
+ SeqSpanRef span1(new SeqSpan(str1, reduced_dna_alphabet, SeqSpan::SingleStrand));
+ BOOST_CHECK_EQUAL(span1->length(), str1.length());
+ BOOST_CHECK_EQUAL(span1->sequence(), str1);
+ BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_dna_alphabet());
+ // we always store strands as Plus
+ BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::SingleStrand);
+ BOOST_CHECK_EQUAL(span1->sequence(), "AAAAGCT");
+ BOOST_CHECK_THROW(span1->subseq(0,2,SeqSpan::OppositeStrand), sequence_invalid_strand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_invalidstrand )
+{
+ std::string s("AAAAGCT");
+ BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::UnknownStrand), sequence_invalid_strand);
+ BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand);
+ BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::SameStrand), sequence_invalid_strand);
+ BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::OppositeStrand), sequence_invalid_strand);
+ BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand);
+}
+
BOOST_AUTO_TEST_CASE( seqspan_from_seqspan )
{
std::string str1("AAGGCCTT");
BOOST_CHECK_EQUAL( str1[2], seq2->at(0) );
BOOST_CHECK_EQUAL( (*seq1)[0], seq1->at(0) );
BOOST_CHECK_EQUAL( (*seq1)[2], (*seq2)[0] );
+
+ SeqSpanRef seq3 = seq1->subseq(0, 4, SeqSpan::OppositeStrand);
+ BOOST_CHECK_EQUAL( seq3->at(0), 'C');
+ BOOST_CHECK_EQUAL( seq3->at(1), 'C');
+ BOOST_CHECK_EQUAL( seq3->at(2), 'T');
+ BOOST_CHECK_EQUAL( seq3->at(3), 'T');
+
}
BOOST_AUTO_TEST_CASE( seqspan_data )
}
}
+BOOST_AUTO_TEST_CASE( seqspan_subseq_reverse_begin_end )
+{
+ std::string str1("AAAACCTT");
+ std::string str1rc("AAGGTTTT");
+ SeqSpanRef seq1(new SeqSpan(str1));
+ SeqSpanRef seq2(new SeqSpan(seq1, 0, SeqSpan::npos, SeqSpan::OppositeStrand ));
+
+
+ std::string::const_iterator str1rc_i = str1rc.begin();
+ SeqSpan::const_iterator seq2_i = seq2->begin();
+ for(; not ((str1rc_i == str1rc.end()) or (seq2_i == seq2->end())); ++str1rc_i, ++seq2_i) {
+ BOOST_CHECK_EQUAL( *str1rc_i, *seq2_i );
+ }
+}
+
BOOST_AUTO_TEST_CASE( seqspan_rbegin_rend )
{
std::string str1("AAGGCCTT");
s3->setStop(8);
BOOST_CHECK_EQUAL( s3->size(), 4);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_strand_sameother )
+{
+ SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG"));
+ BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand);
+
+ SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::SameStrand);
+ BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA");
+ BOOST_CHECK_EQUAL(seq2->strand(), SeqSpan::PlusStrand);
+ SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::OppositeStrand);
+ BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT");
+ BOOST_CHECK_EQUAL(seq3->strand(), SeqSpan::MinusStrand);
+
+ // opposite of a plus strand should be minus
+ SeqSpanRef seq4 = seq2->subseq(0,4,SeqSpan::OppositeStrand);
+ BOOST_CHECK_EQUAL(seq4->sequence(), "TTTT");
+ BOOST_CHECK_EQUAL(seq4->strand(), SeqSpan::MinusStrand);
+ // opposite of a minus strand should be plus
+ SeqSpanRef seq5 = seq3->subseq(0,4,SeqSpan::OppositeStrand);
+ BOOST_CHECK_EQUAL(seq5->sequence(), "AAAA");
+ BOOST_CHECK_EQUAL(seq5->strand(), SeqSpan::PlusStrand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_strand_plusminus )
+{
+ SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG"));
+ BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand);
+
+ SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::PlusStrand);
+ BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA");
+ SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::MinusStrand);
+ BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT");
}
\ No newline at end of file
{
std::string rna_str("AGCUN");
Sequence rna_seq(rna_str, reduced_rna_alphabet);
- BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");
- BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
+ BOOST_CHECK_EQUAL(rna_seq.rev_comp().get_sequence(), "NAGCU");
+ BOOST_CHECK_EQUAL(rna_seq.get_sequence(),
+ rna_seq.rev_comp().rev_comp().get_sequence());
}
BOOST_AUTO_TEST_CASE( sequence_reverse_complement_subseq )
mussa_error(msg) {};
};
+//! Invalid strand identifier
+class sequence_invalid_strand : public mussa_error
+{
+public:
+ explicit sequence_invalid_strand(const std::string& msg) :
+ mussa_error(msg) {};
+};
+
//! Error loading sequence annotation
class annotation_load_error : public sequence_load_error
{