move strand into seqspan

author Diane Trout <diane@caltech.edu>

Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)

committer Diane Trout <diane@caltech.edu>

Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)
author Diane Trout <diane@caltech.edu>
Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)
committer Diane Trout <diane@caltech.edu>
Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)
diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp

index 56919199b6c7ccf30c05a6471b95187c6075f150..32796ca662196165d27692bc3c3ecaa56093ea25 100644 (file)
--- a/alg/alphabet.cpp
+++ b/alg/alphabet.cpp
@@ -1,48 +1,71 @@
  #include <stdexcept>
  
+#include "mussa_exceptions.hpp"
  #include "alg/alphabet.hpp"
  
  // some standard dna alphabets 
  // Include nl (\012), and cr (\015) to make sequence parsing eol 
  // convention independent.
  const char *Alphabet::reduced_dna_cstr = "AaCcGgTtNn\012\015";
+const char *Alphabet::reduced_dna_reverse_cstr = "TtGgCcAaNn\012\015";
  const char *Alphabet::reduced_rna_cstr = "AaCcGgUuNn\012\015";
+const char *Alphabet::reduced_rna_reverse_cstr = "UuGgCcAaNn\012\015";
  const char *Alphabet::reduced_nucleic_cstr = "AaCcGgTtUuNn\012\015";
+const char *Alphabet::reduced_nucleic_reverse_cstr = "TtGgCcAaAaNn\012\015";
  //! this is the general iupac alphabet for nucleotides
+const char *Alphabet::dna_cstr =
+  "AaCcGgTtRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::dna_reverse_cstr =
+  "TtGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
+const char *Alphabet::rna_cstr =
+  "AaCcGgUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::rna_reverse_cstr =
+  "UuGgCcAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
  const char *Alphabet::nucleic_cstr =
-  "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015";
+  "AaCcGgTtUuRrYySsWwKkMmBbDdHhVvNnXx-~.?\012\015";
+const char *Alphabet::nucleic_reverse_cstr =
+  "TtGgCcAaAaYyRrSsWwMmKkVvHhDdBbNnXx-~.?\012\015";
  //! the protein alphabet
  const char *Alphabet::protein_cstr = 
    "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015";
  const char *Alphabet::empty_cstr = "";
  
  const Alphabet& Alphabet::reduced_dna_alphabet() {
-  static Alphabet *a = new Alphabet(reduced_dna_cstr);
+  static Alphabet *a = new Alphabet(reduced_dna_cstr, reduced_dna_reverse_cstr);
    return *a;
  }
  const Alphabet& Alphabet::reduced_rna_alphabet() {
-  static Alphabet *a = new Alphabet(reduced_rna_cstr);
+  static Alphabet *a = new Alphabet(reduced_rna_cstr, reduced_rna_reverse_cstr);
    return *a;
  }
  const Alphabet& Alphabet::reduced_nucleic_alphabet() {
-  static Alphabet *a = new Alphabet(reduced_nucleic_cstr);
+  static Alphabet *a = new Alphabet(reduced_nucleic_cstr, reduced_nucleic_reverse_cstr);
+  return *a;
+}
+const Alphabet& Alphabet::dna_alphabet() {
+  static Alphabet *a = new Alphabet(dna_cstr, dna_reverse_cstr);
+  return *a;
+}
+const Alphabet& Alphabet::rna_alphabet() {
+  static Alphabet *a = new Alphabet(rna_cstr, rna_reverse_cstr);
    return *a;
  }
  const Alphabet& Alphabet::nucleic_alphabet() {
-  static Alphabet *a = new Alphabet(nucleic_cstr);
+  static Alphabet *a = new Alphabet(nucleic_cstr, nucleic_reverse_cstr);
    return *a;
  }
  const Alphabet& Alphabet::protein_alphabet() {
-  static Alphabet *a = new Alphabet(protein_cstr);
+  static Alphabet *a = new Alphabet(protein_cstr, protein_cstr);
    return *a;
  }
  const Alphabet& Alphabet::empty_alphabet() {
-  static Alphabet *a = new Alphabet(empty_cstr);
+  static Alphabet *a = new Alphabet(empty_cstr, empty_cstr);
    return *a;
  }
  
-Alphabet::Alphabet(const char *a) :
-  alphabet(a)
+Alphabet::Alphabet(const char *a, const char *reverse_alphabet) :
+  alphabet(a),
+  complement_map(create_complement_map(reverse_alphabet))
  {
    alphabet_set.insert(alphabet.begin(), alphabet.end());  
  } 
@@ -50,6 +73,7 @@ Alphabet::Alphabet(const char *a) :
  void Alphabet::assign(const Alphabet& a)
  {
    alphabet = a.alphabet;
+  complement_map = a.complement_map;
    alphabet_set.clear();
    alphabet_set.insert(alphabet.begin(), alphabet.end());
  }
@@ -78,12 +102,47 @@ const Alphabet& Alphabet::get_alphabet(AlphabetRef alpha)
        return Alphabet::reduced_rna_alphabet();
      case ::reduced_nucleic_alphabet:
        return Alphabet::reduced_nucleic_alphabet();
+    case ::dna_alphabet:
+      return Alphabet::dna_alphabet();
+    case ::rna_alphabet:
+      return Alphabet::rna_alphabet();
      case ::nucleic_alphabet:
        return Alphabet::nucleic_alphabet();
      case ::protein_alphabet:
-      return Alphabet::protein_alphabet();    
+      return Alphabet::protein_alphabet();  
+    case ::empty_alphabet:
+      return Alphabet::empty_alphabet();  
      default:
        throw std::runtime_error("unrecognized alphabet type");
        break;
    }
+}
+
+std::string Alphabet::create_complement_map(const std::string& ra) const
+{
+  if (ra.size() == 0) {
+    // reverse complement is undefined
+    throw mussa_error("no reverse complement");
+  }
+  std::string rc_map(256,'~');
+  
+  for(int i = 0; alphabet[i] and ra[i]; ++i) {
+    rc_map[ alphabet[i] ] = ra[i];    
+  }
+  return rc_map;
+}
+
+boost::shared_ptr<std::string> Alphabet::reverse_complement(const std::string &seq) const
+{
+  boost::shared_ptr<std::string> new_seq(new std::string());
+  
+  new_seq->reserve(seq.size());
+  
+  for(std::string::const_reverse_iterator seq_i = seq.rbegin();
+      seq_i != seq.rend();
+      ++seq_i)
+  {
+    new_seq->append(1, complement_map[ *seq_i ]);
+  }
+  return new_seq;
  }
 \ No newline at end of file
diff --git a/alg/alphabet.hpp b/alg/alphabet.hpp

index 513ea767b11081a9ea30c27d877f2c7d44f92306..7652ddfb08b27daa428a58eed9e691c7fb7bcb57 100644 (file)
--- a/alg/alphabet.hpp
+++ b/alg/alphabet.hpp
@@ -7,12 +7,16 @@
  #include <boost/serialization/utility.hpp>
  #include <boost/serialization/version.hpp>
  
+#include <boost/shared_ptr.hpp>
+
  #include <set>
  #include <ostream>
  
  //! this is a helper class for sequence
-enum AlphabetRef { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, 
-                   nucleic_alphabet, protein_alphabet, empty_alphabet=255 };
+enum AlphabetRef { reduced_dna_alphabet,     dna_alphabet, 
+                   reduced_rna_alphabet,     rna_alphabet,
+                   reduced_nucleic_alphabet, nucleic_alphabet, 
+                   protein_alphabet, empty_alphabet=255 };
                     
  class Alphabet {
  friend class Sequence;
@@ -28,19 +32,39 @@ public:
    
    //! return an alphabet given an AlphabetRef enumeration
    static const Alphabet &get_alphabet(AlphabetRef);
+  //! return a map to reverse complement an symbols from a nucleic alphabet
+  std::string create_complement_map(const std::string &) const;
+  //! return compelement map
+  std::string get_complement_map() const { return complement_map; }
+  
+  //! return a pointer to a reverse complemented string
+  boost::shared_ptr<std::string> reverse_complement(const std::string &) const;
+  
    // note, if you want to define an alphabet for a sequence, you probably want 
    // to update the enumeration in Sequence, and Sequence::get_sequence
    //! The standard DNA alphabet, with unique, and unknown characters
    static const char *reduced_dna_cstr;
+  static const char *reduced_dna_reverse_cstr;
    static const Alphabet &reduced_dna_alphabet();
    //! The standard RNA alphabet, with unique, and unknown characters
    static const char *reduced_rna_cstr;
+  static const char *reduced_rna_reverse_cstr;
    static const Alphabet &reduced_rna_alphabet();
-  //! The standard DNA/RNA alphabet, with unique, and unknown characters
+  //! The full IUPAC DNA alphabet, with unique, and unknown characters
+  static const char *dna_cstr;
+  static const char *dna_reverse_cstr;
+  static const Alphabet &dna_alphabet();
+  //! the full IUPAC RNA alphabet
+  static const char *rna_cstr;
+  static const char *rna_reverse_cstr;
+  static const Alphabet &rna_alphabet();
+  //! reduced (DNA/RNA) nucelic alphabet
    static const char *reduced_nucleic_cstr;
+  static const char *reduced_nucleic_reverse_cstr;
    static const Alphabet &reduced_nucleic_alphabet();
-  //! this is the general IUPAC alphabet for nucleotides
+  //! reduced (DNA/RNA) nucelic alphabet
    static const char *nucleic_cstr;
+  static const char *nucleic_reverse_cstr;
    static const Alphabet &nucleic_alphabet();
    //! the protein alphabet
    static const char *protein_cstr;  
@@ -52,11 +76,12 @@ public:
  private:
    //! what are allowable symbols in our alphabet
    std::string alphabet;
+  std::string complement_map;
    //! internal variable to make exists() faster
    std::set<std::string::value_type> alphabet_set;
    
    //! some necessary string api access
-  Alphabet(const char *a);
+  Alphabet(const char *a, const char *reverse_a);
    //! allow sequence to copy one alphabet to another (needed when unserializing) 
    void assign(const Alphabet& a);
    const_iterator begin() const { return alphabet.begin(); }
@@ -67,6 +92,7 @@ private:
    template<class Archive>
    void serialize(Archive& ar, const unsigned int /*version*/) {
      ar & BOOST_SERIALIZATION_NVP(alphabet);
+    ar & BOOST_SERIALIZATION_NVP(complement_map);
      alphabet_set.clear();
      alphabet_set.insert(alphabet.begin(), alphabet.end());
    }   
diff --git a/alg/flp_seqcomp.cpp b/alg/flp_seqcomp.cpp

index 1a1ea89c93bec9ee6917a91a40cbc3cd57e3e9eb..995a228ede7cc203a6a6c60a80342d8d2667e419 100644 (file)
--- a/alg/flp_seqcomp.cpp
+++ b/alg/flp_seqcomp.cpp
@@ -61,8 +61,10 @@ FLPs::seqcomp(const Sequence& sseq1, const Sequence& sseq2, bool is_RC)
    int start_i, seq1_i, seq2_i, win_i;      // loop variables
    int matches;                    // number of matches in to a window
    int i2_offset;
-  const char *seq1 = sseq1.data();
-  const char *seq2 = sseq2.data();
+  std::string sseq1_str(sseq1.get_sequence());
+  std::string sseq2_str(sseq2.get_sequence());
+  const char *seq1 = sseq1_str.c_str();
+  const char *seq2 = sseq2_str.c_str();
  
    validate_sequence(sseq1);
    validate_sequence(sseq2);
diff --git a/alg/motif_parser.cpp b/alg/motif_parser.cpp

index c59c2873e160d697ab46d2b08510d1d985f905cf..5ba355028f5fc699c23d991bde0a9caf706bef8b 100644 (file)
--- a/alg/motif_parser.cpp
+++ b/alg/motif_parser.cpp
@@ -57,7 +57,7 @@ void motif_parser::push_motif::operator()(
    Iterator end) const
  {
    float red, green, blue, alpha;
-  Sequence seq(parsed->sequence, nucleic_alphabet);
+  Sequence seq(parsed->sequence, dna_alphabet, SeqSpan::PlusStrand);
    seq.set_fasta_header(parsed->name);
    
    alpha = 1.0;
@@ -94,7 +94,7 @@ motif_parser::ParsedMotifs::ParsedMotifs(
  
  void motif_parser::ParsedMotifs::parse(const std::string &data)
  {
-  const char *alphabet = Alphabet::nucleic_cstr;
+  const char *alphabet = Alphabet::dna_cstr;
  
    // parse our string
    spirit::parse_info<std::string::const_iterator> result;
diff --git a/alg/mussa.cpp b/alg/mussa.cpp

index 295fd49be50b76ea04411d551cb4ffbbb602cf30..52d0dce6006b7371ee8dbf2771233ed246c32169 100644 (file)
--- a/alg/mussa.cpp
+++ b/alg/mussa.cpp
@@ -794,7 +794,7 @@ void Mussa::load_motifs(fs::path filename)
  void Mussa::load_motifs(std::istream &in)
  {
    std::string data;
-  const char *alphabet = Alphabet::nucleic_cstr;
+  const char *alphabet = Alphabet::dna_cstr;
    motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper);
  
    // slurp our data into a string
diff --git a/alg/seq.hpp b/alg/seq.hpp

index 5ac42a2bb220ec58630ec4c1138ca6c9ee8c422b..467fcaf9eb7ceafdb50bfd7b34ea118c4c824cf0 100644 (file)
--- a/alg/seq.hpp
+++ b/alg/seq.hpp
@@ -38,17 +38,21 @@ public:
    
    SeqString(AlphabetRef a=reduced_nucleic_alphabet) : 
      std::string(),
-    alphabet(a) 
+    alphabet(a),
+    rc_map(Alphabet::get_alphabet(a).get_complement_map()) 
    {}
    SeqString(const std::string &s, AlphabetRef a=reduced_nucleic_alphabet) : 
      std::string(s),
-    alphabet(a)
+    alphabet(a),
+    rc_map(Alphabet::get_alphabet(a).get_complement_map()) 
    {}
  
    AlphabetRef get_alphabet_ref() { return alphabet; }
    const Alphabet& get_alphabet() const;
    
    std::string create_reverse_complement_map() const;
+  
+  const std::string rc_map;
  private:
    AlphabetRef alphabet;
    friend class boost::serialization::access;
diff --git a/alg/seq_span.cpp b/alg/seq_span.cpp

index 2870f0f077f112f57cc2c60857926e2974af1f4b..bf9a3dc3664bb72d40a613a28521795db1265402 100644 (file)
--- a/alg/seq_span.cpp
+++ b/alg/seq_span.cpp
@@ -9,7 +9,8 @@ SeqSpan::SeqSpan(const SeqSpan &o)
    : seq(o.seq),
      seq_start(o.seq_start),
      seq_count(o.seq_count),
-    parent(o.parent)
+    parent(o.parent),
+    rc_seq(o.rc_seq)
  {
  }
  
@@ -17,20 +18,37 @@ SeqSpan::SeqSpan(const SeqSpan *p)
    : seq(p->seq),
      seq_start(p->seq_start),
      seq_count(p->seq_count),
-    parent(p->parent) 
+    parent(p->parent),
+    rc_seq(p->rc_seq) 
  {
  }
  
  SeqSpan::SeqSpan(const std::string &seq_,
-                 AlphabetRef a)
-  : seq(new SeqString(seq_, a)),
-    seq_start(0),
+                 AlphabetRef a,
+                 strand_type strand_)
+  : seq_start(0),
      seq_count(seq_.length()),
      parent()
  {
+  switch (strand_) {
+  case PlusStrand:
+  case SingleStrand:
+    // do nothing
+    seq_strand = strand_;
+    seq.reset(new SeqString(seq_, a));
+  break;
+  default:
+    throw sequence_invalid_strand(
+            "Can only initialize a Plus or Single strand from a string"
+          );
+    break;
+  }
  }
  
-SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_) 
+SeqSpan::SeqSpan(const SeqSpanRef parent_, 
+                 size_type start_, 
+                 size_type count_,
+                 strand_type strand_) 
    : seq(parent_->seq),
      seq_start(parent_->seq_start + start_),
      parent(parent_)    
@@ -39,6 +57,62 @@ SeqSpan::SeqSpan(const SeqSpanRef parent_, size_type start_, size_type count_)
      seq_count = parent_->seq_count;
    else
      seq_count = count_;
+  
+  // blech this strand logic is too long
+  switch(strand_) {
+  case UnknownStrand:
+  case PlusStrand:
+  case MinusStrand:
+  case BothStrand:
+    seq_strand = strand_;
+  break;
+  case SameStrand:
+    if (parent) {
+      seq_strand = parent->strand();
+    } else {
+      throw sequence_invalid_strand("SameStrand is undefined with no parent");
+    }
+  break;
+  case OppositeStrand:
+    if (parent) {
+      switch (parent->strand()) {
+      case PlusStrand:
+        seq_strand = MinusStrand;
+      break;
+      case MinusStrand:
+        seq_strand = PlusStrand;
+      break;
+      case UnknownStrand:
+      case BothStrand:
+        seq_strand = parent->strand();
+      break;
+      case SingleStrand:
+        throw sequence_invalid_strand(
+                "OppositeStrand is undefined with SingleStrands"
+              );
+        break;
+      default:
+        throw sequence_invalid_strand("Parent has an invalid strand type");
+      }
+    } else {
+      throw sequence_invalid_strand("SameStrand is undefined with no parent");
+    }
+  break;
+  case SingleStrand:
+    if (parent) {
+      if (parent->strand() == SingleStrand) {
+        seq_strand = SingleStrand;
+      } else {
+        throw  sequence_invalid_strand("Can't change single strandedness");
+      }
+    } else {
+      seq_strand = SingleStrand;
+    }      
+  break;
+  default:
+    throw sequence_invalid_strand("unrecognized strand identifier");
+    break;
+  }
  }
  
  //////
@@ -50,6 +124,7 @@ SeqSpan &SeqSpan::operator=(const SeqSpan& s)
      seq_start = s.seq_start;
      seq_count = s.seq_count;
      parent = s.parent;
+    rc_seq = s.rc_seq;
    }
    return *this;
  }
@@ -76,17 +151,26 @@ bool operator<(const SeqSpan& a, const SeqSpan& b)
    }
  }
  */
-#include <iostream>
+
  bool operator==(const SeqSpan& a, const SeqSpan& b)
  {
    if (SeqSpan::isFamily(a, b)) {
      // can do fast comparison
      if (a.seq_start == b.seq_start and a.seq_count == b.seq_count) {
-      return true;
+      // unknown strands match anything
+      if (a.seq_strand == SeqSpan::UnknownStrand or 
+          b.seq_strand == SeqSpan::UnknownStrand) {
+        return true;
+      } else { 
+        // otherwise our stranded-ness needs to match
+        return (a.seq_strand == b.seq_strand);
+      }
      } else {
        return false;
      }
-  } 
+  }
+  // if we're not part of the same seqspan heirarchy we're never equal
+  // the string equality is handled off in Sequence
    return false;
  }
  
@@ -103,7 +187,11 @@ SeqSpan::const_reference SeqSpan::operator[](SeqSpan::size_type i) const
  SeqSpan::const_reference SeqSpan::at(SeqSpan::size_type i) const
  {
    if (!seq) throw std::out_of_range("empty sequence");
-  return seq->at(i+seq_start);
+  if (seq_strand == PlusStrand) {
+    return seq->at(seq_start+i);
+  } else {
+    return seq->rc_map[seq->at((seq_start+seq_count-1)-i)];
+  }
  }
  
  const char *SeqSpan::data() const
@@ -116,36 +204,62 @@ const char *SeqSpan::data() const
  
  SeqSpan::const_iterator SeqSpan::begin() const
  {
-  if (seq and seq_count != 0)
-    return seq->begin()+seq_start;
-  else 
-    return SeqSpan::const_iterator(0);
+  if (seq and seq_count != 0) {
+    if (seq_strand != MinusStrand) {
+      return seq->begin()+seq_start;
+    } else {
+      if (not rc_seq) {
+        initialize_rc_seq();
+      }
+      return rc_seq->begin();
+    }
+  } 
+  return SeqSpan::const_iterator(0);
  }
  
  SeqSpan::const_iterator SeqSpan::end() const
  {
    if (seq and seq_count != 0) {
-    return seq->begin() + seq_start + seq_count;
-  } else {
-    return SeqSpan::const_iterator(0);
-  }
+    if (seq_strand != MinusStrand) {
+      return seq->begin() + seq_start + seq_count;
+    } else {
+      if (not rc_seq) {
+        initialize_rc_seq();
+      }
+      return rc_seq->end();
+    }
+  } 
+  return SeqSpan::const_iterator(0);
  }
  
  SeqSpan::const_reverse_iterator SeqSpan::rbegin() const
  {
-  if (seq and seq_count != 0)
-    return seq->rbegin()+(seq->size()-(seq_start+seq_count));
-  else 
-    return SeqSpan::const_reverse_iterator();
+  if (seq and seq_count != 0) {
+    if (seq_strand != MinusStrand) {
+      return seq->rbegin()+(seq->size()-(seq_start+seq_count));
+    } else {
+      if (not rc_seq) {
+        initialize_rc_seq();
+      }
+      return rc_seq->rbegin();
+    }
+  } 
+  return SeqSpan::const_reverse_iterator();
  }
  
  SeqSpan::const_reverse_iterator SeqSpan::rend() const
  {
    if (seq and seq_count != 0) {
-    return rbegin() + seq_count;
-  } else {
-    return SeqSpan::const_reverse_iterator();
+    if (seq_strand != MinusStrand) {
+      return rbegin() + seq_count;
+    } else {
+      if (not rc_seq) {
+        initialize_rc_seq();
+      }
+      return rc_seq->rend();
+    }
    }
+  return SeqSpan::const_reverse_iterator();
  }
  
  bool SeqSpan::empty() const
@@ -160,7 +274,7 @@ SeqSpan::size_type SeqSpan::find_first_not_of(
    typedef std::set<std::string::value_type> sequence_set;
    sequence_set match_set;
    
-  for(const_iterator query_item = query.begin();
+  for(std::string::const_iterator query_item = query.begin();
        query_item != query.end();
        ++query_item)
    {
@@ -226,18 +340,23 @@ void SeqSpan::setParentStop(SeqSpan::size_type v)
    setStop(parent->start() + v);
  }
  
-SeqSpanRef SeqSpan::subseq(size_type start, size_type count)
+SeqSpanRef SeqSpan::subseq(size_type start, size_type count, strand_type strand)
  {
    count = std::min<size_type>(count, seq_count - start);
      
-  SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count));
+  SeqSpanRef new_span(new SeqSpan(this->shared_from_this(), start, count, strand));
    return new_span;
  }
  
  std::string SeqSpan::sequence() const
  {
    if (seq) {
-    return seq->substr(seq_start, seq_count);
+    if (seq_strand != MinusStrand) 
+      return seq->substr(seq_start, seq_count);
+    else {
+      initialize_rc_seq();
+      return *rc_seq;
+    } 
    } else { 
      return std::string();
    }
@@ -247,3 +366,19 @@ bool SeqSpan::isFamily(const SeqSpan& a, const SeqSpan& b)
  {
    return a.seq.get() == b.seq.get(); 
  }
+
+void SeqSpan::initialize_rc_seq() const
+{
+  std::string new_rc_seq;
+  std::string rc_map(seq->get_alphabet().get_complement_map());
+  
+  new_rc_seq.reserve(size());
+  size_type i = (seq_start + seq_count);
+  while( i != seq_start ) {
+    --i;
+    new_rc_seq.append(1, rc_map[ seq->at(i) ]);
+  }
+  
+  SeqStringRef new_rc_ref(new SeqString(new_rc_seq, seq->get_alphabet_ref()));
+  const_cast<SeqStringRef *>(&rc_seq)->swap(new_rc_ref);
+}
diff --git a/alg/seq_span.hpp b/alg/seq_span.hpp

index 8d1991360df0bdcebee6e83cab7b926686630a9a..5a2da1268d3a62cf807f323d7e39bb450fea5841 100644 (file)
--- a/alg/seq_span.hpp
+++ b/alg/seq_span.hpp
@@ -35,14 +35,34 @@ public:
    typedef SeqString::size_type size_type;
    typedef SeqString::value_type value_type;
    static const size_type npos = SeqString::npos;
+  //! Define strand types
+  /**!
+   *  Unknown strand is treated as "either" strand
+   *  Plus refers to the initially created strand
+   *  Minus is the opposite strand
+   *  Both is for any feature that applies to "both" strands
+   *    (which may not actually be useful) 
+   *  Same strand is only used when creating a subsequence
+   *    and implies the subsequence has the same orientation as the parent
+   *  Opposite is only used for creating a subsequence
+   *    and implies the subsequence has the opposite orientation as the parent
+   *  Single indicates that this is single stranded and there can't be
+   *    an opposite strand.
+   */ 
+  enum strand_type { UnknownStrand, MinusStrand, PlusStrand, 
+                     BothStrand, SameStrand, OppositeStrand, SingleStrand };
  
  public:
    SeqSpan(const SeqSpan &);
    SeqSpan(const SeqSpan *);
    explicit SeqSpan(const std::string &, 
-               AlphabetRef = reduced_nucleic_alphabet
+               AlphabetRef a = dna_alphabet,
+               strand_type strand=PlusStrand
             );
-  SeqSpan(const SeqSpanRef, size_type start=0, size_type count=npos);
+  SeqSpan(const SeqSpanRef, 
+          size_type start=0, 
+          size_type count=npos, 
+          strand_type strand=SameStrand);
  
    //! assignment
    SeqSpan& operator=(const SeqSpan&);
@@ -87,6 +107,7 @@ public:
    size_type stop() const { return seq_start + seq_count; }
    //! set one past the last position relative to the root sequence.
    void setStop(size_type);
+  strand_type strand() const { return seq_strand; }
    
    //! get start position relative to the parent sequence
    size_type parentStart() const;
@@ -100,12 +121,17 @@ public:
    
  
    //! return a subsequence, copying over any appropriate annotation
-  SeqSpanRef subseq(size_type start=0, size_type count = std::string::npos);
+  SeqSpanRef subseq(size_type start=0, 
+                    size_type count = std::string::npos, 
+                    strand_type = PlusStrand);
    //! get sequence
    std::string sequence() const; 
    //! are both sequences derived from the same sequence tree?
    static bool isFamily(const SeqSpan& a, const SeqSpan& b);
    
+  //! fill in our rc_seq variable 
+  void initialize_rc_seq() const;
+  
    friend class Sequence;
  private:
    //! do not statically initialize, only create with new
@@ -117,11 +143,15 @@ protected:
    size_type seq_start;
    //! how big we ware
    size_type seq_count;  
-  // Do I need to track the strand here?
+  //! strand orientation
+  strand_type seq_strand;
    
    //! keep a reference to who our parent span is 
    SeqSpanRef parent;
  
+  //! hold a reverse complement version of our sequence if needed
+  SeqStringRef rc_seq;
+  
    // boost::serialization support
    friend class boost::serialization::access;
    template<class Archive>
@@ -129,6 +159,7 @@ protected:
      ar & BOOST_SERIALIZATION_NVP(seq);
      ar & BOOST_SERIALIZATION_NVP(seq_start);
      ar & BOOST_SERIALIZATION_NVP(seq_count);
+    ar & BOOST_SERIALIZATION_NVP(seq_strand);
      ar & BOOST_SERIALIZATION_NVP(parent);
    }
  };
diff --git a/alg/sequence.cpp b/alg/sequence.cpp

index 47cca2a7dfdfb46c831491ee276f20ec5dd6f4a0..68fa01460542956a6dabf5864786a50f47080737 100644 (file)
--- a/alg/sequence.cpp
+++ b/alg/sequence.cpp
@@ -80,8 +80,7 @@ motif::~motif()
  
  
  Sequence::Sequence(AlphabetRef alphabet)
-  : seq(new SeqSpan("", alphabet)),
-    strand(UnknownStrand)
+  : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand))
  {
  }
  
@@ -89,25 +88,24 @@ Sequence::~Sequence()
  {
  }
  
-Sequence::Sequence(const char *seq, AlphabetRef alphabet_)
-  : strand(UnknownStrand),
-    header(""),
+Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
+  : header(""),
      species("")
  {
-  set_filtered_sequence(seq, alphabet_);
+  set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
  }
  
-Sequence::Sequence(const std::string& seq, AlphabetRef alphabet_) 
-  : strand(UnknownStrand),
-    header(""),
+Sequence::Sequence(const std::string& seq, 
+                   AlphabetRef alphabet_,
+                   SeqSpan::strand_type strand_) 
+  : header(""),
      species("")
  {
-  set_filtered_sequence(seq, alphabet_);
+  set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
  }
  
  Sequence::Sequence(const Sequence& o)
    : seq(o.seq),
-    strand(o.strand),
      header(o.header),
      species(o.species),
      annots(o.annots),
@@ -117,7 +115,6 @@ Sequence::Sequence(const Sequence& o)
  
  Sequence::Sequence(const Sequence* o)
    : seq(o->seq),
-    strand(o->strand),
      header(o->header),
      species(o->species),
      annots(o->annots),
@@ -127,7 +124,6 @@ Sequence::Sequence(const Sequence* o)
  
  Sequence::Sequence(const SeqSpanRef& seq_ref)
    : seq(seq_ref),
-    strand(UnknownStrand),
      header(""),
      species("")
  {
@@ -137,7 +133,6 @@ Sequence &Sequence::operator=(const Sequence& s)
  {
    if (this != &s) {
      seq = s.seq;
-    strand = s.strand;
      header = s.header;
      species = s.species;
      annots = s.annots;
@@ -220,7 +215,7 @@ Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
    std::string rev_comp;
    std::string sequence_raw;
    std::string seq_tmp;      // holds sequence during basic filtering
-  const Alphabet &alpha = get_alphabet();
+  const Alphabet &alpha = Alphabet::get_alphabet(a);
  
    if (seq_num == 0) {
      throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
@@ -273,7 +268,7 @@ Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
        std::string msg("The selected sequence appears to be empty"); 
        throw sequence_empty_error(msg);
      }
-    set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
+    set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
    } else {
      std::string errormsg("There were no fasta sequences");
      throw sequence_empty_file_error(errormsg);
@@ -284,7 +279,7 @@ void Sequence::set_filtered_sequence(const std::string &in_seq,
                                       AlphabetRef alphabet_,
                                       size_type start,
                                       size_type count,
-                                     strand_type strand_)
+                                     SeqSpan::strand_type strand_)
  {
    if ( count == npos)
      count = in_seq.size() - start;
@@ -302,9 +297,8 @@ void Sequence::set_filtered_sequence(const std::string &in_seq,
        new_seq.append(1, 'N');
      }
    }
-  SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_)); 
+  SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_)); 
    seq = new_seq_ref;
-  strand = strand_;
  }
  
  void
@@ -547,15 +541,16 @@ void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count
    
  }
  Sequence
-Sequence::subseq(size_type start, size_type count) const
+Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
  {
+  // FIXME: should i really allow a subsequence of an empty sequence?
    if (!seq) {
      Sequence new_seq;
      return new_seq;
    }
  
    Sequence new_seq = *this;
-  new_seq.seq = seq->subseq(start, count);
+  new_seq.seq = seq->subseq(start, count, strand);
  
    copy_children(new_seq, start, count);
    
@@ -566,21 +561,8 @@ Sequence::subseq(size_type start, size_type count) const
  // FIXME: This needs to be moved into SeqSpan
  Sequence Sequence::rev_comp() const
  {
-  std::string rev_comp;
-  rev_comp.reserve(length());
-  
-  std::string rc_map = seq->seq->create_reverse_complement_map();
-
-  // reverse and convert
-  Sequence::const_reverse_iterator seq_i;
-  Sequence::const_reverse_iterator seq_end = rend();  
-  for(seq_i = rbegin(); 
-      seq_i != seq_end;
-      ++seq_i)
-  {
-    rev_comp.append(1, rc_map[*seq_i]);
-  }
-  return Sequence(rev_comp, seq->seq->get_alphabet_ref()); 
+  // a reverse complement is the whole opposite strand 
+  return subseq(0, npos, SeqSpan::OppositeStrand);
  }
  
  const Alphabet& Sequence::get_alphabet() const
@@ -623,7 +605,7 @@ Sequence::get_name() const
  
  void Sequence::set_sequence(const std::string& s, AlphabetRef a) 
  {
-  set_filtered_sequence(s, a);
+  set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
  }
  
  std::string Sequence::get_sequence() const
@@ -640,7 +622,6 @@ void
  Sequence::clear()
  {
    seq.reset();
-  strand = UnknownStrand;
    header.clear();
    species.clear();
    annots.clear();
@@ -696,7 +677,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
    }
    getline(load_file, file_data_line);
    // looks like the sequence is written as a single line
-  set_filtered_sequence(file_data_line, reduced_dna_alphabet);
+  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
    getline(load_file, file_data_line);
    getline(load_file, file_data_line);
    if (file_data_line == "<Annotations>")
diff --git a/alg/sequence.hpp b/alg/sequence.hpp

index 4b93ef132d81b93e924e153965407b32585c7851..80aa49f75e17bcf9ca572de2d9b8c53658ea31af 100644 (file)
--- a/alg/sequence.hpp
+++ b/alg/sequence.hpp
@@ -103,13 +103,14 @@ public:
    typedef SeqString::const_reference const_reference;
    typedef SeqString::size_type size_type;
    static const size_type npos = SeqString::npos;
-  enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand };
                        
-  Sequence(AlphabetRef a = reduced_nucleic_alphabet);
+  Sequence(AlphabetRef a = reduced_dna_alphabet);
    Sequence(const char* seq, 
-           AlphabetRef a = reduced_nucleic_alphabet);
+           AlphabetRef a = reduced_dna_alphabet,
+           SeqSpan::strand_type strand = SeqSpan::PlusStrand);
    Sequence(const std::string& seq,
-           AlphabetRef a = reduced_nucleic_alphabet);
+           AlphabetRef a = reduced_dna_alphabet,
+           SeqSpan::strand_type strand = SeqSpan::PlusStrand);
    Sequence(const Sequence& seq);
    Sequence(const Sequence *);
    Sequence(const SeqSpanRef&); 
@@ -125,10 +126,10 @@ public:
  
    //! set sequence to a (sub)string containing nothing but AGCTN
    void set_filtered_sequence(const std::string& seq,
-                             AlphabetRef a, 
+                             AlphabetRef a=reduced_dna_alphabet, 
                               size_type start=0,
                               size_type count=npos,
-                             strand_type strand=UnknownStrand);
+                             SeqSpan::strand_type strand=SeqSpan::PlusStrand);
  
    //! retrive element at specific position
    const_reference at(size_type i) const { return seq->at(i); }
@@ -159,7 +160,9 @@ public:
    size_type stop() const { return seq->parentStop(); }
  
    //! return a subsequence, copying over any appropriate annotation
-  Sequence subseq(size_type start=0, size_type count = npos) const;
+  Sequence subseq(size_type start=0, 
+                  size_type count = npos,
+                  SeqSpan::strand_type strand = SeqSpan::SameStrand) const;
    //! reverse a character
    std::string create_reverse_map() const;
    //! return a reverse compliment (this needs to be improved?)
@@ -234,8 +237,6 @@ public:
    
  protected:  
    SeqSpanRef seq;
-  //! strand orientation
-  strand_type strand;
    //! fasta header
    std::string header;
    //! species name
@@ -259,7 +260,6 @@ protected:
    template<class Archive>
    void serialize(Archive& ar, const unsigned int /*version*/) {
      ar & BOOST_SERIALIZATION_NVP(seq);
-    ar & BOOST_SERIALIZATION_NVP(strand);
      ar & BOOST_SERIALIZATION_NVP(header);
      ar & BOOST_SERIALIZATION_NVP(species);
      ar & BOOST_SERIALIZATION_NVP(annots);
diff --git a/alg/test/test_alphabet.cpp b/alg/test/test_alphabet.cpp

index 5f7d9f3ae03f8bab8b6a82e235b515881f92001a..854c1b3b24c01f189a5ea0326bec63b12c6f9728 100644 (file)
--- a/alg/test/test_alphabet.cpp
+++ b/alg/test/test_alphabet.cpp
@@ -26,4 +26,150 @@ BOOST_AUTO_TEST_CASE( alphabet_equality)
  {
    Alphabet a(Alphabet::reduced_dna_alphabet());
    BOOST_CHECK_EQUAL( a, Alphabet::reduced_dna_alphabet() );
+}
+
+/*
+BOOST_AUTO_TEST_CASE( alphabet_rc_invalid )
+{
+  Alphabet a1 = Alphabet::get_alphabet(empty_alphabet);
+  BOOST_CHECK_THROW(a1.get_complement_map(), mussa_error);
+
+  Alphabet a2 = Alphabet::get_alphabet(protein_alphabet);
+  BOOST_CHECK_THROW(a2.create_complement_map(), mussa_error);
+}
+*/
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_dna)
+{
+  Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet);
+  
+  std::string rc_map = a.get_complement_map();
+  BOOST_CHECK_EQUAL(rc_map['A'], 'T');
+  BOOST_CHECK_EQUAL(rc_map['a'], 't');
+  BOOST_CHECK_EQUAL(rc_map['T'], 'A');
+  BOOST_CHECK_EQUAL(rc_map['t'], 'a');
+  BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+  BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+  BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+  BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+  BOOST_CHECK_EQUAL(rc_map['U'], '~');
+  BOOST_CHECK_EQUAL(rc_map['u'], '~');
+  BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+  BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_reduced_rna)
+{
+  Alphabet a = Alphabet::get_alphabet(reduced_rna_alphabet);
+  
+  std::string rc_map = a.get_complement_map();
+  BOOST_CHECK_EQUAL(rc_map['A'], 'U');
+  BOOST_CHECK_EQUAL(rc_map['a'], 'u');
+  BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+  BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+  BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+  BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+  BOOST_CHECK_EQUAL(rc_map['T'], '~');
+  BOOST_CHECK_EQUAL(rc_map['t'], '~');
+  BOOST_CHECK_EQUAL(rc_map['U'], 'A');
+  BOOST_CHECK_EQUAL(rc_map['u'], 'a');
+  BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+  BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_dna)
+{
+  Alphabet a = Alphabet::get_alphabet(dna_alphabet);
+  
+  std::string rc_map = a.get_complement_map();
+  BOOST_CHECK_EQUAL(rc_map['A'], 'T');
+  BOOST_CHECK_EQUAL(rc_map['a'], 't');
+  BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+  BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+  BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+  BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+  BOOST_CHECK_EQUAL(rc_map['T'], 'A');
+  BOOST_CHECK_EQUAL(rc_map['t'], 'a');
+  BOOST_CHECK_EQUAL(rc_map['R'], 'Y');
+  BOOST_CHECK_EQUAL(rc_map['r'], 'y');
+  BOOST_CHECK_EQUAL(rc_map['Y'], 'R');
+  BOOST_CHECK_EQUAL(rc_map['y'], 'r');
+  BOOST_CHECK_EQUAL(rc_map['S'], 'S');
+  BOOST_CHECK_EQUAL(rc_map['s'], 's');
+  BOOST_CHECK_EQUAL(rc_map['W'], 'W');
+  BOOST_CHECK_EQUAL(rc_map['w'], 'w');
+  BOOST_CHECK_EQUAL(rc_map['K'], 'M');
+  BOOST_CHECK_EQUAL(rc_map['k'], 'm');
+  BOOST_CHECK_EQUAL(rc_map['M'], 'K');
+  BOOST_CHECK_EQUAL(rc_map['m'], 'k');
+  BOOST_CHECK_EQUAL(rc_map['B'], 'V');
+  BOOST_CHECK_EQUAL(rc_map['b'], 'v');
+  BOOST_CHECK_EQUAL(rc_map['V'], 'B');
+  BOOST_CHECK_EQUAL(rc_map['v'], 'b');
+  BOOST_CHECK_EQUAL(rc_map['D'], 'H');
+  BOOST_CHECK_EQUAL(rc_map['d'], 'h');
+  BOOST_CHECK_EQUAL(rc_map['H'], 'D');
+  BOOST_CHECK_EQUAL(rc_map['h'], 'd');
+  BOOST_CHECK_EQUAL(rc_map['N'], 'N');
+  BOOST_CHECK_EQUAL(rc_map['n'], 'n');
+
+  BOOST_CHECK_EQUAL(rc_map['U'], '~');
+  BOOST_CHECK_EQUAL(rc_map['u'], '~');
+  BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+  BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+BOOST_AUTO_TEST_CASE( alphabet_rc_rna)
+{
+  Alphabet a = Alphabet::get_alphabet(rna_alphabet);
+  
+  std::string rc_map = a.get_complement_map();
+  BOOST_CHECK_EQUAL(rc_map['A'], 'U');
+  BOOST_CHECK_EQUAL(rc_map['a'], 'u');
+  BOOST_CHECK_EQUAL(rc_map['C'], 'G');
+  BOOST_CHECK_EQUAL(rc_map['c'], 'g');
+  BOOST_CHECK_EQUAL(rc_map['G'], 'C');
+  BOOST_CHECK_EQUAL(rc_map['g'], 'c');
+  BOOST_CHECK_EQUAL(rc_map['U'], 'A');
+  BOOST_CHECK_EQUAL(rc_map['u'], 'a');
+  BOOST_CHECK_EQUAL(rc_map['R'], 'Y');
+  BOOST_CHECK_EQUAL(rc_map['r'], 'y');
+  BOOST_CHECK_EQUAL(rc_map['Y'], 'R');
+  BOOST_CHECK_EQUAL(rc_map['y'], 'r');
+  BOOST_CHECK_EQUAL(rc_map['S'], 'S');
+  BOOST_CHECK_EQUAL(rc_map['s'], 's');
+  BOOST_CHECK_EQUAL(rc_map['W'], 'W');
+  BOOST_CHECK_EQUAL(rc_map['w'], 'w');
+  BOOST_CHECK_EQUAL(rc_map['K'], 'M');
+  BOOST_CHECK_EQUAL(rc_map['k'], 'm');
+  BOOST_CHECK_EQUAL(rc_map['M'], 'K');
+  BOOST_CHECK_EQUAL(rc_map['m'], 'k');
+  BOOST_CHECK_EQUAL(rc_map['B'], 'V');
+  BOOST_CHECK_EQUAL(rc_map['b'], 'v');
+  BOOST_CHECK_EQUAL(rc_map['V'], 'B');
+  BOOST_CHECK_EQUAL(rc_map['v'], 'b');
+  BOOST_CHECK_EQUAL(rc_map['D'], 'H');
+  BOOST_CHECK_EQUAL(rc_map['d'], 'h');
+  BOOST_CHECK_EQUAL(rc_map['H'], 'D');
+  BOOST_CHECK_EQUAL(rc_map['h'], 'd');
+  BOOST_CHECK_EQUAL(rc_map['N'], 'N');
+  BOOST_CHECK_EQUAL(rc_map['n'], 'n');
+
+  BOOST_CHECK_EQUAL(rc_map['T'], '~');
+  BOOST_CHECK_EQUAL(rc_map['t'], '~');
+  BOOST_CHECK_EQUAL(rc_map['Q'], '~');
+  BOOST_CHECK_EQUAL(rc_map['*'], '~');
+}
+
+
+//enum AlphabetRef { reduced_nucleic_alphabet,   nucleic_alphabet, };
+BOOST_AUTO_TEST_CASE( alphabet_reverse_complement )
+{
+  Alphabet a = Alphabet::get_alphabet(reduced_dna_alphabet);
+  std::string seq("AAAAGCT");
+  std::string known_rc_seq("AGCTTTT");
+  boost::shared_ptr<std::string> new_seq(a.reverse_complement(seq));
+  
+  BOOST_CHECK_EQUAL(*new_seq, known_rc_seq);
+  
  }
 \ No newline at end of file
diff --git a/alg/test/test_seq.cpp b/alg/test/test_seq.cpp

index a016cb2c00c6e53bcfe65fc1c854462dccc3fef4..767bc5dd344bd3beaf908464af3784f1fdca8175 100644 (file)
--- a/alg/test/test_seq.cpp
+++ b/alg/test/test_seq.cpp
@@ -21,11 +21,21 @@ BOOST_AUTO_TEST_CASE( seqstring_string )
  }
  
  // such an exciting unit test, making sure that a=b; a==b
-BOOST_AUTO_TEST_CASE( seqstring_string_alphabet )
+BOOST_AUTO_TEST_CASE( seqstring_string_nucleic_alphabet )
  {
    SeqString s("AGCT", nucleic_alphabet);
  
    BOOST_CHECK_EQUAL(s.get_alphabet_ref(), nucleic_alphabet);
    BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::nucleic_alphabet());
    BOOST_CHECK_EQUAL(s.size(), 4);  
+}
+
+// such an exciting unit test, making sure that a=b; a==b
+BOOST_AUTO_TEST_CASE( seqstring_string_dna_alphabet )
+{
+  SeqString s("AGCT", dna_alphabet);
+
+  BOOST_CHECK_EQUAL(s.get_alphabet_ref(), dna_alphabet);
+  BOOST_CHECK_EQUAL(s.get_alphabet(), Alphabet::dna_alphabet());
+  BOOST_CHECK_EQUAL(s.size(), 4);  
  }
 \ No newline at end of file
diff --git a/alg/test/test_seq_span.cpp b/alg/test/test_seq_span.cpp

index 90e3d00928fa09ee921cd67157d2bae98cac47a3..4989b6cbc8dc37159ed8604da6b71990a8b1e199 100644 (file)
--- a/alg/test/test_seq_span.cpp
+++ b/alg/test/test_seq_span.cpp
@@ -12,6 +12,7 @@ BOOST_AUTO_TEST_CASE( seqspan_from_string )
    SeqSpanRef span1(new SeqSpan(str1));
    BOOST_CHECK_EQUAL(span1->length(), str1.length());
    BOOST_CHECK_EQUAL(span1->sequence(), str1);
+  BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand);
  }
  
  BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet )
@@ -23,6 +24,39 @@ BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet )
    BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet());
  }
  
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_plusstrand )
+{
+  std::string str1("AAGGCCUU");
+  SeqSpanRef span1(new SeqSpan(str1, reduced_rna_alphabet, SeqSpan::PlusStrand));
+  BOOST_CHECK_EQUAL(span1->length(), str1.length());
+  BOOST_CHECK_EQUAL(span1->sequence(), str1);
+  BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_rna_alphabet());
+  BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::PlusStrand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_alphabet_and_singlestrand )
+{
+  std::string str1("AAAAGCT");
+  SeqSpanRef span1(new SeqSpan(str1, reduced_dna_alphabet, SeqSpan::SingleStrand));
+  BOOST_CHECK_EQUAL(span1->length(), str1.length());
+  BOOST_CHECK_EQUAL(span1->sequence(), str1);
+  BOOST_CHECK_EQUAL(span1->get_alphabet(), Alphabet::reduced_dna_alphabet());
+  // we always store strands as Plus
+  BOOST_CHECK_EQUAL(span1->strand(), SeqSpan::SingleStrand);
+  BOOST_CHECK_EQUAL(span1->sequence(), "AAAAGCT");
+  BOOST_CHECK_THROW(span1->subseq(0,2,SeqSpan::OppositeStrand), sequence_invalid_strand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_from_string_with_invalidstrand )
+{
+  std::string s("AAAAGCT");
+  BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::UnknownStrand), sequence_invalid_strand);
+  BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand);
+  BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::SameStrand), sequence_invalid_strand);
+  BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::OppositeStrand), sequence_invalid_strand);
+  BOOST_CHECK_THROW(SeqSpan(s, reduced_dna_alphabet, SeqSpan::BothStrand), sequence_invalid_strand);
+}
+
  BOOST_AUTO_TEST_CASE( seqspan_from_seqspan )
  {
    std::string str1("AAGGCCTT");
@@ -78,6 +112,13 @@ BOOST_AUTO_TEST_CASE( seqspan_at )
    BOOST_CHECK_EQUAL( str1[2], seq2->at(0) ); 
    BOOST_CHECK_EQUAL( (*seq1)[0], seq1->at(0) );
    BOOST_CHECK_EQUAL( (*seq1)[2], (*seq2)[0] );
+  
+  SeqSpanRef seq3 = seq1->subseq(0, 4, SeqSpan::OppositeStrand);
+  BOOST_CHECK_EQUAL( seq3->at(0), 'C');
+  BOOST_CHECK_EQUAL( seq3->at(1), 'C');
+  BOOST_CHECK_EQUAL( seq3->at(2), 'T');
+  BOOST_CHECK_EQUAL( seq3->at(3), 'T');
+  
  }
  
  BOOST_AUTO_TEST_CASE( seqspan_data ) 
@@ -111,6 +152,21 @@ BOOST_AUTO_TEST_CASE( seqspan_begin_end )
    }   
  }
  
+BOOST_AUTO_TEST_CASE( seqspan_subseq_reverse_begin_end )
+{
+  std::string str1("AAAACCTT");
+  std::string str1rc("AAGGTTTT");
+  SeqSpanRef seq1(new SeqSpan(str1));
+  SeqSpanRef seq2(new SeqSpan(seq1, 0, SeqSpan::npos, SeqSpan::OppositeStrand ));
+  
+  
+  std::string::const_iterator str1rc_i = str1rc.begin();
+  SeqSpan::const_iterator seq2_i = seq2->begin();
+  for(; not ((str1rc_i == str1rc.end()) or (seq2_i == seq2->end())); ++str1rc_i, ++seq2_i) {
+    BOOST_CHECK_EQUAL( *str1rc_i, *seq2_i );
+  }   
+}
+
  BOOST_AUTO_TEST_CASE( seqspan_rbegin_rend )
  {
    std::string str1("AAGGCCTT");
@@ -289,4 +345,37 @@ BOOST_AUTO_TEST_CASE( seqspan_stop_past_end )
    
    s3->setStop(8);
    BOOST_CHECK_EQUAL( s3->size(), 4);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_strand_sameother )
+{
+  SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG"));
+  BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand);
+  
+  SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::SameStrand);
+  BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA");
+  BOOST_CHECK_EQUAL(seq2->strand(), SeqSpan::PlusStrand);
+  SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::OppositeStrand); 
+  BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT");
+  BOOST_CHECK_EQUAL(seq3->strand(), SeqSpan::MinusStrand);
+
+  // opposite of a plus strand should be minus
+  SeqSpanRef seq4 = seq2->subseq(0,4,SeqSpan::OppositeStrand);
+  BOOST_CHECK_EQUAL(seq4->sequence(), "TTTT");
+  BOOST_CHECK_EQUAL(seq4->strand(), SeqSpan::MinusStrand);
+  // opposite of a minus strand should be plus
+  SeqSpanRef seq5 = seq3->subseq(0,4,SeqSpan::OppositeStrand); 
+  BOOST_CHECK_EQUAL(seq5->sequence(), "AAAA");
+  BOOST_CHECK_EQUAL(seq5->strand(), SeqSpan::PlusStrand);
+}
+
+BOOST_AUTO_TEST_CASE( seqspan_strand_plusminus )
+{
+  SeqSpanRef seq1(new SeqSpan("AAAAAGGGGG"));
+  BOOST_CHECK_EQUAL(seq1->strand(), SeqSpan::PlusStrand);
+  
+  SeqSpanRef seq2 = seq1->subseq(0,4,SeqSpan::PlusStrand);
+  BOOST_CHECK_EQUAL(seq2->sequence(), "AAAA");
+  SeqSpanRef seq3 = seq1->subseq(0,4,SeqSpan::MinusStrand); 
+  BOOST_CHECK_EQUAL(seq3->sequence(), "TTTT");
  }
 \ No newline at end of file
diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp

index 6da21f166449c348c7b38c83f5dde803b7d01f54..cf417d95348496025280c6eadf28825ced652f0b 100644 (file)
--- a/alg/test/test_sequence.cpp
+++ b/alg/test/test_sequence.cpp
@@ -374,8 +374,9 @@ BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
  {
    std::string rna_str("AGCUN");
    Sequence rna_seq(rna_str, reduced_rna_alphabet);
-  BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");  
-  BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
+  BOOST_CHECK_EQUAL(rna_seq.rev_comp().get_sequence(), "NAGCU");  
+  BOOST_CHECK_EQUAL(rna_seq.get_sequence(), 
+                    rna_seq.rev_comp().rev_comp().get_sequence());
  }
  
  BOOST_AUTO_TEST_CASE( sequence_reverse_complement_subseq )
diff --git a/mussa_exceptions.hpp b/mussa_exceptions.hpp

index 3c44e6061705d1c370d4158eb8aa70791e6fd23f..1c577f4198bc210986822ccfd47bc1a77bf39810 100644 (file)
--- a/mussa_exceptions.hpp
+++ b/mussa_exceptions.hpp
@@ -71,6 +71,14 @@ public:
      mussa_error(msg) {};
  };
  
+//! Invalid strand identifier
+class sequence_invalid_strand : public mussa_error
+{
+public:
+  explicit sequence_invalid_strand(const std::string& msg) :
+    mussa_error(msg) {};
+};
+
  //! Error loading sequence annotation
  class annotation_load_error : public sequence_load_error
  {
author	Diane Trout <diane@caltech.edu>
	Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)
committer	Diane Trout <diane@caltech.edu>
	Sat, 24 Mar 2007 01:00:49 +0000 (01:00 +0000)
alg/alphabet.cpp		patch \| blob \| history
alg/alphabet.hpp		patch \| blob \| history
alg/flp_seqcomp.cpp		patch \| blob \| history
alg/motif_parser.cpp		patch \| blob \| history
alg/mussa.cpp		patch \| blob \| history
alg/seq.hpp		patch \| blob \| history
alg/seq_span.cpp		patch \| blob \| history
alg/seq_span.hpp		patch \| blob \| history
alg/sequence.cpp		patch \| blob \| history
alg/sequence.hpp		patch \| blob \| history
alg/test/test_alphabet.cpp		patch \| blob \| history
alg/test/test_seq.cpp		patch \| blob \| history
alg/test/test_seq_span.cpp		patch \| blob \| history
alg/test/test_sequence.cpp		patch \| blob \| history
mussa_exceptions.hpp		patch \| blob \| history