implement alphabet for sequence

author Diane Trout <diane@caltech.edu>

Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)

committer Diane Trout <diane@caltech.edu>

Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)
author Diane Trout <diane@caltech.edu>
Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)
committer Diane Trout <diane@caltech.edu>
Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)
diff --git a/alg/CMakeLists.txt b/alg/CMakeLists.txt

index 5d36611b1abd464b006234ede090853714be18d6..0d6a84cef28b8625aa40a66bd40fd9b898b673bb 100644 (file)
--- a/alg/CMakeLists.txt
+++ b/alg/CMakeLists.txt
@@ -10,7 +10,8 @@ SET(MOC_HEADERS
     )
  QT4_WRAP_CPP(MOC_SOURCES ${MOC_HEADERS})
  
-SET(SOURCES annotation_colors.cpp 
+SET(SOURCES alphabet.cpp
+            annotation_colors.cpp 
              color.cpp 
              conserved_path.cpp 
              flp.cpp 
diff --git a/alg/alphabet.cpp b/alg/alphabet.cpp

new file mode 100644 (file)

index 0000000..bd29dd2
--- /dev/null
+++ b/alg/alphabet.cpp
@@ -0,0 +1,39 @@
+#include "alg/alphabet.hpp"
+
+// some standard dna alphabets 
+// Include nl (\012), and cr (\015) to make sequence parsing eol 
+// convention independent.
+const Alphabet Alphabet::reduced_dna_alphabet("AaCcGgTtNn\012\015");
+const Alphabet Alphabet::reduced_rna_alphabet("AaCcGgUuNn\012\015");
+const Alphabet Alphabet::reduced_nucleic_alphabet("AaCcGgTtUuNn\012\015");
+//! this is the general iupac alphabet for nucleotides
+const Alphabet Alphabet::nucleic_alphabet(
+  "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015"
+);
+//! the protein alphabet
+const Alphabet Alphabet::protein_alphabet(
+  "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"
+);
+
+Alphabet::Alphabet(const char *a) :
+  alphabet(a)
+{
+  alphabet_set.insert(alphabet.begin(), alphabet.end());  
+} 
+
+void Alphabet::assign(const Alphabet& a)
+{
+  alphabet = a.alphabet;
+  alphabet_set.clear();
+  alphabet_set.insert(alphabet.begin(), alphabet.end());
+}
+
+const char *Alphabet::c_str() const
+{
+  alphabet.c_str();
+}
+
+bool Alphabet::exists(const char c) const
+{
+  return (alphabet_set.find(c) != alphabet_set.end());
+}
+\ No newline at end of file
diff --git a/alg/alphabet.hpp b/alg/alphabet.hpp

new file mode 100644 (file)

index 0000000..e7d35c9
--- /dev/null
+++ b/alg/alphabet.hpp
@@ -0,0 +1,59 @@
+#ifndef ALPHABET_HPP_
+#define ALPHABET_HPP_
+
+#include <boost/serialization/export.hpp>
+#include <boost/serialization/nvp.hpp>
+#include <boost/serialization/string.hpp>
+#include <boost/serialization/utility.hpp>
+#include <boost/serialization/version.hpp>
+
+#include <set>
+
+//! this is a helper class for sequence
+class Alphabet {
+friend class Sequence;
+public:
+  typedef std::string::const_iterator const_iterator;
+
+  //! return reference to the characters in our alphabet  
+  const char *c_str() const;
+  //! case-insensitive test to check a character for existence in our alphabet
+  bool exists(const char) const;
+  
+  // note, if you want to define an alphabet for a sequence, you probably want 
+  // to update the enumeration in Sequence, and Sequence::get_sequence
+  //! The standard DNA alphabet, with unique, and unknown characters
+  static const Alphabet reduced_dna_alphabet;
+  //! The standard RNA alphabet, with unique, and unknown characters
+  static const Alphabet reduced_rna_alphabet;
+  //! The standard DNA/RNA alphabet, with unique, and unknown characters
+  static const Alphabet reduced_nucleic_alphabet;
+  //! this is the general IUPAC alphabet for nucleotides
+  static const Alphabet nucleic_alphabet;
+  //! the protein alphabet
+  static const Alphabet protein_alphabet;
+    
+private:
+  //! what are allowable symbols in our alphabet
+  std::string alphabet;
+  //! internal variable to make exists() faster
+  std::set<std::string::value_type> alphabet_set;
+  
+  //! some necessary string api access
+  Alphabet(const char *a);
+  //! allow sequence to copy one alphabet to another (needed when unserializing) 
+  void assign(const Alphabet& a);
+  const_iterator begin() const { return alphabet.begin(); }
+  const_iterator end() const { return alphabet.end(); }
+  
+  // serialization
+  friend class boost::serialization::access;
+  template<class Archive>
+  void serialize(Archive& ar, const unsigned int /*version*/) {
+    ar & BOOST_SERIALIZATION_NVP(alphabet);
+    alphabet_set.clear();
+    alphabet_set.insert(alphabet.begin(), alphabet.end());
+  }   
+};
+
+#endif /*ALPHABET_*/
diff --git a/alg/mussa.cpp b/alg/mussa.cpp

index 47a6b0fe464e72670a0e55b2726d8001ece47de6..50991c1e1d1411a574f9663b6f43ffbacfa8a0ef 100644 (file)
--- a/alg/mussa.cpp
+++ b/alg/mussa.cpp
@@ -792,7 +792,7 @@ struct push_back_motif {
    void operator()(std::string::const_iterator, 
                    std::string::const_iterator) const 
    {
-    Sequence seq(seq_string);
+    Sequence seq(seq_string, Sequence::nucleic_alphabet);
      // shouldn't we have a better field than "fasta header" and speices?
      seq.set_fasta_header(name);
      // we need to clear the name in case the next motif doesn't have one.
@@ -809,7 +809,7 @@ struct push_back_motif {
  void Mussa::load_motifs(std::istream &in)
  {
    std::string data;
-  const char *alphabet = Sequence::nucleic_iupac_alphabet.c_str();
+  const char *alphabet = Alphabet::nucleic_alphabet.c_str();
    string seq;
    string name;
    float red = 1.0;
diff --git a/alg/sequence.cpp b/alg/sequence.cpp

index 845a01617016dd17dfcf7e8c96bc55d0fa43acd8..ae58e5c87ed2da52f47193c679a5377b41aa03fe 100644 (file)
--- a/alg/sequence.cpp
+++ b/alg/sequence.cpp
@@ -34,8 +34,10 @@ namespace spirit = boost::spirit;
  #include "mussa_exceptions.hpp"
  
  #include <string>
+#include <stdexcept>
  #include <iostream>
  #include <sstream>
+#include <set>
  
  annot::annot() 
   : begin(0),
@@ -75,15 +77,10 @@ motif::~motif()
  {
  }
  
-const std::string Sequence::dna_alphabet("AaCcGgTtNn\012\015");
-const std::string Sequence::rna_alphabet("AaCcGgNnUu\012\015");
-  //! this is the general iupac alphabet for nucleotides
-const std::string Sequence::nucleic_iupac_alphabet("AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015");
-  //! the protein alphabet
-const std::string Sequence::protein_alphabet("AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015");
  
-Sequence::Sequence()
+Sequence::Sequence(alphabet_ref alphabet_)
    : parent(0),
+    alphabet(alphabet_),
      seq_start(0),
      seq_count(0),
      strand(UnknownStrand)
@@ -94,31 +91,34 @@ Sequence::~Sequence()
  {
  }
  
-Sequence::Sequence(const char *seq)
+Sequence::Sequence(const char *seq, alphabet_ref alphabet_)
    : parent(0),
+    alphabet(alphabet_),
      seq_start(0),
      seq_count(0),
      strand(UnknownStrand),
      header(""),
      species("")
  {
-  set_filtered_sequence(seq);
+  set_filtered_sequence(seq, alphabet);
  }
  
-Sequence::Sequence(const std::string& seq) 
+Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_) 
    : parent(0),
+    alphabet(alphabet_),
      seq_start(0),
      seq_count(0),
      strand(UnknownStrand),
      header(""),
      species("")
  {
-  set_filtered_sequence(seq);
+  set_filtered_sequence(seq, alphabet);
  }
  
  Sequence::Sequence(const Sequence& o)
    : parent(o.parent),
      seq(o.seq),
+    alphabet(o.alphabet),
      seq_start(o.seq_start),
      seq_count(o.seq_count),
      strand(o.strand),
@@ -134,6 +134,7 @@ Sequence &Sequence::operator=(const Sequence& s)
    if (this != &s) {
      parent = s.parent;
      seq = s.seq;
+    alphabet = s.alphabet;
      seq_start = s.seq_start;
      seq_count = s.seq_count;
      strand = s.strand;
@@ -161,16 +162,14 @@ static void multiplatform_getline(std::istream& in, std::string& line)
    }
  }
  
+void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
+{
+  load_fasta(file_path, alphabet, seq_num, start_index, end_index);
+}
+
  //! load a fasta file into a sequence
-/*! 
- * \param file_path the location of the fasta file in the filesystem
- * \param seq_num which sequence in the file to load
- * \param start_index starting position in the fasta sequence, 0 for beginning
- * \param end_index ending position in the fasta sequence, 0 for end
- * \return error message, empty string if no error. (gag!)
- */
-void Sequence::load_fasta(fs::path file_path, int seq_num,
-                          int start_index, int end_index)
+void Sequence::load_fasta(fs::path file_path, alphabet_ref a, 
+                          int seq_num, int start_index, int end_index)
  {
    fs::fstream data_file;
    data_file.open(file_path, std::ios::in);
@@ -180,7 +179,7 @@ void Sequence::load_fasta(fs::path file_path, int seq_num,
      throw mussa_load_error("Sequence File: "+file_path.string()+" not found"); 
    } else {
      try {
-      load_fasta(data_file, seq_num, start_index, end_index);
+      load_fasta(data_file, a, seq_num, start_index, end_index);
      } catch(sequence_empty_error e) {
        // there doesn't appear to be any sequence
        // catch and rethrow to include the filename
@@ -198,8 +197,15 @@ void Sequence::load_fasta(fs::path file_path, int seq_num,
    }
  }
  
+void Sequence::load_fasta(std::iostream& file, 
+                          int seq_num, int start_index, int end_index)
+{
+  load_fasta(file, alphabet, seq_num, start_index, end_index);
+}
+
  void
-Sequence::load_fasta(std::iostream& data_file, int seq_num, 
+Sequence::load_fasta(std::iostream& data_file, alphabet_ref a, 
+                     int seq_num, 
                       int start_index, int end_index)
  {
    std::string file_data_line;
@@ -208,6 +214,7 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num,
    std::string rev_comp;
    std::string sequence_raw;
    std::string seq_tmp;      // holds sequence during basic filtering
+  const Alphabet &alpha = get_alphabet(a);
  
    if (seq_num == 0) {
      throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
@@ -230,7 +237,18 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num,
        multiplatform_getline(data_file,file_data_line);
        if (file_data_line.substr(0,1) == ">")
          read_seq = false;
-      else sequence_raw += file_data_line;
+      else {
+        for (std::string::const_iterator line_i = file_data_line.begin();
+             line_i != file_data_line.end();
+             ++line_i)
+         {
+           if(alpha.exists(*line_i)) {
+             sequence_raw += *line_i;
+           } else {
+            throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
+           }
+         }
+      }
      }
  
      // Lastly, if subselection of the sequence was specified we keep cut out
@@ -244,50 +262,35 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num,
        std::string msg("The selected sequence appears to be empty"); 
        throw sequence_empty_error(msg);
      }
-    set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
+    set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
    } else {
      std::string errormsg("There were no fasta sequences");
      throw sequence_empty_file_error(errormsg);
    }
  }
  
-void Sequence::set_filtered_sequence(const std::string &old_seq, 
+void Sequence::set_filtered_sequence(const std::string &in_seq,
+                                     alphabet_ref alphabet_,
                                       size_type start,
                                       size_type count,
                                       strand_type strand_)
  {
-  char conversionTable[257];
-
+  alphabet = alphabet_;
    if ( count == npos)
-    count = old_seq.size() - start;
+    count = in_seq.size() - start;
    boost::shared_ptr<seq_string> new_seq(new seq_string);
    new_seq->reserve(count);
  
-  // Make a conversion table
-
-  // everything we don't specify below will become 'N'
-  for(int table_i=0; table_i < 256; table_i++)
-  {
-    conversionTable[table_i] = 'N';
-  }
-  // add end of string character for printing out table for testing purposes
-  conversionTable[256] = '\0';
-
-  // we want these to map to themselves - ie not to change
-  conversionTable[(int)'A'] = 'A';
-  conversionTable[(int)'T'] = 'T';
-  conversionTable[(int)'G'] = 'G';
-  conversionTable[(int)'C'] = 'C';
-  // this is to upcase
-  conversionTable[(int)'a'] = 'A';
-  conversionTable[(int)'t'] = 'T';
-  conversionTable[(int)'g'] = 'G';
-  conversionTable[(int)'c'] = 'C';
-
    // finally, the actual conversion loop
-  for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
+  const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets
+  std::string::const_iterator seq_i = in_seq.begin()+start;
+  for(size_type i = 0; i != count; ++i, ++seq_i)
    {
-    new_seq->append(1, conversionTable[ (int)old_seq[seq_index+start]]);
+    if (alpha_impl.exists(*seq_i)) {
+      new_seq->append(1, *seq_i);
+    } else {
+      new_seq->append(1, 'N');
+    }
    }
    parent = 0;
    seq = new_seq;
@@ -450,7 +453,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                        ((spirit::ch_p('>')|spirit::str_p("&gt;")) >> 
                           (*(spirit::print_p))[spirit::assign_a(name)] >>
                           spirit::eol_p >> 
-                         (+(spirit::chset<>(nucleic_iupac_alphabet.c_str())))[spirit::assign_a(seq)]
+                         (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
                         )[push_back_seq(query_seqs, name, seq)]
                        ) >>
                        *spirit::space_p
@@ -523,44 +526,54 @@ Sequence::subseq(int start, int count)
    return new_seq;
  }
  
-std::string
-Sequence::rev_comp() const
+std::string Sequence::create_reverse_map() const 
  {
-  std::string rev_comp;
-  char conversionTable[257];
-  int seq_i, table_i, len;
-
-  len = length();
-  rev_comp.reserve(len);
-  // make a conversion table
-  // init all parts of conversion table to '~' character
-  // '~' I doubt will ever appear in a sequence file (jeez, I hope)
-  // and may the fleas of 1000 camels infest the genitals of any biologist (and
-  // seven generations of their progeny) who decides to make it mean
-  // something special!!!
-  // PS - double the curse for any smartass non-biologist who tries it as well
-  for(table_i=0; table_i < 256; table_i++)
-  {
-    conversionTable[table_i] = '~';
-  }
-  // add end of string character for printing out table for testing purposes
-  conversionTable[256] = '\0';
-
-  // add in the characters for the bases we want to convert
-  conversionTable[(int)'A'] = 'T';
-  conversionTable[(int)'T'] = 'A';
-  conversionTable[(int)'G'] = 'C';
-  conversionTable[(int)'C'] = 'G';
-  conversionTable[(int)'N'] = 'N';
+  std::string rc_map(256, '~');
+  // if we're rna, use U instead of T
+  // we might want to add an "is_rna" to sequence at somepoint
+  char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T';
+  char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't';
+  rc_map['A'] = TU ; rc_map['a'] = tu ;
+  rc_map['T'] = 'A'; rc_map['t'] = 'a';
+  rc_map['U'] = 'A'; rc_map['u'] = 'a';
+  rc_map['G'] = 'C'; rc_map['g'] = 'c';
+  rc_map['C'] = 'G'; rc_map['c'] = 'g';
+  rc_map['M'] = 'K'; rc_map['m'] = 'k';
+  rc_map['R'] = 'Y'; rc_map['r'] = 'y';
+  rc_map['W'] = 'W'; rc_map['w'] = 'w';
+  rc_map['S'] = 'S'; rc_map['s'] = 's';
+  rc_map['Y'] = 'R'; rc_map['y'] = 'r';
+  rc_map['K'] = 'M'; rc_map['k'] = 'm';
+  rc_map['V'] = 'B'; rc_map['v'] = 'b';
+  rc_map['H'] = 'D'; rc_map['h'] = 'd';
+  rc_map['D'] = 'H'; rc_map['d'] = 'h';
+  rc_map['B'] = 'V'; rc_map['b'] = 'v';  
+  rc_map['N'] = 'N'; rc_map['n'] = 'n';
+  rc_map['X'] = 'X'; rc_map['x'] = 'x';
+  rc_map['?'] = '?'; 
+  rc_map['.'] = '.'; 
+  rc_map['-'] = '-'; 
+  rc_map['~'] = '~'; // not really needed, but perhaps it's clearer. 
+  return rc_map;
+}
  
-  // finally, the actual conversion loop
-  for(seq_i = len - 1; seq_i >= 0; seq_i--)
+Sequence Sequence::rev_comp() const
+{
+  std::string rev_comp;
+  rev_comp.reserve(length());
+  
+  std::string rc_map = create_reverse_map();
+
+  // reverse and convert
+  seq_string::const_reverse_iterator seq_i;
+  seq_string::const_reverse_iterator seq_end = seq->rend();  
+  for(seq_i = seq->rbegin(); 
+      seq_i != seq_end;
+      ++seq_i)
    {
-    table_i = (int) at(seq_i);
-    rev_comp += conversionTable[table_i];
+    rev_comp.append(1, rc_map[*seq_i]);
    }
-
-  return rev_comp;
+  return Sequence(rev_comp, alphabet); 
  }
  
  void Sequence::set_fasta_header(std::string header_)
@@ -596,9 +609,33 @@ Sequence::get_name() const
      return "";
  }
  
-void Sequence::set_sequence(const std::string& s) 
+const Alphabet& Sequence::get_alphabet() const
+{
+  return get_alphabet(alphabet);
+}
+
+const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const
  {
-  set_filtered_sequence(s);
+  switch (alpha) {
+    case reduced_dna_alphabet:
+      return Alphabet::reduced_dna_alphabet;
+    case reduced_rna_alphabet:
+      return Alphabet::reduced_rna_alphabet;
+    case reduced_nucleic_alphabet:
+      return Alphabet::reduced_nucleic_alphabet;
+    case nucleic_alphabet:
+      return Alphabet::nucleic_alphabet;
+    case protein_alphabet:
+      return Alphabet::protein_alphabet;    
+    default:
+      throw std::runtime_error("unrecognized alphabet type");
+      break;
+  }
+}
+
+void Sequence::set_sequence(const std::string& s, alphabet_ref a) 
+{
+  set_filtered_sequence(s, a);
  }
  
  std::string Sequence::get_sequence() const
@@ -736,7 +773,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
    }
    getline(load_file, file_data_line);
    // looks like the sequence is written as a single line
-  set_filtered_sequence(file_data_line);
+  set_filtered_sequence(file_data_line, reduced_dna_alphabet);
    getline(load_file, file_data_line);
    getline(load_file, file_data_line);
    if (file_data_line == "<Annotations>")
@@ -793,107 +830,6 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
    load_file.close();
  }
  
-std::string
-Sequence::rc_motif(std::string a_motif) const
-{
-  std::string rev_comp;
-  char conversionTable[257];
-  int seq_i, table_i, len;
-
-  len = a_motif.length();
-  rev_comp.reserve(len);
-
-  for(table_i=0; table_i < 256; table_i++)
-  {
-    conversionTable[table_i] = '~';
-  }
-  // add end of std::string character for printing out table for testing purposes
-  conversionTable[256] = '\0';
-
-  // add in the characters for the bases we want to convert (IUPAC)
-  conversionTable[(int)'A'] = 'T';
-  conversionTable[(int)'T'] = 'A';
-  conversionTable[(int)'G'] = 'C';
-  conversionTable[(int)'C'] = 'G';
-  conversionTable[(int)'N'] = 'N';
-  conversionTable[(int)'M'] = 'K';
-  conversionTable[(int)'R'] = 'Y';
-  conversionTable[(int)'W'] = 'W';
-  conversionTable[(int)'S'] = 'S';
-  conversionTable[(int)'Y'] = 'R';
-  conversionTable[(int)'K'] = 'M';
-  conversionTable[(int)'V'] = 'B';
-  conversionTable[(int)'H'] = 'D';
-  conversionTable[(int)'D'] = 'H';
-  conversionTable[(int)'B'] = 'V';
-
-  // finally, the actual conversion loop
-  for(seq_i = len - 1; seq_i >= 0; seq_i--)
-  {
-    //std::cout << "** i = " << seq_i << " bp = " << 
-    table_i = (int) a_motif[seq_i];
-    rev_comp += conversionTable[table_i];
-  }
-
-  //std::cout << "seq: " << a_motif << std::endl;
-  //std::cout << "rc:  " << rev_comp << std::endl;
-
-  return rev_comp;
-}
-
-std::string
-Sequence::motif_normalize(const std::string& a_motif)
-{
-  std::string valid_motif;
-  int seq_i, len;
-
-  len = a_motif.length();
-  valid_motif.reserve(len);
-
-  // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
-  // current nonIUPAC symbols are omitted, which is not reported atm
-  for(seq_i = 0; seq_i < len; seq_i++)
-  {
-    if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
-      valid_motif += 'A';
-    else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
-      valid_motif += 'T';
-    else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
-      valid_motif += 'G';
-    else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
-      valid_motif += 'C';
-    else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
-      valid_motif += 'N';
-    else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
-      valid_motif += 'M';
-    else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
-      valid_motif += 'R';
-    else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
-      valid_motif += 'W';
-    else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
-      valid_motif += 'S';
-    else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
-      valid_motif += 'Y';
-    else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
-      valid_motif += 'G';
-    else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
-      valid_motif += 'V';
-    else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
-      valid_motif += 'H';
-    else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
-      valid_motif += 'D';
-    else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
-      valid_motif += 'B';
-    else {
-      std::string msg = "Letter ";
-      msg += a_motif[seq_i];
-      msg += " is not a valid IUPAC symbol";
-      throw motif_normalize_error(msg);
-    }
-  }
-  //std::cout << "valid_motif is: " << valid_motif << std::endl;
-  return valid_motif;
-}
  
  void Sequence::add_motif(const Sequence& a_motif)
  {
@@ -918,107 +854,78 @@ const std::list<motif>& Sequence::motifs() const
  }
  
  std::vector<int>
-Sequence::find_motif(const std::string& a_motif) const
+Sequence::find_motif(const Sequence& a_motif) const
  {
    std::vector<int> motif_match_starts;
-  std::string norm_motif_rc;
+  Sequence norm_motif_rc;
  
    motif_match_starts.clear();
+  // std::cout << "motif is: " << norm_motif << std::endl;
  
-  //std::cout << "motif is: " << a_motif << std::endl;
-  std::string norm_motif = motif_normalize(a_motif);
-  //std::cout << "motif is: " << a_motif << std::endl;
-
-  if (norm_motif.size() > 0)
+  if (a_motif.size() > 0)
    {
      //std::cout << "Sequence: none blank motif\n";
-    motif_scan(norm_motif, &motif_match_starts);
+    motif_scan(a_motif, &motif_match_starts);
  
-    norm_motif_rc = rc_motif(a_motif);
+    norm_motif_rc = a_motif.rev_comp();;
      // make sure not to do search again if it is a palindrome
-    if (norm_motif_rc != norm_motif) {
+    if (norm_motif_rc != a_motif) {
        motif_scan(norm_motif_rc, &motif_match_starts);
      }
    }
    return motif_match_starts;
  }
  
-std::vector<int>
-Sequence::find_motif(const Sequence& a_motif) const
-{
-  return find_motif(a_motif.get_sequence());
-}
-
  void
-Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts) const
+Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
  {
    // if there's no sequence we can't scan for it?
    // should this throw an exception?
    if (!seq) return;
  
-  std::string::const_iterator seq_c = seq->begin();
-  std::string::size_type seq_i;
-  int motif_i, motif_len;
+  std::string::size_type seq_i = 0;
+  Sequence::size_type motif_i = 0;
+  Sequence::size_type motif_len = a_motif.length();
+  Sequence::value_type motif_char;
+  Sequence::value_type seq_char;
  
-  //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl; 
-  motif_len = a_motif.length();
-
-  //std::cout << "motif_length: " << motif_len << std::endl;
-  //std::cout << "RAAARRRRR\n";
-
-  motif_i = 0;
-
-  //std::cout << "motif: " << a_motif << std::endl;
-
-  //std::cout << "Sequence: motif, length= " << length << std::endl;
-  seq_i = 0;
-  while (seq_i < length())
+  while (seq_i < seq->length())
    {
-    //std::cout << seq_c[seq_i];
-    //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
      // this is pretty much a straight translation of Nora's python code
      // to match iupac letter codes
-    if (a_motif[motif_i] =='N')
+    motif_char = toupper(a_motif[motif_i]);
+    seq_char = toupper(seq->at(seq_i));
+    if (motif_char =='N')
        motif_i++;
-    else if (a_motif[motif_i] == seq_c[seq_i])
+    else if (motif_char == seq_char)
        motif_i++;
-    else if ((a_motif[motif_i] =='M') && 
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+    else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
        motif_i++;
-    else if ((a_motif[motif_i] =='R') && 
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+    else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
        motif_i++;
-    else if ((a_motif[motif_i] =='W') && 
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+    else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
        motif_i++;
-    else if ((a_motif[motif_i] =='S') && 
-             ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+    else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
        motif_i++;
-    else if ((a_motif[motif_i] =='Y') && 
-             ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+    else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
        motif_i++;
-    else if ((a_motif[motif_i] =='K') && 
-             ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+    else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
        motif_i++;
-    else if ((a_motif[motif_i] =='V') && 
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
-              (seq_c[seq_i]=='G')))
+    else if ((motif_char =='V') && 
+             ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
        motif_i++;
-    else if ((a_motif[seq_i] =='H') && 
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
-              (seq_c[seq_i]=='T')))
+    else if ((motif_char =='H') && 
+             ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
        motif_i++;
-    else if ((a_motif[motif_i] =='D') &&
-             ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
-              (seq_c[seq_i]=='T')))
+    else if ((motif_char =='D') &&
+             ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
        motif_i++;
-    else if ((a_motif[motif_i] =='B') &&
-             ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
-              (seq_c[seq_i]=='T')))
+    else if ((motif_char =='B') &&
+             ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
        motif_i++;
-
      else
      {
+      // if a motif doesn't match, erase our current trial and try again
        seq_i -= motif_i;
        motif_i = 0;
      }
@@ -1026,7 +933,6 @@ Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts)
      // end Nora stuff, now we see if a match is found this pass
      if (motif_i == motif_len)
      {
-      //std::cout << "!!";
        annot new_motif;
        motif_match_starts->push_back(seq_i - motif_len + 1);
        motif_i = 0;
@@ -1118,9 +1024,14 @@ bool operator==(const Sequence& x, const Sequence& y)
    // since the length of the two sequences is equal, we only need to
    // test one.
    for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) {
-    if (*xseq_i != *yseq_i) {
+    if (toupper(*xseq_i) != toupper(*yseq_i)) {
        return false;
      }
    }
    return true;
  }
+
+bool operator!=(const Sequence& x, const Sequence& y) 
+{
+  return not operator==(x, y);
+}
diff --git a/alg/sequence.hpp b/alg/sequence.hpp

index 5010729243fc8b5caae76424419789db81c0b866..dc9d594a1f572753f73ced6a5de056db5b6c0209 100644 (file)
--- a/alg/sequence.hpp
+++ b/alg/sequence.hpp
@@ -31,6 +31,8 @@
  
  #include <iostream>
  
+#include "alg/alphabet.hpp"
+
  // Sequence data class
  
  //! Attach annotation information to a sequence track
@@ -87,6 +89,11 @@ BOOST_CLASS_EXPORT(motif);
  //! functions need the serialization support to be in-class.
  class seq_string : public std::string
  {
+public:
+  typedef std::string::iterator iterator;
+  typedef std::string::reverse_iterator reverse_iterator;
+  typedef std::string::const_iterator const_iterator;
+  typedef std::string::const_reverse_iterator const_reverse_iterator;
  private:
    friend class boost::serialization::access;
    template<class Archive>
@@ -94,7 +101,7 @@ private:
      //ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(std::string);
      ar & boost::serialization::make_nvp("bases",
            boost::serialization::base_object<std::string>(*this)
-         );
+    );
    }
  };
  
@@ -102,43 +109,39 @@ private:
  class Sequence 
  {
  public:
+  typedef std::string::value_type value_type;
    typedef std::string::difference_type difference_type;
    typedef std::string::iterator iterator;
+  typedef std::string::reverse_iterator reverse_iterator;
    typedef std::string::const_iterator const_iterator;
+  typedef std::string::const_reverse_iterator const_reverse_iterator;
    typedef std::string::reference reference;
    typedef std::string::const_reference const_reference;
    typedef std::string::size_type size_type;
    static const size_type npos = std::string::npos;
    enum strand_type { UnknownStrand, PlusStrand, MinusStrand, BothStrand };
-
-  // some standard dna alphabets 
-  // Include nl (\012), and cr (\015) to make sequence parsing eol 
-  // convention independent.
-
-  static const std::string dna_alphabet;
-  static const std::string rna_alphabet;
-  //! this is the general iupac alphabet for nucleotides
-  static const std::string nucleic_iupac_alphabet;
-  //! the protein alphabet
-  static const std::string protein_alphabet;
-
-  Sequence();
-  ~Sequence();
-  Sequence(const char* seq);
-  Sequence(const std::string& seq);
+  enum alphabet_ref { reduced_dna_alphabet, reduced_rna_alphabet, reduced_nucleic_alphabet, 
+                      nucleic_alphabet, protein_alphabet };
+                      
+  Sequence(alphabet_ref a = reduced_nucleic_alphabet);
+  Sequence(const char* seq, alphabet_ref a = reduced_nucleic_alphabet);
+  Sequence(const std::string& seq, alphabet_ref a = reduced_nucleic_alphabet);
    Sequence(const Sequence& seq);
+  ~Sequence();
    //! assignment to constant sequences
    Sequence &operator=(const Sequence&);
  
    friend std::ostream& operator<<(std::ostream&, const Sequence&);
    friend bool operator<(const Sequence&, const Sequence&);
    friend bool operator==(const Sequence&, const Sequence&);
+  friend bool operator!=(const Sequence&, const Sequence&);
    const_reference operator[](size_type) const;
  
    //! set sequence to a (sub)string containing nothing but AGCTN
-  void set_filtered_sequence(const std::string& seq, 
-                            size_type start=0,
-                            size_type count=npos,
+  void set_filtered_sequence(const std::string& seq,
+                             alphabet_ref a, 
+                             size_type start=0,
+                             size_type count=npos,
                               strand_type strand=UnknownStrand);
  
    //! retrive element at specific position
@@ -164,11 +167,13 @@ public:
  
    //! return a subsequence, copying over any appropriate annotation
    Sequence subseq(int start=0, int count = std::string::npos);
+  //! reverse a character
+  std::string create_reverse_map() const;
    //! return a reverse compliment (this needs to be improved?)
-  std::string rev_comp() const;
+  Sequence rev_comp() const;
  
    //! set sequence (filtered)
-  void set_sequence(const std::string &);
+  void set_sequence(const std::string &, alphabet_ref);
    //! get sequence
    std::string get_sequence() const;
    //! set species name
@@ -181,19 +186,32 @@ public:
    std::string get_fasta_header() const;
    //! get name (will return the first non-empty, of fasta_header, species)
    std::string get_name() const;
+  //! return a reference to whichever alphabet we're currently representing
+  const Alphabet& get_alphabet() const; 
+  //! return a reference to whichever alphabet we're currently representing
+  const Alphabet& get_alphabet(alphabet_ref) const; 
    
+  //! load sequence from fasta file using the sequences current alphabet
+  void load_fasta(const boost::filesystem::path file_path, int seq_num=1,
+                  int start_index=0, int end_index=0);
    //! load sequence AGCT from fasta file
    //! \throw mussa_load_error
    //! \throw sequence_empty_error
    //! \throw sequence_empty_file_error
-  void load_fasta(const boost::filesystem::path file_path, int seq_num=1, 
-                 int start_index=0, int end_index=0);
+  void load_fasta(const boost::filesystem::path file_path, 
+                  alphabet_ref a, 
+                  int seq_num=1, 
+                             int start_index=0, int end_index=0);
+  void load_fasta(std::iostream& file, 
+                  int seq_num=1, int start_index=0, int end_index=0);
    //! load sequence from stream
    //! \throw mussa_load_error
    //! \throw sequence_empty_error
    //! \throw sequence_empty_file_error
-  void load_fasta(std::iostream& file, int seq_num=1, 
-                 int start_index=0, int end_index=0);
+  void load_fasta(std::iostream& file, 
+                  alphabet_ref a,
+                  int seq_num=1, 
+                             int start_index=0, int end_index=0);
    //! load sequence annotations
    //! \throws mussa_load_error 
    void load_annot(const boost::filesystem::path file_path, int start_index, int end_index);
@@ -207,20 +225,12 @@ public:
    const std::list<motif>& motifs() const;
  
    //! add a motif to our list of motifs
-  //! \throws motif_normalize_error if there's something wrong with a_motif
    void add_motif(const Sequence& a_motif);
    //! clear our list of found motifs
    void clear_motifs();
    //! search a sequence for a_motif
    //! \throws motif_normalize_error if there's something wrong with a_motif
-  std::vector<int> find_motif(const std::string& a_motif) const;
-  //! search a sequence for a_motif
-  //! \throws motif_normalize_error if there's something wrong with a_motif
-  std::vector<int> find_motif(const Sequence& a_motif) const;
-  //! convert IUPAC symbols to upperase
-  //! \throws motif_normalize_error if there is an invalid symbol
-  static std::string motif_normalize(const std::string& a_motif);
-  
+  std::vector<int> find_motif(const Sequence& a_motif) const;  
    //! annotate the current sequence with other sequences
    void find_sequences(std::list<Sequence>::iterator start, 
                       std::list<Sequence>::iterator end);
@@ -233,6 +243,8 @@ private:
    Sequence *parent;
    //! hold a shared pointer to our sequence string
    boost::shared_ptr<seq_string> seq;
+  //! which alphabet we're using
+  alphabet_ref alphabet;
    //! start offset into the sequence
    size_type seq_start;
    //! number of basepairs of the shared sequence we represent
@@ -249,7 +261,7 @@ private:
    //! a seperate list for motifs since we're currently not saving them
    std::list<motif> motif_list;
  
-  void motif_scan(std::string a_motif, std::vector<int> * motif_match_starts) const;
+  void motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const;
    std::string rc_motif(std::string a_motif) const;
    //! look for a string sequence type and and it to an annotation list
    void add_string_annotation(std::string a_seq, std::string name);
@@ -260,6 +272,7 @@ private:
    void serialize(Archive& ar, const unsigned int /*version*/) {
      ar & BOOST_SERIALIZATION_NVP(parent);
      ar & BOOST_SERIALIZATION_NVP(seq);
+    ar & BOOST_SERIALIZATION_NVP(alphabet);
      ar & BOOST_SERIALIZATION_NVP(seq_start);
      ar & BOOST_SERIALIZATION_NVP(seq_count);
      ar & BOOST_SERIALIZATION_NVP(strand);
diff --git a/alg/test/CMakeLists.txt b/alg/test/CMakeLists.txt

index 64767bb99ff42e8b15219ff34bf2a67f452d0718..7e5b7a89f87127a0248e08a330a390cd64839f71 100644 (file)
--- a/alg/test/CMakeLists.txt
+++ b/alg/test/CMakeLists.txt
@@ -2,10 +2,19 @@ FIND_PACKAGE(OpenGL)
  INCLUDE(FindBoost)
  INCLUDE(Platform)
  
-SET(SOURCES test_annotation_color.cpp test_color.cpp test_conserved_path.cpp
-            test_flp.cpp test_glseqbrowser.cpp test_glsequence.cpp
-            test_main.cpp test_mussa.cpp test_nway.cpp 
-            test_sequence.cpp test_sequence_location.cpp )
+SET(SOURCES 
+      test_alphabet.cpp 
+      test_annotation_color.cpp 
+      test_color.cpp 
+      test_conserved_path.cpp
+      test_flp.cpp 
+      test_glseqbrowser.cpp 
+      test_glsequence.cpp
+      test_main.cpp
+      test_mussa.cpp 
+      test_nway.cpp 
+      test_sequence.cpp 
+      test_sequence_location.cpp )
  
  GET_MUSSA_COMPILE_FLAGS(ALG_TEST_CFLAGS)
  GET_MUSSA_LINK_FLAGS(ALG_TEST_LDFLAGS)
diff --git a/alg/test/test_alphabet.cpp b/alg/test/test_alphabet.cpp

new file mode 100644 (file)

index 0000000..bfe6e3e
--- /dev/null
+++ b/alg/test/test_alphabet.cpp
@@ -0,0 +1,21 @@
+#include <boost/test/auto_unit_test.hpp>
+
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/xml_oarchive.hpp>
+#include <boost/archive/xml_iarchive.hpp>
+
+#include "alg/alphabet.hpp"
+#include "mussa_exceptions.hpp"
+
+BOOST_AUTO_TEST_CASE( alphabet_simple )
+{
+  const Alphabet &a = Alphabet::reduced_dna_alphabet;
+  // exists is case insensitive
+  BOOST_CHECK_EQUAL( a.exists('a'), true);
+  BOOST_CHECK_EQUAL( a.exists('A'), true);
+  BOOST_CHECK_EQUAL( a.exists('Q'), false);
+  BOOST_CHECK_EQUAL( a.exists('q'), false);
+  
+  BOOST_CHECK_EQUAL( a.c_str(), "AaCcGgTtNn\012\015"); // copied from alphabet.cpp
+}
diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp

index c588555e9be18e9f9eecec5e858d70245820c9cd..d4b282132884855b7d2b74831c2ff35d27fa0147 100644 (file)
--- a/alg/test/test_sequence.cpp
+++ b/alg/test/test_sequence.cpp
@@ -65,33 +65,66 @@ BOOST_AUTO_TEST_CASE( sequence_eol_conventions )
  BOOST_AUTO_TEST_CASE( sequence_filter )
  {
    const char *core_seq = "AATTGGCC";
-  Sequence s1(core_seq);
+  Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL(s1, core_seq);
  
-  Sequence s2("aattggcc");
+  Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL(s2, "AATTGGCC");
    BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
+  BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
    BOOST_CHECK_EQUAL(s2.size(), s2.size());
-  BOOST_CHECK_EQUAL(s2.get_sequence(), core_seq);
+  BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
  
-  Sequence s3("asdfg");
+  Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL(s3, "ANNNG");
    BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
  
-  s3.set_filtered_sequence("AAGGCCTT", 0, 2); 
+  s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2); 
    BOOST_CHECK_EQUAL(s3, "AA");
-  s3.set_filtered_sequence("AAGGCCTT", 2, 2);
+  s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
    BOOST_CHECK_EQUAL( s3, "GG");
-  s3.set_filtered_sequence("AAGGCCTT", 4);
+  s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
    BOOST_CHECK_EQUAL( s3, "CCTT");
  
    s3 = "AAGGFF";
    BOOST_CHECK_EQUAL(s3, "AAGGNN");
  }
  
+BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
+{
+  std::string agct("AGCT");
+  Sequence seq(agct, Sequence::nucleic_alphabet);
+  BOOST_CHECK_EQUAL(seq.size(), agct.size());
+  BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
+  
+  std::string bdv("BDv");
+  Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
+  BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
+  BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
+  
+}
+
+BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
+{
+  std::string agct("AGCT");
+  Sequence seq(agct);
+  BOOST_CHECK_EQUAL(seq.size(), agct.size());
+  BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
+  BOOST_CHECK_EQUAL(seq[0], agct[0]);
+  BOOST_CHECK_EQUAL(seq[1], agct[1]);
+  BOOST_CHECK_EQUAL(seq[2], agct[2]);
+  BOOST_CHECK_EQUAL(seq[3], agct[3]);
+  
+  std::string bdv("BDv");
+  Sequence seq_bdv(bdv);
+  BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
+  // default alphabet only allows AGCTUN
+  BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");  
+}
+
  BOOST_AUTO_TEST_CASE( subseq_names )
  {
-  Sequence s1("AAGGCCTT");
+  Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
    s1.set_species("species");
    s1.set_fasta_header("a fasta header");
    Sequence s2 = s1.subseq(2,2);
@@ -107,7 +140,7 @@ BOOST_AUTO_TEST_CASE( sequence_start_stop )
    BOOST_CHECK_EQUAL( s1.stop(), 0 );
  
    std::string seq_string("AAGGCCTT");
-  Sequence s2(seq_string);
+  Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL( s2.start(), 0 );
    BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
  
@@ -132,7 +165,7 @@ BOOST_AUTO_TEST_CASE( sequence_load )
    fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
    seq_path /=  "human_mck_pro.fa";
    Sequence s;
-  s.load_fasta(seq_path);
+  s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
    BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
    BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
@@ -140,6 +173,113 @@ BOOST_AUTO_TEST_CASE( sequence_load )
                                      "5' flank");
  }
  
+BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
+{
+  std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+  std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
+  std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
+  std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
+  std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
+  std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
+  std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+  std::stringstream garbage_fasta(garbage_string);
+  
+  Sequence s;
+  s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
+  BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta, 
+                                 Sequence::reduced_dna_alphabet),
+                    sequence_invalid_load_error);
+  BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta, 
+                                 Sequence::reduced_dna_alphabet),
+                    sequence_invalid_load_error);
+  BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
+                                 Sequence::reduced_dna_alphabet),
+                    sequence_invalid_load_error);
+
+}
+
+BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
+{
+  std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
+  std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
+  std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
+  std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
+  std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+  std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
+  std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+  std::stringstream garbage_fasta(garbage_string);
+  
+  Sequence s;
+  s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
+  BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta, 
+                                 Sequence::reduced_rna_alphabet),
+                    sequence_invalid_load_error);
+  BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta, 
+                                 Sequence::reduced_rna_alphabet),
+                    sequence_invalid_load_error);
+  BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
+                                 Sequence::reduced_rna_alphabet),
+                    sequence_invalid_load_error);
+}
+
+BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
+{
+  std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
+  std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
+  std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
+  std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
+  std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
+  std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
+  std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
+  std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
+  std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
+  std::stringstream garbage_fasta(garbage_string);
+  
+  Sequence s;
+  Sequence specific;
+  // there's two copies of reduced_rna_fasta because i didn't feel like
+  // figuring out how to properly reset the read pointer in a stringstream
+  s.load_fasta(reduced_rna_fasta1);
+  specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
+  BOOST_CHECK_EQUAL(s, specific);
+  
+  s.load_fasta(reduced_dna_fasta1);
+  specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
+  BOOST_CHECK_EQUAL(s, specific);
+  
+  BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta), 
+                    sequence_invalid_load_error);
+  BOOST_CHECK_THROW(s.load_fasta(garbage_fasta), 
+                    sequence_invalid_load_error);
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
+{
+  std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
+  Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
+  Sequence seqr = seq.rev_comp();
+  
+  BOOST_CHECK( seq != seqr );
+  BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
+  BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
+{
+  std::string dna_str("AGCTN");
+  Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
+  BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");  
+  BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
+}
+
+BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
+{
+  std::string rna_str("AGCUN");
+  Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
+  BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");  
+  BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
+}
+
  BOOST_AUTO_TEST_CASE( annotation_load )
  {
    string annot_data = "human\n"
@@ -156,7 +296,7 @@ BOOST_AUTO_TEST_CASE( annotation_load )
                        ;
    string s(100, 'A');
    s += "GCTGCTAATT";
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
                       
    //istringstream annot_stream(annot_data);
    seq.parse_annot(annot_data, 0, 0);
@@ -207,7 +347,7 @@ BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
      "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
      "AAAAA"
      "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
    seq.parse_annot(annot_data);
    std::list<annot> annots = seq.annotations();
    BOOST_CHECK_EQUAL( annots.size(), 2);
@@ -228,7 +368,7 @@ BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
                        ;
    string s(100, 'A');
    s += "GCTGCTAATT";
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
                       
    //istringstream annot_stream(annot_data);
    seq.parse_annot(annot_data, 0, 0);
@@ -279,12 +419,12 @@ BOOST_AUTO_TEST_CASE ( sequence_size )
  
  BOOST_AUTO_TEST_CASE( sequence_empty_equality )
  {
-  Sequence szero("");
+  Sequence szero("", Sequence::reduced_dna_alphabet);
    BOOST_CHECK_EQUAL(szero.empty(), true);
    BOOST_CHECK_EQUAL(szero, szero);
    BOOST_CHECK_EQUAL(szero, "");
  
-  Sequence sclear("AGCT");
+  Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
    sclear.clear();
    BOOST_CHECK_EQUAL(sclear.empty(), true);
    BOOST_CHECK_EQUAL(sclear, sclear);
@@ -295,7 +435,7 @@ BOOST_AUTO_TEST_CASE( sequence_empty_equality )
  BOOST_AUTO_TEST_CASE ( sequence_iterators )
  {
    std::string seq_string = "AAGGCCTTNNTATA";
-  Sequence s(seq_string);
+  Sequence s(seq_string, Sequence::reduced_dna_alphabet);
    const Sequence cs(s);
    std::string::size_type count = 0;
  
@@ -324,7 +464,7 @@ BOOST_AUTO_TEST_CASE( sequence_motifs )
  {
    string m("AAAA");
    string bogus("AATTGGAA");
-  Sequence s1("AAAAGGGGCCCCTTTT");
+  Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
  
    list<motif>::const_iterator motif_i = s1.motifs().begin();
    list<motif>::const_iterator motif_end = s1.motifs().end();
@@ -401,7 +541,7 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence )
             "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
    string gc("GCCCCC");
    string gga("GGACACCTC");
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
  
    std::list<Sequence> query_list;
    std::list<string> string_list;
@@ -428,6 +568,13 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence )
      }
    }
    BOOST_CHECK_EQUAL(seq.annotations().size(), count);
+  const std::list<annot> &a = seq.annotations();
+  for (std::list<annot>::const_iterator annot_i = a.begin();
+       annot_i != a.end();
+       ++annot_i)
+  {
+    int count = annot_i->end - annot_i->begin ;
+  }
  }
  
  BOOST_AUTO_TEST_CASE( subseq_annotation_test )
@@ -440,7 +587,7 @@ BOOST_AUTO_TEST_CASE( subseq_annotation_test )
             "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
             "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
             "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
  
  
    seq.add_annotation(annot(0, 10, "0-10", "0-10"));
@@ -477,7 +624,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update )
             "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
             "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
             "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
  
    // starting conditions
    BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
@@ -498,7 +645,7 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update )
  BOOST_AUTO_TEST_CASE( out_operator )
  {
    string s("AAGGCCTT");
-  Sequence seq(s);
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
  
    ostringstream buf;
    buf << s;
@@ -507,7 +654,7 @@ BOOST_AUTO_TEST_CASE( out_operator )
  
  BOOST_AUTO_TEST_CASE( get_name )
  {
-  Sequence seq("AAGGCCTT");
+  Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
  
    BOOST_CHECK_EQUAL( seq.get_name(), "" );
    seq.set_species("hooman"); // anyone remember tradewars?
@@ -519,7 +666,7 @@ BOOST_AUTO_TEST_CASE( get_name )
  BOOST_AUTO_TEST_CASE( serialize_simple )
  {
    std::string seq_string = "AAGGCCTT";
-  Sequence seq(seq_string);
+  Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
    seq.set_species("ribbet");
    std::ostringstream oss;
    // allocate/deallocate serialization components
@@ -542,7 +689,7 @@ BOOST_AUTO_TEST_CASE( serialize_simple )
  BOOST_AUTO_TEST_CASE( serialize_tree )
  {
    std::string seq_string = "AAGGCCTT";
-  Sequence seq(seq_string);
+  Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
    seq.set_species("ribbet");
    seq.add_motif("AA");
    seq.add_motif("GC");
@@ -572,7 +719,7 @@ BOOST_AUTO_TEST_CASE( serialize_tree )
  BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
  {
    std::string seq_string = "AAGGCCTT";
-  Sequence seq(seq_string);
+  Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
    seq.set_species("ribbet");
    seq.add_motif("AA");
    seq.add_motif("GC");
@@ -599,7 +746,7 @@ BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
  BOOST_AUTO_TEST_CASE( serialize_xml_two )
  {
    std::string seq_string = "AAGGCCTT";
-  Sequence seq1(seq_string);
+  Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
    Sequence seq2(seq1);
  
    std::ostringstream oss;
diff --git a/mussa_exceptions.hpp b/mussa_exceptions.hpp

index 9adb5299f58dee677f2633f03488e163e69f91fb..065347a6b52ec8eb98eb48b5eb6b981d909a8a5e 100644 (file)
--- a/mussa_exceptions.hpp
+++ b/mussa_exceptions.hpp
@@ -55,7 +55,13 @@ public:
      sequence_load_error(msg) {};
  };
  
-
+// Empty unrecognized characters
+class sequence_invalid_load_error : public sequence_load_error 
+{
+public:
+  explicit sequence_invalid_load_error(const std::string& msg) :
+    sequence_load_error(msg) {};
+};
  //! failure running analysis
  class mussa_analysis_error : public mussa_error
  {
diff --git a/qui/motif_editor/MotifElement.cpp b/qui/motif_editor/MotifElement.cpp

index 5bbda0ec43f87750c4a0326275e96d88c660d72b..1804e94b962e2a05789c2d476edfb1f6b5c78acf 100644 (file)
--- a/qui/motif_editor/MotifElement.cpp
+++ b/qui/motif_editor/MotifElement.cpp
@@ -1,11 +1,12 @@
  #include "qui/motif_editor/MotifEditor.hpp"
+#include "alg/alphabet.hpp"
  
  #include <iostream>
  
  MotifElement::MotifElement() :
    enabled(true),
    color(Color(1.0,0.0,0.0,1.0)),
-  motif()
+  motif(Sequence::nucleic_alphabet)
  {
  }
  
@@ -76,7 +77,7 @@ void MotifElement::setSequence(const Sequence& seq)
  //! set sequence text
  void MotifElement::setSequence(const std::string& seq_text)
  {
-  motif.set_sequence(seq_text);
+  motif.set_sequence(seq_text, Sequence::nucleic_alphabet);
  }
  
  QString MotifElement::getSequenceText() const
diff --git a/qui/mussa_setup_dialog/MussaSetupWidget.cpp b/qui/mussa_setup_dialog/MussaSetupWidget.cpp

index 55d619921ce10735ae1111823c0d690457fe024e..4e7b37bf26a31c71550920c7477a85fd5d45ff90 100644 (file)
--- a/qui/mussa_setup_dialog/MussaSetupWidget.cpp
+++ b/qui/mussa_setup_dialog/MussaSetupWidget.cpp
@@ -69,10 +69,10 @@ MussaSetupWidget::MussaSetupWidget(QWidget *parent)
    row1Layout->addWidget(analysisNameLabel);
    row1Layout->addWidget(&analysisNameLineEdit);
  
-  row2Layout->addWidget(windowLabel);
-  row2Layout->addWidget(&windowLineEdit);
    row2Layout->addWidget(thresholdLabel);
    row2Layout->addWidget(&thresholdLineEdit);
+  row2Layout->addWidget(windowLabel);
+  row2Layout->addWidget(&windowLineEdit);
    row2Layout->addWidget(numOfSequencesLabel);
    row2Layout->addWidget(&numOfSequencesSpinBox);
author	Diane Trout <diane@caltech.edu>
	Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)
committer	Diane Trout <diane@caltech.edu>
	Tue, 10 Oct 2006 22:27:54 +0000 (22:27 +0000)
alg/CMakeLists.txt		patch \| blob \| history
alg/alphabet.cpp	[new file with mode: 0644]	patch \| blob
alg/alphabet.hpp	[new file with mode: 0644]	patch \| blob
alg/mussa.cpp		patch \| blob \| history
alg/sequence.cpp		patch \| blob \| history
alg/sequence.hpp		patch \| blob \| history
alg/test/CMakeLists.txt		patch \| blob \| history
alg/test/test_alphabet.cpp	[new file with mode: 0644]	patch \| blob
alg/test/test_sequence.cpp		patch \| blob \| history
mussa_exceptions.hpp		patch \| blob \| history
qui/motif_editor/MotifElement.cpp		patch \| blob \| history
qui/mussa_setup_dialog/MussaSetupWidget.cpp		patch \| blob \| history