use spirit parser for reading annot file

author Diane Trout <diane@caltech.edu>

Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)

committer Diane Trout <diane@caltech.edu>

Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)
author Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)
committer Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)
diff --git a/alg/sequence.cpp b/alg/sequence.cpp

index 53a13f0b266ddde77b716a55e7b991a4a1aac02a..d5c77c87ea240e47d667f43f2c8e4776cf17b343 100644 (file)
--- a/alg/sequence.cpp
+++ b/alg/sequence.cpp
@@ -22,6 +22,12 @@
  //                           ---------- sequence.cc -----------
  //                        ----------------------------------------
  #include <boost/filesystem/fstream.hpp>
+namespace fs = boost::filesystem;
+
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/actor/push_back_actor.hpp>
+#include <boost/spirit/iterator/file_iterator.hpp>
+namespace spirit = boost::spirit;
  
  #include "alg/sequence.hpp"
  #include "mussa_exceptions.hpp"
@@ -30,9 +36,6 @@
  #include <iostream>
  #include <sstream>
  
-namespace fs = boost::filesystem;
-using namespace std;
-
  annot::annot() 
   : start(0),
     end(0),
@@ -64,7 +67,7 @@ Sequence::Sequence()
    motif_list.clear();
  }
  
-Sequence::Sequence(string seq)
+Sequence::Sequence(std::string seq)
  {
    set_filtered_sequence(seq);
  }
@@ -91,7 +94,7 @@ char Sequence::operator[](int index) const
    return sequence[index];
  }
  
-ostream& operator<<(ostream& out, const Sequence& seq)
+std::ostream& operator<<(std::ostream& out, const Sequence& seq)
  {
    out << "Sequence(" << seq.get_seq() << ")";
    return out;
@@ -110,14 +113,14 @@ Sequence::load_fasta(fs::path file_path, int seq_num,
                       int start_index, int end_index)
  {
    fs::fstream data_file;
-  string file_data_line;
+  std::string file_data_line;
    int header_counter = 0;
    bool read_seq = true;
-  string rev_comp;
-  string sequence_raw;
-  string seq_tmp;             // holds sequence during basic filtering
+  std::string rev_comp;
+  std::string sequence_raw;
+  std::string seq_tmp;             // holds sequence during basic filtering
  
-  data_file.open(file_path, ios::in);
+  data_file.open(file_path, std::ios::in);
  
    if (seq_num == 0) {
      throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
@@ -158,18 +161,18 @@ Sequence::load_fasta(fs::path file_path, int seq_num,
        // sequence filtering for upcasing agctn and convert non AGCTN to N
        set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
      } else {
-      stringstream errormsg;
+      std::stringstream errormsg;
        errormsg << file_path.native_file_string()
-               << " did not have any fasta sequences" << endl;
+               << " did not have any fasta sequences" << std::endl;
        throw mussa_load_error(errormsg.str());
      }
      data_file.close();
    }
  }
  
-void Sequence::set_filtered_sequence(const string &old_seq, 
-                                     string::size_type start, 
-                                     string::size_type count)
+void Sequence::set_filtered_sequence(const std::string &old_seq, 
+                                     std::string::size_type start, 
+                                     std::string::size_type count)
  {
    char conversionTable[257];
  
@@ -200,7 +203,7 @@ void Sequence::set_filtered_sequence(const string &old_seq,
    conversionTable[(int)'c'] = 'C';
  
    // finally, the actual conversion loop
-  for(string::size_type seq_index = 0; seq_index < count; seq_index++)
+  for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
    {
      sequence += conversionTable[ (int)old_seq[seq_index+start]];
    }
@@ -209,105 +212,178 @@ void Sequence::set_filtered_sequence(const string &old_seq,
    // this doesn't work properly under gcc 3.x ... it can't recognize toupper
    //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
  
-
  void
  Sequence::load_annot(fs::path file_path, int start_index, int end_index)
  {
-  fs::fstream data_file;
-  string file_data_line;
-  annot an_annot;
-  string::size_type space_split_i;
-  string annot_value;
-  list<annot>::iterator list_i;
-  string err_msg;
-
-
-  annots.clear();
-  data_file.open(file_path, ios::in);
-
-  if (!data_file)
+  fs::fstream data_stream(file_path, std::ios::in);
+  if (!data_stream)
    {
      throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
    }
-  // if file opened okay, read it
-  else
+  // so i should probably be passing the parse function some iterators
+  // but the annotations files are (currently) small, so i think i can 
+  // get away with loading the whole file into memory
+  std::string data;
+  char c;
+  while(data_stream.good()) {
+    data_stream.get(c);
+    data.push_back(c);
+  }
+  data_stream.close();
+        
+  parse_annot(data, start_index, end_index);
+}
+
+/* If this works, yikes, this is some brain hurting code.
+ *
+ * what's going on is that when pb_annot is instantiated it stores references
+ * to begin, end, name, type, declared in the parse function, then
+ * when operator() is called it grabs values from those references
+ * and uses that to instantiate an annot object and append that to our
+ * annotation list.
+ *
+ * This weirdness is because the spirit library requires that actions
+ * conform to a specific prototype operator()(IteratorT, IteratorT)
+ * which doesn't provide any useful opportunity for me to actually
+ * grab the results of our parsing.
+ *
+ * so I instantiate this structure in order to have a place to grab
+ * my data from.
+ */
+  
+struct push_back_annot {
+  std::list<annot>& annot_list;
+  int& begin;
+  int& end;
+  std::string& name;
+  std::string& type;
+
+  push_back_annot(std::list<annot>& annot_list_, 
+                  int& begin_, 
+                  int& end_, 
+                  std::string& name_, 
+                  std::string& type_) 
+  : annot_list(annot_list_), 
+    begin(begin_),
+    end(end_),
+    name(name_),
+    type(type_)
    {
-    getline(data_file,file_data_line);
-    species = file_data_line;
+  }
  
-    // end_index = 0 means no end was specified, so cut to the end 
-    if (end_index == 0)
-      end_index = sequence.length();
+  void operator()(std::string::const_iterator, 
+                  std::string::const_iterator) const 
+  {
+    annot_list.push_back(annot(begin, end, name, type));
+  };
+};
  
-    //cout << "START: " << start_index << " END: " << end_index << endl;
  
-    while ( !data_file.eof() )
+void
+Sequence::parse_annot(std::string data, int start_index, int end_index)
+{
+  std::string species_name;
+  int start=0;
+  int end=0;
+  std::string name;
+  std::string type;
+
+
+  bool status = spirit::parse(data.begin(), data.end(),
+                //begin grammar
+                (
+                (+(spirit::alpha_p))[spirit::assign_a(species_name)] >> 
+                    *((spirit::uint_p[spirit::assign_a(start)] >> 
+                      spirit::uint_p[spirit::assign_a(end)] >> 
+                      (*(spirit::alpha_p))[spirit::assign_a(name)] >> 
+                      (*(spirit::alpha_p))[spirit::assign_a(type)]
+                     // to understand, read the comment above 
+                     // struct push_back_annot
+                    )[push_back_annot(annots, start, end, name, type)])
+                ),
+                //end grammar
+                spirit::space_p).full;
+}
+
+/*
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
+  std::string file_data_line;
+  annot an_annot;
+  std::string::size_type space_split_i;
+  std::string annot_value;
+  std::list<annot>::iterator list_i;
+  std::string err_msg;
+
+  annots.clear();
+
+  getline(data_stream,file_data_line);
+  species = file_data_line;
+
+  // end_index = 0 means no end was specified, so cut to the end 
+  if (end_index == 0)
+    end_index = sequence.length();
+
+  //std::cout << "START: " << start_index << " END: " << end_index << std::endl;
+
+  while ( !data_stream.eof() )
+  {
+    getline(data_stream,file_data_line);
+    if (file_data_line != "")
      {
-      getline(data_file,file_data_line);
-      if (file_data_line != "")
+      // need to get 4 values...almost same code 4 times...
+      // get annot start index
+      space_split_i = file_data_line.find(" ");
+      annot_value = file_data_line.substr(0,space_split_i);
+      an_annot.start = atoi (annot_value.c_str());
+      file_data_line = file_data_line.substr(space_split_i+1);
+      // get annot end index
+      space_split_i = file_data_line.find(" ");
+      annot_value = file_data_line.substr(0,space_split_i);
+      an_annot.end = atoi (annot_value.c_str());
+      file_data_line = file_data_line.substr(space_split_i+1);
+
+      //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+      //     << std::endl;
+
+      // get annot name
+      space_split_i = file_data_line.find(" ");
+      if (space_split_i == std::string::npos)  // no entries for name & type
+      {
+        std::cout << "seq, annots - no name or type\n";
+        an_annot.name = "";
+        an_annot.type = "";
+      }
+      else
        {
-        // need to get 4 values...almost same code 4 times...
-        // get annot start index
-        space_split_i = file_data_line.find(" ");
-        annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.start = atoi (annot_value.c_str());
-        file_data_line = file_data_line.substr(space_split_i+1);
-        // get annot end index
-        space_split_i = file_data_line.find(" ");
          annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.end = atoi (annot_value.c_str());
+        an_annot.name = annot_value;
          file_data_line = file_data_line.substr(space_split_i+1);
-
-        //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
-             //     << endl;
-
-        // get annot name
+        // get annot type
          space_split_i = file_data_line.find(" ");
-        if (space_split_i == string::npos)  // no entries for name & type
-        {
-          cout << "seq, annots - no name or type\n";
-          an_annot.name = "";
+        if (space_split_i == std::string::npos)  // no entry for type
            an_annot.type = "";
-        }
          else
          {
            annot_value = file_data_line.substr(0,space_split_i);
-          an_annot.name = annot_value;
-          file_data_line = file_data_line.substr(space_split_i+1);
-          // get annot type
-          space_split_i = file_data_line.find(" ");
-          if (space_split_i == string::npos)  // no entry for type
-            an_annot.type = "";
-          else
-          {
-            annot_value = file_data_line.substr(0,space_split_i);
-            an_annot.type = annot_value;
-          }
+          an_annot.type = annot_value;
          }
+      }
  
  
-        // add annot to list if it falls within the range of sequence specified
-        if ((start_index <= an_annot.start) && (end_index >= an_annot.end)) 
-        {
-          an_annot.start -= start_index;
-          an_annot.end -= start_index;
-          annots.push_back(an_annot);
-        }
-        // else no (or bogus) annotations
+      // add annot to list if it falls within the range of sequence specified
+      if ((start_index <= an_annot.start) && (end_index >= an_annot.end)) 
+      {
+        an_annot.start -= start_index;
+        an_annot.end -= start_index;
+        annots.push_back(an_annot);
        }
+      // else no (or bogus) annotations
      }
-
-    data_file.close();
-    /*
-    // debugging check
-    for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
-    {
-      cout << (*list_i).start << "," << (*list_i).end << "\t";
-      cout << (*list_i).name << "\t" << (*list_i).type << endl;
-    }
-    */
    }
  }
+*/
  
  const std::string& Sequence::get_species() const
  {
@@ -324,12 +400,12 @@ const std::list<annot>& Sequence::annotations() const
    return annots;
  }
  
-string::size_type Sequence::length() const
+std::string::size_type Sequence::length() const
  {
    return size();
  }
  
-string::size_type Sequence::size() const
+std::string::size_type Sequence::size() const
  {
    return sequence.size();
  }
@@ -355,14 +431,14 @@ Sequence::const_iterator Sequence::end() const
  }
  
  
-const string&
+const std::string&
  Sequence::get_seq() const
  {
    return sequence;
  }
  
  
-string
+std::string
  Sequence::subseq(int start, int end) const
  {
    return sequence.substr(start, end);
@@ -375,10 +451,10 @@ Sequence::c_seq() const
    return sequence.c_str();
  }
  
-string
+std::string
  Sequence::rev_comp() const
  {
-  string rev_comp;
+  std::string rev_comp;
    char conversionTable[257];
    int seq_i, table_i, len;
  
@@ -416,14 +492,14 @@ Sequence::rev_comp() const
  }
  
  
-const string&
+const std::string&
  Sequence::get_header() const
  {
    return header;
  }
  /*
  //FIXME: i don't think this code is callable
-string 
+std::string 
  Sequence::sp_name() const
  {
    return species;
@@ -431,14 +507,14 @@ Sequence::sp_name() const
  */
  
  void
-Sequence::set_seq(const string& a_seq)
+Sequence::set_seq(const std::string& a_seq)
  {
    set_filtered_sequence(a_seq);
  }
  
  
  /*
-string 
+std::string 
  Sequence::species()
  {
    return species;
@@ -456,28 +532,28 @@ Sequence::clear()
  
  void
  Sequence::save(fs::fstream &save_file)
-               //string save_file_path)
+               //std::string save_file_path)
  {
    //fstream save_file;
-  list<annot>::iterator annots_i;
+  std::list<annot>::iterator annots_i;
  
    // not sure why, or if i'm doing something wrong, but can't seem to pass
    // file pointers down to this method from the mussa control class
    // so each call to save a sequence appends to the file started by mussa_class
-  //save_file.open(save_file_path.c_str(), ios::app);
+  //save_file.open(save_file_path.c_str(), std::ios::app);
  
-  save_file << "<Sequence>" << endl;
-  save_file << sequence << endl;
-  save_file << "</Sequence>" << endl;
+  save_file << "<Sequence>" << std::endl;
+  save_file << sequence << std::endl;
+  save_file << "</Sequence>" << std::endl;
  
-  save_file << "<Annotations>" << endl;
-  save_file << species << endl;
+  save_file << "<Annotations>" << std::endl;
+  save_file << species << std::endl;
    for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
    {
      save_file << annots_i->start << " " << annots_i->end << " " ;
-    save_file << annots_i->name << " " << annots_i->type << endl;
+    save_file << annots_i->name << " " << annots_i->type << std::endl;
    }
-  save_file << "</Annotations>" << endl;
+  save_file << "</Annotations>" << std::endl;
    //save_file.close();
  }
  
@@ -485,14 +561,14 @@ void
  Sequence::load_museq(fs::path load_file_path, int seq_num)
  {
    fs::fstream load_file;
-  string file_data_line;
+  std::string file_data_line;
    int seq_counter;
    annot an_annot;
-  string::size_type space_split_i;
-  string annot_value;
+  std::string::size_type space_split_i;
+  std::string annot_value;
  
    annots.clear();
-  load_file.open(load_file_path, ios::in);
+  load_file.open(load_file_path, std::ios::in);
  
    seq_counter = 0;
    // search for the seq_num-th sequence 
@@ -526,9 +602,9 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
          annot_value = file_data_line.substr(0,space_split_i);
          an_annot.end = atoi (annot_value.c_str());
  
-        if (space_split_i == string::npos)  // no entry for type or name
+        if (space_split_i == std::string::npos)  // no entry for type or name
          {
-          cout << "seq, annots - no type or name\n";
+          std::cout << "seq, annots - no type or name\n";
            an_annot.type = "";
            an_annot.name = "";
          }
@@ -538,9 +614,9 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
            space_split_i = file_data_line.find(" ");
            annot_value = file_data_line.substr(0,space_split_i);
            an_annot.type = annot_value;
-          if (space_split_i == string::npos)  // no entry for name
+          if (space_split_i == std::string::npos)  // no entry for name
            {
-            cout << "seq, annots - no name\n";
+            std::cout << "seq, annots - no name\n";
              an_annot.name = "";
            }
            else          // get annot name
@@ -553,18 +629,18 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
          }
          annots.push_back(an_annot);  // don't forget to actually add the annot
        }
-      //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
-      //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
+      //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+      //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
      }
    }
    load_file.close();
  }
  
  
-string
-Sequence::rc_motif(string a_motif)
+std::string
+Sequence::rc_motif(std::string a_motif)
  {
-  string rev_comp;
+  std::string rev_comp;
    char conversionTable[257];
    int seq_i, table_i, len;
  
@@ -575,7 +651,7 @@ Sequence::rc_motif(string a_motif)
    {
      conversionTable[table_i] = '~';
    }
-  // add end of string character for printing out table for testing purposes
+  // add end of std::string character for printing out table for testing purposes
    conversionTable[256] = '\0';
  
    // add in the characters for the bases we want to convert (IUPAC)
@@ -598,21 +674,21 @@ Sequence::rc_motif(string a_motif)
    // finally, the actual conversion loop
    for(seq_i = len - 1; seq_i >= 0; seq_i--)
    {
-    //cout << "** i = " << seq_i << " bp = " << 
+    //std::cout << "** i = " << seq_i << " bp = " << 
      table_i = (int) a_motif[seq_i];
      rev_comp += conversionTable[table_i];
    }
  
-  //cout << "seq: " << a_motif << endl;
-  //cout << "rc:  " << rev_comp << endl;
+  //std::cout << "seq: " << a_motif << std::endl;
+  //std::cout << "rc:  " << rev_comp << std::endl;
  
    return rev_comp;
  }
  
-string
-Sequence::motif_normalize(string a_motif)
+std::string
+Sequence::motif_normalize(std::string a_motif)
  {
-  string valid_motif;
+  std::string valid_motif;
    int seq_i, len;
  
    len = a_motif.length();
@@ -653,21 +729,21 @@ Sequence::motif_normalize(string a_motif)
      else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
        valid_motif += 'B';
      else {
-      string msg = "Letter ";
+      std::string msg = "Letter ";
        msg += a_motif[seq_i];
        msg += " is not a valid IUPAC symbol";
        throw motif_normalize_error(msg);
      }
    }
-  //cout << "valid_motif is: " << valid_motif << endl;
+  //std::cout << "valid_motif is: " << valid_motif << std::endl;
    return valid_motif;
  }
  
-void Sequence::add_motif(string a_motif)
+void Sequence::add_motif(std::string a_motif)
  {
-  vector<int> motif_starts = find_motif(a_motif);
+  std::vector<int> motif_starts = find_motif(a_motif);
  
-  for(vector<int>::iterator motif_start_i = motif_starts.begin();
+  for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
        motif_start_i != motif_starts.end();
        ++motif_start_i)
    {
@@ -680,26 +756,26 @@ void Sequence::clear_motifs()
    motif_list.clear();
  }
  
-const list<motif>& Sequence::motifs() const
+const std::list<motif>& Sequence::motifs() const
  {
    return motif_list;
  }
  
-vector<int>
-Sequence::find_motif(string a_motif)
+std::vector<int>
+Sequence::find_motif(std::string a_motif)
  {
-  vector<int> motif_match_starts;
-  string a_motif_rc;
+  std::vector<int> motif_match_starts;
+  std::string a_motif_rc;
  
    motif_match_starts.clear();
  
-  //cout << "motif is: " << a_motif << endl;
+  //std::cout << "motif is: " << a_motif << std::endl;
    a_motif = motif_normalize(a_motif);
-  //cout << "motif is: " << a_motif << endl;
+  //std::cout << "motif is: " << a_motif << std::endl;
  
    if (a_motif != "")
    {
-    //cout << "Sequence: none blank motif\n";
+    //std::cout << "Sequence: none blank motif\n";
      motif_scan(a_motif, &motif_match_starts);
  
      a_motif_rc = rc_motif(a_motif);
@@ -711,30 +787,30 @@ Sequence::find_motif(string a_motif)
  }
  
  void
-Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
+Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts)
  {
    char * seq_c;
-  string::size_type seq_i;
+  std::string::size_type seq_i;
    int motif_i, motif_len;
  
-  // faster to loop thru the sequence as a old c string (ie char array)
+  // faster to loop thru the sequence as a old c std::string (ie char array)
    seq_c = (char*)sequence.c_str();
-  //cout << "Sequence: motif, seq len = " << sequence.length() << endl; 
+  //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl; 
    motif_len = a_motif.length();
  
-  //cout << "motif_length: " << motif_len << endl;
-  //cout << "RAAARRRRR\n";
+  //std::cout << "motif_length: " << motif_len << std::endl;
+  //std::cout << "RAAARRRRR\n";
  
    motif_i = 0;
  
-  //cout << "motif: " << a_motif << endl;
+  //std::cout << "motif: " << a_motif << std::endl;
  
-  //cout << "Sequence: motif, length= " << length << endl;
+  //std::cout << "Sequence: motif, length= " << length << std::endl;
    seq_i = 0;
    while (seq_i < sequence.length())
    {
-    //cout << seq_c[seq_i];
-    //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
+    //std::cout << seq_c[seq_i];
+    //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
      // this is pretty much a straight translation of Nora's python code
      // to match iupac letter codes
      if (a_motif[motif_i] =='N')
@@ -785,7 +861,7 @@ Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
      // end Nora stuff, now we see if a match is found this pass
      if (motif_i == motif_len)
      {
-      //cout << "!!";
+      //std::cout << "!!";
        annot new_motif;
        motif_match_starts->push_back(seq_i - motif_len + 1);
        motif_i = 0;
@@ -793,15 +869,15 @@ Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
  
      seq_i++;
    }
-  //cout << endl;
+  //std::cout << std::endl;
  }
  
  void Sequence::add_string_annotation(std::string a_seq, 
                                       std::string name)
  {
-  vector<int> seq_starts = find_motif(a_seq);
+  std::vector<int> seq_starts = find_motif(a_seq);
  
-  for(vector<int>::iterator seq_start_i = seq_starts.begin();
+  for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
        seq_start_i != seq_starts.end();
        ++seq_start_i)
    {
diff --git a/alg/sequence.hpp b/alg/sequence.hpp

index dac29db641de6fae3a28d50f0b3476e3fc3e1b1b..c3fb53fd74aeaf11bebd6e0ac915f0330a837106 100644 (file)
--- a/alg/sequence.hpp
+++ b/alg/sequence.hpp
@@ -89,9 +89,12 @@ class Sequence
      void load_fasta(const boost::filesystem::path file_path, int seq_num=1, 
                      int start_index=0, int end_index=0);
      //! load sequence annotations
-    /*! \throws mussa_load_error 
-     */
+    //! \throws mussa_load_error 
      void load_annot(const boost::filesystem::path file_path, int start_index, int end_index);
+    //! load sequence annotations
+    //! \throws mussa_load_error 
+    //void load_annot(std::istream& data_stream, int start_index, int end_index);
+    void parse_annot(std::string data, int start_index, int end_index);
      const std::list<annot>& annotations() const;
      const std::list<motif>& motifs() const;
      const std::string& get_species() const;
diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp

index ae2726cc0d0321d40a82829151426308db557e7c..1e478fa4cf5a66e721a8187eef346ef06d133947 100644 (file)
--- a/alg/test/test_sequence.cpp
+++ b/alg/test/test_sequence.cpp
@@ -5,6 +5,7 @@ namespace fs=boost::filesystem;
  
  #include <list>
  #include <iostream>
+#include <sstream>
  
  #include "alg/sequence.hpp"
  #include "mussa_exceptions.hpp"
@@ -63,6 +64,29 @@ BOOST_AUTO_TEST_CASE( sequence_load )
                                      "5' flank");
  }
  
+BOOST_AUTO_TEST_CASE( annotation_load )
+{
+  string annot_data = "human\n"
+                      "0 10 name type\n"
+                      "10 20 name\n"
+                      "20 30\n"
+                      "15 20 backward\n";
+  string s('A',100);
+  s += "GCTGCT";
+  Sequence seq(s);
+                     
+  //istringstream annot_stream(annot_data);
+  seq.parse_annot(annot_data, 0, 0);
+  typedef std::list<annot> annot_list_t;
+  annot_list_t annots = seq.annotations();
+  for(annot_list_t::iterator annot_i = annots.begin();
+      annot_i != annots.end();
+      ++annot_i)
+  {
+    std::cout << "start " << annot_i->start << endl;
+  }
+}
+
  // ticket:83 when you try to load a sequence from a file that doesn't
  // have fasta headers it crashes. 
  BOOST_AUTO_TEST_CASE( sequence_past_end ) 
@@ -161,21 +185,42 @@ BOOST_AUTO_TEST_CASE( annot_test )
  
  BOOST_AUTO_TEST_CASE( annotate_from_sequence )
  {
-  Sequence s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
-             "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
-             "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
-             "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
-             "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
-             "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
-             "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
-             "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
+  string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
+           "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
+           "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
+           "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
+           "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
+           "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
+           "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
+           "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
+  string gc("GCCCCC");
+  string gga("GGACACCTC");
+  Sequence seq(s);
  
    std::list<Sequence> query_list;
-  query_list.push_back(Sequence("GCCCCC"));
-  query_list.push_back(Sequence("GGACACCTC"));
-
-  BOOST_CHECK_EQUAL( s.annotations().size(), 0 );
-  s.find_sequences(query_list.begin(), query_list.end());
-  BOOST_CHECK_EQUAL( s.annotations().size(), 4 );
+  std::list<string> string_list;
+  query_list.push_back(Sequence(gc));
+  string_list.push_back(gc);
+  query_list.push_back(Sequence(gga));
+  string_list.push_back(gga);
+
+  BOOST_CHECK_EQUAL( seq.annotations().size(), 0 );
+  seq.find_sequences(query_list.begin(), query_list.end());
+  
+  int count = 0;
+  for(list<string>::iterator string_i = string_list.begin();
+      string_i != string_list.end();
+      ++string_i)
+  {
+    string::size_type pos=0;
+    while(pos != string::npos) {
+      pos = s.find(*string_i, pos);
+      if (pos != string::npos) {
+        ++count;
+        ++pos;
+      }
+    }
+  }
+  BOOST_CHECK_EQUAL(seq.annotations().size(), count);
  }
author	Diane Trout <diane@caltech.edu>
	Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)
committer	Diane Trout <diane@caltech.edu>
	Wed, 17 May 2006 01:57:30 +0000 (01:57 +0000)
alg/sequence.cpp		patch \| blob \| history
alg/sequence.hpp		patch \| blob \| history
alg/test/test_sequence.cpp		patch \| blob \| history