try to read fasta blocks in the annotation file
author Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 08:10:00 +0000 (08:10 +0000)
committer Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 08:10:00 +0000 (08:10 +0000)
This also revealed some problems with the Spirit parser not really
splitting on whitespace. I should look into how to do per-character
parsing.

alg/sequence.cpp
alg/sequence.hpp
alg/test/test_sequence.cpp

index d5c77c87ea240e47d667f43f2c8e4776cf17b343..f0cb6e5178fbc19cdabbc9e7c37154ed4210e4fa 100644 (file)
@@ -220,6 +220,7 @@ Sequence::load_annot(fs::path file_path, int start_index, int end_index)
   {
     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
   }
+
   // so i should probably be passing the parse function some iterators
   // but the annotations files are (currently) small, so i think i can 
   // get away with loading the whole file into memory
@@ -274,32 +275,67 @@ struct push_back_annot {
   void operator()(std::string::const_iterator, 
                   std::string::const_iterator) const 
   {
+    std::cout << "adding annot: " << begin << " " << end << " " << name << " " << type << std::endl;
     annot_list.push_back(annot(begin, end, name, type));
   };
 };
 
+struct push_back_seq {
+  std::list<Sequence>& seq_list;
+  std::string& name;
+  std::string& seq;
+
+  push_back_seq(std::list<Sequence>& seq_list_,
+                std::string& name_, 
+                std::string& seq_)
+  : seq_list(seq_list_), 
+    name(name_),
+    seq(seq_)
+  {
+  }
+
+  void operator()(std::string::const_iterator, 
+                  std::string::const_iterator) const 
+  {
+    std::cout << "adding seq: " << name << " " << seq << std::endl;
+    Sequence s(seq);
+    s.set_header(name);
+    seq_list.push_back(s);
+  };
+};
 
 void
 Sequence::parse_annot(std::string data, int start_index, int end_index)
 {
-  std::string species_name;
   int start=0;
   int end=0;
   std::string name;
   std::string type;
-
+  std::string seq;
+  std::list<Sequence> query_seqs;
 
   bool status = spirit::parse(data.begin(), data.end(),
                 //begin grammar
                 (
-                (+(spirit::alpha_p))[spirit::assign_a(species_name)] >> 
-                    *((spirit::uint_p[spirit::assign_a(start)] >> 
-                      spirit::uint_p[spirit::assign_a(end)] >> 
-                      (*(spirit::alpha_p))[spirit::assign_a(name)] >> 
-                      (*(spirit::alpha_p))[spirit::assign_a(type)]
-                     // to understand, read the comment above 
-                     // struct push_back_annot
-                    )[push_back_annot(annots, start, end, name, type)])
+                (+(spirit::alpha_p))[spirit::assign_a(species)] >> 
+                    *(
+                       // parse an absolute location name
+                       (spirit::uint_p[spirit::assign_a(start)] >> 
+                        spirit::uint_p[spirit::assign_a(end)] >> 
+                        (*(spirit::alpha_p))[spirit::assign_a(name)]/* >> 
+                        (*(spirit::alpha_p))[spirit::assign_a(type)]*/
+                        // to understand how this group gets set
+                        // read the comment above struct push_back_annot
+                       )[push_back_annot(annots, start, end, type, name)]
+                     |
+                      (spirit::ch_p('>') >> 
+                       (*(spirit::alpha_p))[spirit::assign_a(name)] >>
+                       (+(spirit::ch_p('A')|
+                          spirit::ch_p('G')|
+                          spirit::ch_p('C')|
+                          spirit::ch_p('T'))[spirit::assign_a(seq)])
+                       )[push_back_seq(query_seqs, name, seq)]
+                     )
                 ),
                 //end grammar
                 spirit::space_p).full;
@@ -491,6 +527,10 @@ Sequence::rev_comp() const
   return rev_comp;
 }
 
+void Sequence::set_header(std::string &header_)
+{
+  header = header_;
+}
 
 const std::string&
 Sequence::get_header() const
index c3fb53fd74aeaf11bebd6e0ac915f0330a837106..3b09312da7aba088d233cf6ca911d221dd113ca1 100644 (file)
@@ -118,6 +118,7 @@ class Sequence
     std::string::size_type size() const;
     void clear();
 
+    void set_header(std::string& header);
     const std::string& get_header() const;
     //! add a motif to our list of motifs
     //! \throws motif_normalize_error if there's something wrong with a_motif
index 1e478fa4cf5a66e721a8187eef346ef06d133947..c2dea4e4f4cfb11a21b35679abefe5d1247d9fc5 100644 (file)
@@ -67,24 +67,31 @@ BOOST_AUTO_TEST_CASE( sequence_load )
 BOOST_AUTO_TEST_CASE( annotation_load )
 {
   string annot_data = "human\n"
-                      "0 10 name type\n"
-                      "10 20 name\n"
-                      "20 30\n"
-                      "15 20 backward\n";
+                      "0 10 name\n" // type\n"
+                      "10 20 myf\n"
+                      "20 30 myod\n"
+                      "50\t55 anothername\n"
+                      "60 50 backward\n"
+                      ">ident\n"
+                      "GCT\n"
+                      "GCT\n"
+                      ;
   string s('A',100);
   s += "GCTGCT";
   Sequence seq(s);
                      
   //istringstream annot_stream(annot_data);
   seq.parse_annot(annot_data, 0, 0);
-  typedef std::list<annot> annot_list_t;
-  annot_list_t annots = seq.annotations();
-  for(annot_list_t::iterator annot_i = annots.begin();
-      annot_i != annots.end();
-      ++annot_i)
-  {
-    std::cout << "start " << annot_i->start << endl;
-  }
+  std::list<annot> annots_list = seq.annotations();
+  std::vector<annot> annots(annots_list.begin(), annots_list.end());
+  BOOST_REQUIRE_EQUAL( annots.size(), 5);
+  BOOST_CHECK_EQUAL( annots[0].start, 0 );
+  BOOST_CHECK_EQUAL( annots[0].end, 10 );
+  //BOOST_CHECK_EQUAL( annots[0].type, "type");
+  BOOST_CHECK_EQUAL( annots[0].name, "name");
+  //BOOST_CHECK_EQUAL( annots[1].name, "myf7");
+
+  //BOOST_CHECK_EQUAL( annots
 }
 
 // ticket:83 when you try to load a sequence from a file that doesn't