Switch to a character-based Spirit parser
author Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 08:36:18 +0000 (08:36 +0000)
committer Diane Trout <diane@caltech.edu>
Wed, 17 May 2006 08:36:18 +0000 (08:36 +0000)
By not letting Spirit handle whitespace skipping automatically, I was able
to actually get parsing to work correctly.

alg/sequence.cpp
alg/test/test_sequence.cpp

index f0cb6e5178fbc19cdabbc9e7c37154ed4210e4fa..f3f68a80431bfa7dabfa7139994b361d9fcdd6c8 100644 (file)
@@ -297,8 +297,17 @@ struct push_back_seq {
   void operator()(std::string::const_iterator, 
                   std::string::const_iterator) const 
   {
-    std::cout << "adding seq: " << name << " " << seq << std::endl;
-    Sequence s(seq);
+    // filter out newlines from our sequence
+    std::string new_seq;
+    for(std::string::const_iterator seq_i = seq.begin();
+        seq_i != seq.end();
+        ++seq_i)
+    {
+      if (*seq_i != '\n') new_seq += *seq_i;
+    }
+    std::cout << "adding seq: " << name << " " << new_seq << std::endl;
+    
+    Sequence s(new_seq);
     s.set_header(name);
     seq_list.push_back(s);
   };
@@ -315,30 +324,42 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
   std::list<Sequence> query_seqs;
 
   bool status = spirit::parse(data.begin(), data.end(),
-                //begin grammar
                 (
+                //begin grammar
                 (+(spirit::alpha_p))[spirit::assign_a(species)] >> 
+                 +(spirit::space_p) >>
                     *(
-                       // parse an absolute location name
+                      ( // parse an absolute location name
                        (spirit::uint_p[spirit::assign_a(start)] >> 
+                        +spirit::space_p >>
                         spirit::uint_p[spirit::assign_a(end)] >> 
-                        (*(spirit::alpha_p))[spirit::assign_a(name)]/* >> 
-                        (*(spirit::alpha_p))[spirit::assign_a(type)]*/
+                        +spirit::space_p >>
+                        (*(spirit::alpha_p|spirit::digit_p))[spirit::assign_a(name)] >> 
+                          // optional type
+                          !(
+                             +spirit::space_p >>
+                             (*(spirit::alpha_p))[spirit::assign_a(type)]
+                           )
                         // to understand how this group gets set
                         // read the comment above struct push_back_annot
                        )[push_back_annot(annots, start, end, type, name)]
                      |
                       (spirit::ch_p('>') >> 
-                       (*(spirit::alpha_p))[spirit::assign_a(name)] >>
+                       (*(~spirit::chlit<char>('\n')))[spirit::assign_a(name)] >>
+                        +spirit::space_p >>
                        (+(spirit::ch_p('A')|
                           spirit::ch_p('G')|
                           spirit::ch_p('C')|
-                          spirit::ch_p('T'))[spirit::assign_a(seq)])
+                          spirit::ch_p('T')|
+                          spirit::ch_p('N')|
+                          spirit::ch_p('\n')))[spirit::assign_a(seq)]
                        )[push_back_seq(query_seqs, name, seq)]
+                      ) >>
+                      *spirit::space_p
                      )
-                ),
                 //end grammar
-                spirit::space_p).full;
+                ) /*,
+                spirit::space_p*/).full;
 }
 
 /*
index c2dea4e4f4cfb11a21b35679abefe5d1247d9fc5..660f4df8fb7cdaf23a49fc852b2466d741e698bb 100644 (file)
@@ -67,14 +67,14 @@ BOOST_AUTO_TEST_CASE( sequence_load )
 BOOST_AUTO_TEST_CASE( annotation_load )
 {
   string annot_data = "human\n"
-                      "0 10 name\n" // type\n"
-                      "10 20 myf\n"
+                      "0 10 name   type\n"
+                      "10 20 myf7\n"
                       "20 30 myod\n"
                       "50\t55 anothername\n"
                       "60 50 backward\n"
-                      ">ident\n"
-                      "GCT\n"
+                      ">ident3 asdf\n"
                       "GCT\n"
+                      "GCTN\n"
                       ;
   string s('A',100);
   s += "GCTGCT";