throw errors when spirit parsing fails
author Diane Trout <diane@caltech.edu>
Wed, 11 Oct 2006 20:59:30 +0000 (20:59 +0000)
committer Diane Trout <diane@caltech.edu>
Wed, 11 Oct 2006 20:59:30 +0000 (20:59 +0000)
ticket:126
This patch adds code to throw errors (and give a rough idea of
where the error occurred) when the spirit parser fails. Currently
this covers loading motifs and loading sequence annotations.

Also added a typedef, Mussa::motif_set, as shorthand for std::set<Sequence>.

alg/mussa.cpp
alg/mussa.hpp
alg/sequence.cpp
alg/sequence.hpp
alg/test/test_mussa.cpp
alg/test/test_sequence.cpp
mussa_exceptions.hpp

index 50991c1e1d1411a574f9663b6f43ffbacfa8a0ef..634cb6d58dae2c72f032657dfe75152df8ab8bf1 100644 (file)
@@ -764,7 +764,7 @@ void Mussa::set_motifs(const vector<Sequence>& motifs,
 
 // Helper functor to append created motifs to our Mussa analysis
 struct push_back_motif {
-  std::set<Sequence>& motif_set;
+  Mussa::motif_set& motifs;
   boost::shared_ptr<AnnotationColors> color_mapper;
   std::string& seq_string;
   std::string& name;
@@ -772,20 +772,23 @@ struct push_back_motif {
   float& green;
   float& blue;
   float& alpha;
+  int& parsed;
 
-  push_back_motif(std::set<Sequence>& motif_set_,
+  push_back_motif(Mussa::motif_set& motifs_,
                   boost::shared_ptr<AnnotationColors> color_mapper_,
                   std::string& seq_, 
                   std::string& name_,
-                  float &red_, float &green_, float &blue_, float &alpha_)
-    : motif_set(motif_set_),
+                  float &red_, float &green_, float &blue_, float &alpha_,
+                  int &parsed_)
+    : motifs(motifs_),
       color_mapper(color_mapper_),
       seq_string(seq_),
       name(name_),
       red(red_),
       green(green_),
       blue(blue_),
-      alpha(alpha_)
+      alpha(alpha_),
+      parsed(parsed_)
   {
   }
 
@@ -801,10 +804,18 @@ struct push_back_motif {
     // just attach colors directly to the motif.
     Color c(red, green, blue);
     color_mapper->appendInstanceColor("motif", seq.c_str(), c);
-    motif_set.insert(seq);
+    motifs.insert(seq);
+    ++parsed;
   };
 };
 
+void Mussa::load_motifs(fs::path filename)
+{
+  fs::ifstream f;
+  f.open(filename, ifstream::in);
+  load_motifs(f);
+}
+
 // I mostly split the ifstream out so I can use a stringstream to test it.
 void Mussa::load_motifs(std::istream &in)
 {
@@ -816,6 +827,7 @@ void Mussa::load_motifs(std::istream &in)
   float green = 0.0;
   float blue = 0.0;
   float alpha = 1.0;
+  int parsed = 1;
 
   // slurp our data into a string
   std::streamsize bytes_read = 1;
@@ -840,21 +852,18 @@ void Mussa::load_motifs(std::istream &in)
         spirit::real_p[spirit::assign_a(red)] >> +spirit::space_p >>
         spirit::real_p[spirit::assign_a(green)] >> +spirit::space_p >>
         spirit::real_p[spirit::assign_a(blue)] >> +spirit::space_p
-       )[push_back_motif(motif_sequences, color_mapper, seq, name, red, green, blue, alpha)]
+       )[push_back_motif(motif_sequences, color_mapper, seq, name, red, green, blue, alpha, parsed)]
      )).full;
   if (not ok) {
-    std::clog << "Error parsing motif stream " << std::endl;
+    stringstream msg;
+    msg << "Error parsing motif #" << parsed;
+    // erase our potentially broken motif list
+    motif_sequences.clear();
+    throw motif_load_error(msg.str());
   }
   update_sequences_motifs();
 }
 
-void Mussa::load_motifs(fs::path filename)
-{
-  fs::ifstream f;
-  f.open(filename, ifstream::in);
-  load_motifs(f);
-}
-
 void Mussa::update_sequences_motifs()
 {
   // once we've loaded all the motifs from the file, 
index cc806073fd92658475ad85608f77af7d10fae7b5..64f46499b2ccd0eb450d051e219335eb59256c2e 100644 (file)
@@ -40,6 +40,7 @@ signals:
     void progress(const std::string& description, int cur, int max);
 
 public:
+    typedef std::set<Sequence> motif_set;
     enum analysis_modes { TransitiveNway, RadialNway, EntropyNway, 
                           RecursiveNway };
 
@@ -168,18 +169,20 @@ public:
      */
     void set_motifs(const std::vector<Sequence>& motifs, 
                     const std::vector<Color>& colors);
-    //! load motifs from an ifstream
     /*! The file should look something like
      *  <sequence> <red> <green> <blue>
      *  where sequence is a string of IUPAC symbols
      *  and red,green,blue are a white space separated list of floats
      *  in the range [0.0, 1.0]
      */
-    void load_motifs(std::istream &);
     //! load a list of motifs from a file named filename
     void load_motifs(boost::filesystem::path filename);
+    //! load motifs from an ifstream
+    /*! \sa Mussa::load_motifs(boost::filesystem::path)
+     */
+    void load_motifs(std::istream &);
     //! return our motifs;
-    const std::set<Sequence>& motifs() const;
+    const motif_set& motifs() const;
 
     //! return color mapper
     boost::shared_ptr<AnnotationColors> colorMapper();
@@ -214,7 +217,7 @@ public:
     NwayPaths the_paths;
 
     //! motif list
-    std::set<Sequence> motif_sequences;
+    motif_set motif_sequences;
     //! color manager
     boost::shared_ptr<AnnotationColors> color_mapper;
     //! path to our analysis
index ae58e5c87ed2da52f47193c679a5377b41aa03fe..e1923c68d49e484ac769800c66fccff7952ef6a5 100644 (file)
@@ -345,17 +345,20 @@ struct push_back_annot {
   int& end;
   std::string& name;
   std::string& type;
+  int &parsed;
 
   push_back_annot(std::list<annot>& annot_list_, 
                   int& begin_, 
                   int& end_, 
                   std::string& name_, 
-                  std::string& type_) 
+                  std::string& type_,
+                  int &parsed_) 
   : annot_list(annot_list_), 
     begin(begin_),
     end(end_),
     name(name_),
-    type(type_)
+    type(type_),
+    parsed(parsed_)
   {
   }
 
@@ -364,6 +367,7 @@ struct push_back_annot {
   {
     //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
     annot_list.push_back(annot(begin, end, name, type));
+    ++parsed;
   };
 };
 
@@ -371,13 +375,16 @@ struct push_back_seq {
   std::list<Sequence>& seq_list;
   std::string& name;
   std::string& seq;
+  int &parsed;
 
   push_back_seq(std::list<Sequence>& seq_list_,
                 std::string& name_, 
-                std::string& seq_)
+                std::string& seq_,
+                int &parsed_)
   : seq_list(seq_list_), 
     name(name_),
-    seq(seq_)
+    seq(seq_),
+    parsed(parsed_)
   {
   }
 
@@ -397,10 +404,11 @@ struct push_back_seq {
     Sequence s(new_seq);
     s.set_fasta_header(name);
     seq_list.push_back(s);
+    ++parsed;
   };
 };
 
-bool
+void
 Sequence::parse_annot(std::string data, int start_index, int end_index)
 {
   int start=0;
@@ -408,62 +416,69 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
   std::string name;
   std::string type;
   std::string seq;
+  std::list<annot> parsed_annots;
   std::list<Sequence> query_seqs;
-
-  bool status = spirit::parse(data.begin(), data.end(),
-                (
-                 //begin grammar
-                   !(
-                      (
-                        spirit::alpha_p >> 
-                        +(spirit::graph_p)
-                      )[spirit::assign_a(species)] >> 
-                      +(spirit::space_p)
-                    ) >>
-                    *(
-                       ( // ignore html tags
-                         *(spirit::space_p) >>
-                         spirit::ch_p('<') >> 
-                         +(~spirit::ch_p('>')) >>
-                         spirit::ch_p('>') >>
-                         *(spirit::space_p)
-                       )
-                     |
-                      ( // parse an absolute location name
-                       (spirit::uint_p[spirit::assign_a(start)] >> 
-                        +spirit::space_p >>
-                        spirit::uint_p[spirit::assign_a(end)] >> 
-                        +spirit::space_p >>
-                        ( 
-                           spirit::alpha_p >> 
-                           *spirit::graph_p
-                        )[spirit::assign_a(name)] >> 
-                        // optional type
-                        !(
-                            +spirit::space_p >>
-                            (
-                              spirit::alpha_p >>
-                              *spirit::graph_p
-                            )[spirit::assign_a(type)]
-                        )
-                        // to understand how this group gets set
-                        // read the comment above struct push_back_annot
-                       )[push_back_annot(annots, start, end, type, name)]
-                     |
-                      ((spirit::ch_p('>')|spirit::str_p("&gt;")) >> 
-                         (*(spirit::print_p))[spirit::assign_a(name)] >>
-                         spirit::eol_p >> 
-                         (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
-                       )[push_back_seq(query_seqs, name, seq)]
-                      ) >>
-                      *spirit::space_p
+  int parsed=1;
+
+  bool ok = spirit::parse(data.begin(), data.end(),
+              (
+               //begin grammar
+                 !(
+                    (
+                      spirit::alpha_p >> 
+                      +(spirit::graph_p)
+                    )[spirit::assign_a(species)] >> 
+                    +(spirit::space_p)
+                  ) >>
+                  *(
+                     ( // ignore html tags
+                       *(spirit::space_p) >>
+                       spirit::ch_p('<') >> 
+                       +(~spirit::ch_p('>')) >>
+                       spirit::ch_p('>') >>
+                       *(spirit::space_p)
                      )
-                //end grammar
-                )).full;
-                
+                   |
+                    ( // parse an absolute location name
+                     (spirit::uint_p[spirit::assign_a(start)] >> 
+                      +spirit::space_p >>
+                      spirit::uint_p[spirit::assign_a(end)] >> 
+                      +spirit::space_p >>
+                      ( 
+                         spirit::alpha_p >> 
+                         *spirit::graph_p
+                      )[spirit::assign_a(name)] >> 
+                      // optional type
+                      !(
+                          +spirit::space_p >>
+                          (
+                            spirit::alpha_p >>
+                            *spirit::graph_p
+                          )[spirit::assign_a(type)]
+                      )
+                      // to understand how this group gets set
+                      // read the comment above struct push_back_annot
+                     )[push_back_annot(parsed_annots, start, end, type, name, parsed)]
+                   |
+                    ((spirit::ch_p('>')|spirit::str_p("&gt;")) >> 
+                       (*(spirit::print_p))[spirit::assign_a(name)] >>
+                       spirit::eol_p >> 
+                       (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
+                     )[push_back_seq(query_seqs, name, seq, parsed)]
+                    ) >>
+                    *spirit::space_p
+                   )
+              //end grammar
+              )).full;
+  if (not ok) {
+    std::stringstream msg;
+    msg << "Error parsing annotation #" << parsed;
+    throw annotation_load_error(msg.str());
+  }
+  // add newly parsed annotations to our sequence
+  std::copy(parsed_annots.begin(), parsed_annots.end(), std::back_inserter(annots));
   // go seearch for query sequences 
   find_sequences(query_seqs.begin(), query_seqs.end());
-  return status;
 }
 
 void Sequence::add_annotation(const annot& a)
index dc9d594a1f572753f73ced6a5de056db5b6c0209..20bf01872bfd44d2b8129091d30de882cb6bf457 100644 (file)
@@ -218,7 +218,10 @@ public:
   //! load sequence annotations
   //! \throws mussa_load_error 
   void load_annot(std::fstream& data_stream, int start_index, int end_index);
-  bool parse_annot(std::string data, int start_index=0, int end_index=0);
+  //! parse annotation file
+  /*! \throws annotation_load_error 
+   */
+  void parse_annot(std::string data, int start_index=0, int end_index=0);
   //! add an annotation to our list of annotations
   void add_annotation(const annot& a);
   const std::list<annot>& annotations() const;
index dd68250476d906dc1edaf8750d1e59aa6e315359..74f3c20b2a543f1f0d14b9a92aad49da3aa30b1b 100644 (file)
@@ -180,8 +180,7 @@ BOOST_AUTO_TEST_CASE( mussa_load_analysis )
 BOOST_AUTO_TEST_CASE( mussa_load_motif )
 {
   string data = "AAGG 1.0 1.0 0.0\n"
-                "GGTT 0.0 0.1 1.0\n"
-                "ZXY 2 1.9 0\n";
+                "GGTT 0.0 0.1 1.0\n";
 
   istringstream test_istream(data);
 
@@ -190,6 +189,7 @@ BOOST_AUTO_TEST_CASE( mussa_load_motif )
   m1.append_sequence("GGGCCCCTTCCAATT");
   m1.load_motifs(test_istream);
 
+  BOOST_CHECK_EQUAL( m1.motifs().size(), 2);
   for (Mussa::vector_sequence_type::const_iterator seq_i = m1.sequences().begin();
        seq_i != m1.sequences().end();
        ++seq_i)
@@ -198,6 +198,22 @@ BOOST_AUTO_TEST_CASE( mussa_load_motif )
   }
 }
 
+BOOST_AUTO_TEST_CASE( mussa_load_broken_motif )
+{
+  string data = "AAGG 1.0 1.0 0.0\n"
+                "GGTT 0.0 0.1 1.0 1.0\n"
+                "ZZCTA 0.1 0.0 1.0\n";
+
+  istringstream test_istream(data);
+
+  Mussa m1;
+  m1.append_sequence("AAAAGGGGTTTT");
+  m1.append_sequence("GGGCCCCTTCCAATT");
+  BOOST_CHECK_THROW(m1.load_motifs(test_istream), motif_load_error);
+
+  BOOST_CHECK_EQUAL( m1.motifs().size(), 0);
+}
+
 BOOST_AUTO_TEST_CASE( mussa_named_motif )
 {
   string data = "CCAATT cat 0.1 0.2 0.3\n";
index d4b282132884855b7d2b74831c2ff35d27fa0147..f9634ce51f9958535f4b02264552c21c4681e33d 100644 (file)
@@ -325,6 +325,22 @@ BOOST_AUTO_TEST_CASE( annotation_load )
   //BOOST_CHECK_EQUAL( annots
 }
 
+BOOST_AUTO_TEST_CASE( annotation_broken_load )
+{
+  string annot_data = "human\n"
+                      "0 10 name   type\n"
+                      "blah60 50 backward\n"
+                      ">ident3 asdf\n"
+                      "GCT\n"
+                      "gCTn\n"
+                      ;
+  string s(100, 'A');
+  s += "GCTGCTAATT";
+  Sequence seq(s, Sequence::reduced_dna_alphabet);
+                     
+  BOOST_CHECK_THROW(seq.parse_annot(annot_data, 0, 0), annotation_load_error);
+  BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
+  }
 
 BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
 {
index 065347a6b52ec8eb98eb48b5eb6b981d909a8a5e..a78dd4a51be50067bfee1fda485404dd2d338ffe 100644 (file)
@@ -62,6 +62,15 @@ public:
   explicit sequence_invalid_load_error(const std::string& msg) :
     sequence_load_error(msg) {};
 };
+
+//! Error loading sequence annotation
+class annotation_load_error : public sequence_load_error
+{
+public:
+  explicit annotation_load_error(const std::string& msg) :
+    sequence_load_error(msg) {};
+};
+
 //! failure running analysis
 class mussa_analysis_error : public mussa_error
 {
@@ -70,12 +79,23 @@ public:
     mussa_error(msg) {};
 };
 
+//! couldn't normalize a motif
+/*
 class motif_normalize_error : public mussa_error
 {
 public:
   explicit motif_normalize_error(const std::string& msg) : 
     mussa_error(msg) {};
 };
+*/
+
+//! something went wrong loading a motif  
+class motif_load_error : public mussa_load_error
+{
+public:
+  explicit motif_load_error(const std::string& msg) :
+    mussa_load_error(msg) {};
+};
 
 //! ConservedPath::nextTo had two paths that weren't the same size
 class conserved_path_size_mismatch : public mussa_error