Let the annotation parser skip html tags
authorDiane Trout <diane@caltech.edu>
Thu, 10 Aug 2006 22:38:17 +0000 (22:38 +0000)
committerDiane Trout <diane@caltech.edu>
Thu, 10 Aug 2006 22:38:17 +0000 (22:38 +0000)
Since we're telling people to download annotation pages from websites,
to cut down on support requests I extended the parser to skip <html></html>
tags (basically anything of the form "space* <[^>]+> space*").

alg/sequence.cpp
alg/sequence.hpp
alg/test/test_sequence.cpp

index e2961c2c7bf60de6d7339070bd4ab9990365c4eb..bbd6292c75f90dd81977887b301485eabf2a712b 100644 (file)
@@ -380,7 +380,7 @@ struct push_back_seq {
   };
 };
 
-void
+bool
 Sequence::parse_annot(std::string data, int start_index, int end_index)
 {
   int start=0;
@@ -401,6 +401,14 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                       +(spirit::space_p)
                     ) >>
                     *(
+                       ( // ignore html tags
+                         *(spirit::space_p) >>
+                         spirit::ch_p('<') >> 
+                         +(~spirit::ch_p('>')) >>
+                         spirit::ch_p('>') >>
+                         *(spirit::space_p)
+                       )
+                     |
                       ( // parse an absolute location name
                        (spirit::uint_p[spirit::assign_a(start)] >> 
                         +spirit::space_p >>
@@ -422,7 +430,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                         // read the comment above struct push_back_annot
                        )[push_back_annot(annots, start, end, type, name)]
                      |
-                      (spirit::ch_p('>') >> 
+                      ((spirit::ch_p('>')|spirit::str_p("&gt;")) >> 
                          (*(spirit::print_p))[spirit::assign_a(name)] >>
                          spirit::eol_p >> 
                          (+(spirit::chset<>(iupac_alphabet)))[spirit::assign_a(seq)]
@@ -436,6 +444,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                 
   // go seearch for query sequences 
   find_sequences(query_seqs.begin(), query_seqs.end());
+  return status;
 }
 
 void Sequence::add_annotation(const annot& a)
index 9cb3c85fa433839ced63995fd4168f5d0cb24f14..0a80506d02b914b4898a2582b14b981d49bf97a5 100644 (file)
@@ -139,7 +139,7 @@ class Sequence : public std::string
     //! load sequence annotations
     //! \throws mussa_load_error 
     void load_annot(std::fstream& data_stream, int start_index, int end_index);
-    void parse_annot(std::string data, int start_index, int end_index);
+    bool parse_annot(std::string data, int start_index=0, int end_index=0);
     //! add an annotation to our list of annotations
     void add_annotation(const annot& a);
     const std::list<annot>& annotations() const;
index f7ea70d6a6b1f1fe8ce2d2d4662a75e88b248068..21f3682f610af8acf3eadf66e2646fdf0c51605f 100644 (file)
@@ -153,6 +153,34 @@ BOOST_AUTO_TEST_CASE( annotation_load )
   //BOOST_CHECK_EQUAL( annots
 }
 
+
+BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
+{
+  // this actually is basically what's returned by UCSC
+  // (well actually with some of the sequence and copies of fasta blocks
+  // removed to make the example shorter
+  string annot_data = "\n"
+    "<PRE>\n"
+    ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
+    "&gt;hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGTGGAGACGACCTGGACCCTAACTACGT\n"
+    "</PRE>\n"
+    "\n"
+    "</BODY>\n"
+    "</HTML>\n"
+    ;
+
+  string s = 
+    "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
+    "AAAAA"
+    "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
+  Sequence seq(s);
+  seq.parse_annot(annot_data);
+  std::list<annot> annots = seq.annotations();
+  BOOST_CHECK_EQUAL( annots.size(), 2);
+}
+
 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
 {
   string annot_data = "0 10 name   type\n"