From: Diane Trout <diane@caltech.edu>
Date: Thu, 10 Aug 2006 22:38:17 +0000 (+0000)
Subject: Let the annotation parser skip html tags
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=commitdiff_plain;h=2ec127a53d7abb8df8cb06236895e1258f5b9e92

Let the annotation parser skip html tags
Since we're telling people to download annotation pages from websites,
I extended the parser to skip <html></html> tags to cut down on support
requests (basically anything of the form "space* <[^>]+> space*").
---

diff --git a/alg/sequence.cpp b/alg/sequence.cpp
index e2961c2..bbd6292 100644
--- a/alg/sequence.cpp
+++ b/alg/sequence.cpp
@@ -380,7 +380,7 @@ struct push_back_seq {
   };
 };
 
-void
+bool
 Sequence::parse_annot(std::string data, int start_index, int end_index)
 {
   int start=0;
@@ -401,6 +401,14 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                       +(spirit::space_p)
                     ) >>
                     *(
+                       ( // ignore html tags
+                         *(spirit::space_p) >>
+                         spirit::ch_p('<') >> 
+                         +(~spirit::ch_p('>')) >>
+                         spirit::ch_p('>') >>
+                         *(spirit::space_p)
+                       )
+                     |
                       ( // parse an absolute location name
                        (spirit::uint_p[spirit::assign_a(start)] >> 
                         +spirit::space_p >>
@@ -422,7 +430,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                         // read the comment above struct push_back_annot
                        )[push_back_annot(annots, start, end, type, name)]
                      |
-                      (spirit::ch_p('>') >> 
+                      ((spirit::ch_p('>')|spirit::str_p("&gt;")) >> 
                          (*(spirit::print_p))[spirit::assign_a(name)] >>
                          spirit::eol_p >> 
                          (+(spirit::chset<>(iupac_alphabet)))[spirit::assign_a(seq)]
@@ -436,6 +444,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                 
   // go seearch for query sequences 
   find_sequences(query_seqs.begin(), query_seqs.end());
+  return status;
 }
 
 void Sequence::add_annotation(const annot& a)
diff --git a/alg/sequence.hpp b/alg/sequence.hpp
index 9cb3c85..0a80506 100644
--- a/alg/sequence.hpp
+++ b/alg/sequence.hpp
@@ -139,7 +139,7 @@ class Sequence : public std::string
     //! load sequence annotations
     //! \throws mussa_load_error 
     void load_annot(std::fstream& data_stream, int start_index, int end_index);
-    void parse_annot(std::string data, int start_index, int end_index);
+    bool parse_annot(std::string data, int start_index=0, int end_index=0);
     //! add an annotation to our list of annotations
     void add_annotation(const annot& a);
     const std::list<annot>& annotations() const;
diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp
index f7ea70d..21f3682 100644
--- a/alg/test/test_sequence.cpp
+++ b/alg/test/test_sequence.cpp
@@ -153,6 +153,34 @@ BOOST_AUTO_TEST_CASE( annotation_load )
   //BOOST_CHECK_EQUAL( annots
 }
 
+
+BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
+{
+  // This is essentially what UCSC returns: FASTA blocks wrapped in HTML tags
+  // (some of the sequence and the extra copies of the FASTA blocks were
+  // removed to keep the example short). Note the second header uses "&gt;".
+  string annot_data = "\n"
+    "<PRE>\n"
+    ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
+    "&gt;hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGTGGAGACGACCTGGACCCTAACTACGT\n"
+    "</PRE>\n"
+    "\n"
+    "</BODY>\n"
+    "</HTML>\n"
+    ;
+
+  string s = 
+    "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
+    "AAAAA"
+    "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
+  Sequence seq(s);
+  seq.parse_annot(annot_data); // start_index and end_index are left at their defaults
+  std::list<annot> annots = seq.annotations();
+  BOOST_CHECK_EQUAL( annots.size(), 2); // expect one annotation per FASTA block in annot_data
+}
+
 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
 {
   string annot_data = "0 10 name   type\n"