From: Diane Trout Date: Thu, 10 Aug 2006 22:38:17 +0000 (+0000) Subject: Let the annotation parser skip html tags X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=commitdiff_plain;h=2ec127a53d7abb8df8cb06236895e1258f5b9e92 Let the annotation parser skip html tags Since we're telling people to download annotation pages from websites, to cut down on support requests I extended the parser to skip HTML tags (basically anything of the form "space* <[^>]+> space*"). --- diff --git a/alg/sequence.cpp b/alg/sequence.cpp index e2961c2..bbd6292 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -380,7 +380,7 @@ struct push_back_seq { }; }; -void +bool Sequence::parse_annot(std::string data, int start_index, int end_index) { int start=0; @@ -401,6 +401,14 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) +(spirit::space_p) ) >> *( + ( // ignore html tags + *(spirit::space_p) >> + spirit::ch_p('<') >> + +(~spirit::ch_p('>')) >> + spirit::ch_p('>') >> + *(spirit::space_p) + ) + | ( // parse an absolute location name (spirit::uint_p[spirit::assign_a(start)] >> +spirit::space_p >> @@ -422,7 +430,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) // read the comment above struct push_back_annot )[push_back_annot(annots, start, end, type, name)] | - (spirit::ch_p('>') >> + ((spirit::ch_p('>')|spirit::str_p(">")) >> (*(spirit::print_p))[spirit::assign_a(name)] >> spirit::eol_p >> (+(spirit::chset<>(iupac_alphabet)))[spirit::assign_a(seq)] @@ -436,6 +444,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) // go seearch for query sequences find_sequences(query_seqs.begin(), query_seqs.end()); + return status; } void Sequence::add_annotation(const annot& a) diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 9cb3c85..0a80506 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -139,7 +139,7 @@ class Sequence : public std::string //! load sequence annotations //! 
\throws mussa_load_error void load_annot(std::fstream& data_stream, int start_index, int end_index); - void parse_annot(std::string data, int start_index, int end_index); + bool parse_annot(std::string data, int start_index=0, int end_index=0); //! add an annotation to our list of annotations void add_annotation(const annot& a); const std::list& annotations() const; diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index f7ea70d..21f3682 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -153,6 +153,34 @@ BOOST_AUTO_TEST_CASE( annotation_load ) //BOOST_CHECK_EQUAL( annots } + +BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load) +{ + // this actually is basically what's returned by UCSC + // (well actually with some of the sequence and copies of fasta blocks + // removed to make the example shorter + string annot_data = "\n" + "
\n"
+    ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
+    ">hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGTGGAGACGACCTGGACCCTAACTACGT\n"
+    "
\n" + "\n" + "\n" + "\n" + ; + + string s = + "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG" + "AAAAA" + "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC"; + Sequence seq(s); + seq.parse_annot(annot_data); + std::list annots = seq.annotations(); + BOOST_CHECK_EQUAL( annots.size(), 2); +} + BOOST_AUTO_TEST_CASE( annotation_load_no_species_name ) { string annot_data = "0 10 name type\n"