Let the annotation parser skip html tags
[mussa.git] / alg / test / test_sequence.cpp
index f7ea70d6a6b1f1fe8ce2d2d4662a75e88b248068..21f3682f610af8acf3eadf66e2646fdf0c51605f 100644 (file)
@@ -153,6 +153,34 @@ BOOST_AUTO_TEST_CASE( annotation_load )
   //BOOST_CHECK_EQUAL( annots
 }
 
+
+BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
+{
+  // Feed parse_annot() FASTA-style annotation text wrapped in HTML residue
+  // (<PRE>, </BODY>, </HTML>, and a "&gt;" header) and expect two annotations.
+  // Modeled on UCSC output; sequence and extra FASTA blocks trimmed for brevity.
+  string annot_data = "\n"
+    "<PRE>\n"
+    ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
+    "&gt;hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
+    "GGTGGAGACGACCTGGACCCTAACTACGT\n"
+    "</PRE>\n"
+    "\n"
+    "</BODY>\n"
+    "</HTML>\n"
+    ;
+
+  string s = 
+    "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
+    "AAAAA"  // filler between the two stretches that contain the FASTA block sequences
+    "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
+  Sequence seq(s);
+  seq.parse_annot(annot_data);
+  std::list<annot> annots = seq.annotations();
+  BOOST_CHECK_EQUAL( annots.size(), 2);
+}
+
 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
 {
   string annot_data = "0 10 name   type\n"