Catch annotation sequences that don't end with newline
[mussa.git] / alg / test / test_sequence.cpp
index 9faa8d069c156376ce945202527664b9955d47e6..2ea2fc0c5df2c9a9484548f2169d9a2d8a366212 100644 (file)
@@ -833,6 +833,48 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence )
   }
 }
 
+BOOST_AUTO_TEST_CASE( sequence_no_trailing_newline )
+{
+  // sorry about the long string...
+  string s = "AATTACACAAGGAATATAGGTAGTTTGAATAAAAATATCTTTAACAGCTTGGAGCTATTGAGACAGGAACACTTCCACGCACATGCACAGTTAAACAACTTGAGTGCAACACACAACATTGGCACTAAACGAGATTGAAGGGGGACTTTTTGTGTGTTTTTTTTTCTCTTTTCTTTTTTTGTTATAGTTACTTCAAGTAACACAGCTTGCTTCATATAAATAAGTTAAAACATCTATTTTTTTTCAAGACAAAGCCATTCAGGACAAAGAGATGAACAGAAAGCAGATCTACTTATACAGGCGCTATAATGGCAATAAACAGGCTCATGATTAAAAGATGAATTAGGGCAACGAGAACAGGGCTTCTTCACAGAAGGAACACAAGGGAGTTTCAGAAAGTCACCTTAGTACTGACACTACGCGGGATCCGCTAATACTGCTCAGTACTTTAAACGCTCAGATACTCAGGGACGGAAGGCCCCTCCTGCCGCGGCCATGCTCATGCTTTTCAGCTTATTATCTTTTTTCCACTTCATTCTCCGGTTTTGGAACCAGATTTTAATTTGTCTCTCGGAGAGGCAAAGAGCATGTGCTATTTCAATCCTCCTTCTGCGGGTCAGGTAACGGTTGAAGTGGAACTCCTTCTCCAGCTCCAGGGTCTGGTAGCGCGTGTAGGCCGTCCGGGCCCTTTTGCCTTCCGGGCCGCCTATGTTGTCTGCAATAGAAAAGTCAGCGGTTTAGCCACCAACTCCTGTCTTCCAAAGTCCGCCAGGGGGACAAGCTTGGGTCATGAGCAGGGAACCCAGGCGAAAAGCTCAACAAGTTCTGCCTACCAGCCCGCACACCCCTCCCGAATTTCCTTCTCTCTTCCTTTCTAGAAAGAAAACAATACGATTTGGACCCTGGGAACAATCTGCCCATCTGAGGCTGGGGCCGTGTCCCGGCGGACTCCGGCTTTCCCTGGCCCCTCTCCTGCCCCCTCCGCCCTGCCCCGGGCGCCCCGATCGGGAGGCACAGCCCTCCCAGGCTGCCCACCGCACAGAAACCCAGGAAGCAAGGCCCTTTCCTGAGCGCCCAAGTGGCCTTCGGGTCACCCTCCCTCAAAGTTCCAGCCCCGAGAGCCGCCTCCCGTTTCCAGCCTGCAGGGTTGGGGAGCCTGTTTTCTTTTTCTTCCCTTTCCTTCTCTCTCCCTCCTGCCCCCAAAATTCAGAATCCTGCAGGCTCTCGCCTCGATTCTTTCCCCCAAGCCCCTTTTCGGGGGCTGTAATTAGTAACGCTGTTTCCCCAGCGTAGCCCTCCTCATAAATTATCCGCCGTGACAAGCCCGATTCACGGCTGCTACAGCCATCCTCTACCTCTCTGCGCCTTGCTCGGCTGGCCTGACCCGGGAGCGCGTCCCAAGGCGTGGGGTTCCAGAGGGGTTTTTTGCTTCCTCCCCCTTCCAACGTCTAAACTGTCCCAGAGAACGCCCATTTCCCCCACTATTTGTGAGCGCAGGGTGCTCGCAAAGAAGAGGAGGAAGGAGGAAGGCAGGGGAGGGAGAACGGCAAGGAGAGCTCCGCAGGGCTGGGAGAAATGAGACCAAGAGAGACTGGGAGAGGGCGGCAGAGAAGAGAGGGGGGACCGAGAGCCGCGTCCCCGCGGTCGCGTGGATTTAGAAAAAGGCTGGCTTTACCATGACTTATGTGCAGCTTGCGCATCCAGGGGTAGATCTGGGGTTGGGCGGGCGGCGCCGGGCTCGGCTCGCTCTGCGCACTCGCCTGCTCGCTGCTGGCAGGGGCGTCCTCCTCGGCTCCGGACGCCGTGCCAACCCCCTCTCTGCTGCTGATGTGGGTGCTGCCGGCGTCGGCCGAGGCGCCGCTGGAGTTGCTTAGGGAGTTTTTCCCGCCGTGGTGGCTGTCGCTGCCGGGCGAGGGGGCCACGGCGGAGCAGGGCAGCGGATCGGGCTGAGGAGAGTGCGTGGACGTGGCCGGCTGGCTGTACCTGGGCTCGGCGGGCGCCGCGCTGGCGCTGGCAGCGTAGCTGCGGGCGCGCTCTCCGGAGCCAAAGTGGCCGGAGCCCGAGCGGCCGACGCTGAGATCCATGCCATTGTAGCCGTAGCCGTACCTGCCGGAGTGCATGCTCGCCGAGTCCCTGAATTGCTCGCTCACGGAACTATGATCTCCATAATTATGCAACTGGTAGTCCGGGCCATTTGGATAGCGACCGCAAAATGAGTTTACAAAATAAGAGCTCATTTGTTTTTTGATATGTGTGCTTGATTTGTGGCTCGCGGTCGTTTGTGCGTCTATAGCACCCTT";
+  std::string species = "HumanHXA5\n";
+  std::string header0 = ">hg18_knownGene_NM_019102_0\n";
+  std::string str0 = "GGGTGCTATAGACGCACAAACGACCGCGAGCCACAAATCAAGCACACATATCAAAAAACAAATGAGCTCTTATTTTGTAAACTCATTTTGCGGTCGCTATCCAAATGGCCCGGACTACCAGTTGCATAATTATGGAGATCATAGTTCCGTGAGCGAGCAATTCAGGGACTCGGCGAGCATGCACTCCGGCAGGTACGGCTACGGCTACAATGGCATGGATCTCAGCGTCGGCCGCTCGGGCTCCGGCCACTTTGGCTCCGGAGAGCGCGCCCGCAGCTACGCTGCCAGCGCCAGCGCGGCGCCCGCCGAGCCCAGGTACAGCCAGCCGGCCACGTCCACGCACTCTCCTCAGCCCGATCCGCTGCCCTGCTCCGCCGTGGCCCCCTCGCCCGGCAGCGACAGCCACCACGGCGGGAAAAACTCCCTAAGCAACTCCAGCGGCGCCTCGGCCGACGCCGGCAGCACCCACATCAGCAGCAGAGAGGGGGTTGGCACGGCGTCCGGAGCCGAGGAGGACGCCCCTGCCAGCAGCGAGCAGGCGAGTGCGCAGAGCGAGCCGAGCCCGGCGCCGCCCGCCCAACCCCAGATCTACCCCTGGATGCGCAAGCTGCACATAAGTCATG";
+  std::string header1 = ">hg18_knownGene_NM_019102_1\n";
+  std::string str1 = "ACAACATAGGCGGCCCGGAAGGCAAAAGGGCCCGGACGGCCTACACGCGCTACCAGACCCTGGAGCTGGAGAAGGAGTTCCACTTCAACCGTTACCTGACCCGCAGAAGGAGGATTGAAATAGCACATGCTCTTTGCCTCTCCGAGAGACAAATTAAAATCTGGTTCCAAAACCGGAGAATGAAGTGGAAAAAAGATAATAAGCTGAAAAGCATGAGCATGGCCGCGGCAGGAGGGGCCTTCCGTCCCTGAGTATCTGAGCGTTTAAAGTACTGAGCAGTATTAGCGGATCCCGCGTAGTGTCAGTACTAAGGTGACTTTCTGAAACTCCCTTGTGTTCCTTCTGTGAAGAAGCCCTGTTCTCGTTGCCCTAATTCATCTTTTAATCATGAGCCTGTTTATTGCCATTATAGCGCCTGTATAAGTAGATCTGCTTTCTGTTCATCTCTTTGTCCTGAATGGCTTTGTCTTGAAAAAAAATAGATGTTTTAACTTATTTATATGAAGCAAGCTGTGTTACTTGAAGTAACTATAACAAAAAAAGAAAAGAGAAAAAAAAACACACAAAAAGTCCCCCTTCAATCTCGTTTAGTGCCAATGTTGTGTGTTGCACTCAAGTTGTTTAACTGTGCATGTGCGTGGAAGTGTTCCTGTCTCAATAGCTCCAAGCTGTTAAAGATATTTTTATTCAAACTACCTATATTCCTTGT";
+  stringstream annot;
+  annot << species 
+        << header0 
+        << str0 << std::endl 
+        << std::endl 
+        << header1 
+        << str1;
+  // need to convert strings to sequences for reverse complementing
+  Sequence seq0(str0, reduced_dna_alphabet);
+  Sequence seq1(str1, reduced_dna_alphabet);
+
+  Sequence annotated_seq(s, reduced_dna_alphabet);
+  annotated_seq.load_annot(annot, 0, 0);
+
+  SeqSpanRefList annots_list = annotated_seq.annotations();
+  // both sequences were found
+  BOOST_REQUIRE_EQUAL( annots_list.size(),  2 );
+
+  std::vector<SeqSpanRef> annots(annots_list.begin(), annots_list.end());
+  // are they the same sequence?
+  BOOST_CHECK_EQUAL( annots[0]->size(),  seq0.size());
+  BOOST_CHECK_EQUAL( annots[0]->sequence(), seq0.rev_comp() );
+  // this should hopefully catch the case when my hack in 
+  // sequence.cpp::push_back_seq::operator() is no longer needed.
+  // spirit (or my grammar was duplicating the last char, 
+  // the hack removes the duplicate. but if what ever's causing
+  // the dup gets fixed actual meaningful data will be being removed.
+  // see mussa ticket:265 for more information
+  BOOST_CHECK_EQUAL( annots[1]->size(),  seq1.size());
+  BOOST_CHECK_EQUAL( annots[1]->sequence(), seq1.rev_comp() );
+
+}
+
 BOOST_AUTO_TEST_CASE( subseq_annotation_test )
 {
   string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"