X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=blobdiff_plain;f=alg%2Ftest%2Ftest_sequence.cpp;h=aefa6ac6c7aa6ec2b68285d674622d89d82ee127;hp=921c5ef7aa2437a177b6f8094279ccca50d71441;hb=97498410e1fc5c39eac0282a6620b8fcb0f02ff3;hpb=8b38b5bc63e5c62983d0814aa75d3f88b9116e49 diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 921c5ef..aefa6ac 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -1,5 +1,7 @@ -#define BOOST_AUTO_TEST_MAIN -#include +#define BOOST_TEST_DYN_LINK +#define BOOST_TEST_MODULE +#include + #include #include namespace fs=boost::filesystem; @@ -463,16 +465,16 @@ BOOST_AUTO_TEST_CASE( sequence_empty_reverse_iterator) BOOST_AUTO_TEST_CASE( annotation_load ) { string annot_data = "human\n" - "0 10 name type\n" - "10 20 myf7\n" - "20 30 myod\n" - "50\t55 anothername\n" - "60 50 backward\n" - ">ident3 asdf\n" + "0 10 name type\n" //0 + "10 20 myf7\n" //1 + "20 30 myod\n" //2 + "50\t55 anothername\n" //3 + "60 50 backward\n" //4 + ">ident3 asdf\n" //7 (as these are added last) "GCT\n" "gCTn\n" - "75\t90\tname2\ttype2\n" - "100 120 name-asdf type!@#$%\n" + "75\t90\tname2\ttype2\n" //5 + "100 120 name-asdf type!@#$%\n" //6 ; string s(100, 'A'); s += "GCTGCTAATT"; @@ -480,27 +482,37 @@ BOOST_AUTO_TEST_CASE( annotation_load ) //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); - std::list annots_list = seq.annotations(); - std::vector annots(annots_list.begin(), annots_list.end()); + SeqSpanRefList annots_list(seq.annotations()); + std::vector annots(annots_list.begin(), annots_list.end()); BOOST_REQUIRE_EQUAL( annots.size(), 8); - BOOST_CHECK_EQUAL( annots[0].begin, 0 ); - BOOST_CHECK_EQUAL( annots[0].end, 10 ); - BOOST_CHECK_EQUAL( annots[0].type, "type"); - BOOST_CHECK_EQUAL( annots[0].name, "name"); - BOOST_CHECK_EQUAL( annots[1].name, "myf7"); - BOOST_CHECK_EQUAL( annots[2].name, "myod"); - BOOST_CHECK_EQUAL( annots[3].name, "anothername"); - BOOST_CHECK_EQUAL( annots[4].name, "backward"); - BOOST_CHECK_EQUAL( annots[5].name, "name2"); - BOOST_CHECK_EQUAL( annots[5].end, 90); - BOOST_CHECK_EQUAL( annots[6].begin, 100); - BOOST_CHECK_EQUAL( annots[6].end, 120); - BOOST_CHECK_EQUAL( annots[6].name, "name-asdf"); - BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%"); + BOOST_CHECK_EQUAL( annots[0]->start(), 0 ); + BOOST_CHECK_EQUAL( annots[0]->stop(), 10 ); + BOOST_REQUIRE( annots[0]->annotations() ); + BOOST_CHECK_EQUAL( annots[0]->annotations()->get("type"), "type"); + BOOST_CHECK_EQUAL( annots[0]->annotations()->name(), "name"); + BOOST_REQUIRE( annots[1]->annotations() ); + BOOST_CHECK_EQUAL( annots[1]->annotations()->name(), "myf7"); + BOOST_REQUIRE( annots[2]->annotations() ); + BOOST_CHECK_EQUAL( annots[2]->annotations()->name(), "myod"); + BOOST_REQUIRE( annots[3]->annotations() ); + BOOST_CHECK_EQUAL( annots[3]->annotations()->name(), "anothername"); + BOOST_REQUIRE( annots[4]->annotations() ); + BOOST_CHECK_EQUAL( annots[4]->annotations()->name(), "backward"); + BOOST_REQUIRE( annots[5]->annotations() ); + BOOST_CHECK_EQUAL( annots[5]->annotations()->name(), "name2"); + BOOST_CHECK_EQUAL( annots[5]->start(), 75); + BOOST_CHECK_EQUAL( annots[5]->stop(), 90); + BOOST_CHECK_EQUAL( annots[6]->start(), 100); + BOOST_CHECK_EQUAL( annots[6]->stop(), 110); + BOOST_REQUIRE( annots[6]->annotations() ); + BOOST_CHECK_EQUAL( annots[6]->annotations()->name(), "name-asdf"); + BOOST_CHECK_EQUAL( annots[6]->annotations()->get("type"), "type!@#$%"); // sequence defined annotations will always be after the // absolute positions - BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf"); - BOOST_CHECK_EQUAL( annots[7].begin, 100); + BOOST_REQUIRE( annots[7]->annotations() ); + BOOST_CHECK_EQUAL( annots[7]->annotations()->name(), "ident3 asdf"); + BOOST_CHECK_EQUAL( annots[7]->start(), 100); + BOOST_CHECK_EQUAL( annots[7]->stop(), 107); //BOOST_CHECK_EQUAL( annots } @@ -545,7 +557,7 @@ BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load) "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC"; Sequence seq(s, reduced_dna_alphabet); seq.parse_annot(annot_data); - std::list annots = seq.annotations(); + SeqSpanRefList annots(seq.annotations()); BOOST_CHECK_EQUAL( annots.size(), 2); } @@ -568,12 +580,28 @@ BOOST_AUTO_TEST_CASE( annotation_load_no_species_name ) //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); - std::list annots_list = seq.annotations(); - std::vector annots(annots_list.begin(), annots_list.end()); + SeqSpanRefList annots_list(seq.annotations()); + std::vector annots(annots_list.begin(), annots_list.end()); BOOST_REQUIRE_EQUAL( annots.size(), 8); - BOOST_CHECK_EQUAL( annots[0].begin, 0 ); - BOOST_CHECK_EQUAL( annots[0].end, 10 ); - BOOST_CHECK_EQUAL( annots[0].type, "type"); + BOOST_CHECK_EQUAL( annots[0]->start(), 0 ); + BOOST_CHECK_EQUAL( annots[0]->stop(), 10 ); + BOOST_CHECK_EQUAL( annots[0]->annotations()->get("type"), "type"); +} + +// when we do a subsequence (or something that calls copy_children) +// the annotations need to be updated to have the right parent +BOOST_AUTO_TEST_CASE( update_annotations_seqref ) +{ + Sequence s1("AAAAGGGG"); + s1.add_annotation("A", "A", 0, 4); + BOOST_CHECK_EQUAL(s1.annotations().size(), 1); + BOOST_CHECK_EQUAL(s1.seqspan(), s1.annotations().front()->parent() ); + + Sequence subseq1(s1.subseq(2,4)); + BOOST_CHECK_EQUAL(subseq1.annotations().size(), 1); + BOOST_CHECK_EQUAL(subseq1.annotations().front()->parentStart(), 0 ); + BOOST_CHECK_EQUAL(subseq1.annotations().front()->parentStop(), 2 ); + BOOST_CHECK_EQUAL(subseq1.seqspan(), subseq1.annotations().front()->parent() ); } // ticket:83 when you try to load a sequence from a file that doesn't @@ -742,12 +770,14 @@ BOOST_AUTO_TEST_CASE( sequence_motif_subseq) BOOST_AUTO_TEST_CASE( annot_test ) { - annot a(0, 10, "test", "thing"); - - BOOST_CHECK_EQUAL( a.begin, 0 ); - BOOST_CHECK_EQUAL( a.end, 10 ); - BOOST_CHECK_EQUAL( a.type, "test" ); - BOOST_CHECK_EQUAL( a.name, "thing" ); + Sequence s("AAAAAAAAAA"); + s.add_annotation("test", "thing", 0, 10); + SeqSpanRef a(s.annotations().front()); + + BOOST_CHECK_EQUAL( a->start(), 0 ); + BOOST_CHECK_EQUAL( a->stop(), 10 ); + BOOST_CHECK_EQUAL( a->annotations()->get("name"), "test" ); + BOOST_CHECK_EQUAL( a->annotations()->get("type"), "thing" ); motif m(10, "AAGGCC"); BOOST_CHECK_EQUAL( m.begin, 10 ); @@ -795,15 +825,58 @@ BOOST_AUTO_TEST_CASE( annotate_from_sequence ) } } BOOST_CHECK_EQUAL(seq.annotations().size(), count); - const std::list &a = seq.annotations(); - for (std::list::const_iterator annot_i = a.begin(); + const SeqSpanRefList& a = seq.annotations(); + for (SeqSpanRefList::const_iterator annot_i = a.begin(); annot_i != a.end(); ++annot_i) { - int count = annot_i->end - annot_i->begin ; + //FIXME: was I doing something here? + int count = (*annot_i)->stop() - (*annot_i)->start(); } } +BOOST_AUTO_TEST_CASE( sequence_no_trailing_newline ) +{ + // sorry about the long string... + string s = "AATTACACAAGGAATATAGGTAGTTTGAATAAAAATATCTTTAACAGCTTGGAGCTATTGAGACAGGAACACTTCCACGCACATGCACAGTTAAACAACTTGAGTGCAACACACAACATTGGCACTAAACGAGATTGAAGGGGGACTTTTTGTGTGTTTTTTTTTCTCTTTTCTTTTTTTGTTATAGTTACTTCAAGTAACACAGCTTGCTTCATATAAATAAGTTAAAACATCTATTTTTTTTCAAGACAAAGCCATTCAGGACAAAGAGATGAACAGAAAGCAGATCTACTTATACAGGCGCTATAATGGCAATAAACAGGCTCATGATTAAAAGATGAATTAGGGCAACGAGAACAGGGCTTCTTCACAGAAGGAACACAAGGGAGTTTCAGAAAGTCACCTTAGTACTGACACTACGCGGGATCCGCTAATACTGCTCAGTACTTTAAACGCTCAGATACTCAGGGACGGAAGGCCCCTCCTGCCGCGGCCATGCTCATGCTTTTCAGCTTATTATCTTTTTTCCACTTCATTCTCCGGTTTTGGAACCAGATTTTAATTTGTCTCTCGGAGAGGCAAAGAGCATGTGCTATTTCAATCCTCCTTCTGCGGGTCAGGTAACGGTTGAAGTGGAACTCCTTCTCCAGCTCCAGGGTCTGGTAGCGCGTGTAGGCCGTCCGGGCCCTTTTGCCTTCCGGGCCGCCTATGTTGTCTGCAATAGAAAAGTCAGCGGTTTAGCCACCAACTCCTGTCTTCCAAAGTCCGCCAGGGGGACAAGCTTGGGTCATGAGCAGGGAACCCAGGCGAAAAGCTCAACAAGTTCTGCCTACCAGCCCGCACACCCCTCCCGAATTTCCTTCTCTCTTCCTTTCTAGAAAGAAAACAATACGATTTGGACCCTGGGAACAATCTGCCCATCTGAGGCTGGGGCCGTGTCCCGGCGGACTCCGGCTTTCCCTGGCCCCTCTCCTGCCCCCTCCGCCCTGCCCCGGGCGCCCCGATCGGGAGGCACAGCCCTCCCAGGCTGCCCACCGCACAGAAACCCAGGAAGCAAGGCCCTTTCCTGAGCGCCCAAGTGGCCTTCGGGTCACCCTCCCTCAAAGTTCCAGCCCCGAGAGCCGCCTCCCGTTTCCAGCCTGCAGGGTTGGGGAGCCTGTTTTCTTTTTCTTCCCTTTCCTTCTCTCTCCCTCCTGCCCCCAAAATTCAGAATCCTGCAGGCTCTCGCCTCGATTCTTTCCCCCAAGCCCCTTTTCGGGGGCTGTAATTAGTAACGCTGTTTCCCCAGCGTAGCCCTCCTCATAAATTATCCGCCGTGACAAGCCCGATTCACGGCTGCTACAGCCATCCTCTACCTCTCTGCGCCTTGCTCGGCTGGCCTGACCCGGGAGCGCGTCCCAAGGCGTGGGGTTCCAGAGGGGTTTTTTGCTTCCTCCCCCTTCCAACGTCTAAACTGTCCCAGAGAACGCCCATTTCCCCCACTATTTGTGAGCGCAGGGTGCTCGCAAAGAAGAGGAGGAAGGAGGAAGGCAGGGGAGGGAGAACGGCAAGGAGAGCTCCGCAGGGCTGGGAGAAATGAGACCAAGAGAGACTGGGAGAGGGCGGCAGAGAAGAGAGGGGGGACCGAGAGCCGCGTCCCCGCGGTCGCGTGGATTTAGAAAAAGGCTGGCTTTACCATGACTTATGTGCAGCTTGCGCATCCAGGGGTAGATCTGGGGTTGGGCGGGCGGCGCCGGGCTCGGCTCGCTCTGCGCACTCGCCTGCTCGCTGCTGGCAGGGGCGTCCTCCTCGGCTCCGGACGCCGTGCCAACCCCCTCTCTGCTGCTGATGTGGGTGCTGCCGGCGTCGGCCGAGGCGCCGCTGGAGTTGCTTAGGGAGTTTTTCCCGCCGTGGTGGCTGTCGCTGCCGGGCGAGGGGGCCACGGCGGAGCAGGGCAGCGGATCGGGCTGAGGAGAGTGCGTGGACGTGGCCGGCTGGCTGTACCTGGGCTCGGCGGGCGCCGCGCTGGCGCTGGCAGCGTAGCTGCGGGCGCGCTCTCCGGAGCCAAAGTGGCCGGAGCCCGAGCGGCCGACGCTGAGATCCATGCCATTGTAGCCGTAGCCGTACCTGCCGGAGTGCATGCTCGCCGAGTCCCTGAATTGCTCGCTCACGGAACTATGATCTCCATAATTATGCAACTGGTAGTCCGGGCCATTTGGATAGCGACCGCAAAATGAGTTTACAAAATAAGAGCTCATTTGTTTTTTGATATGTGTGCTTGATTTGTGGCTCGCGGTCGTTTGTGCGTCTATAGCACCCTT"; + std::string species = "HumanHXA5\n"; + std::string header0 = ">hg18_knownGene_NM_019102_0\n"; + std::string str0 = "GGGTGCTATAGACGCACAAACGACCGCGAGCCACAAATCAAGCACACATATCAAAAAACAAATGAGCTCTTATTTTGTAAACTCATTTTGCGGTCGCTATCCAAATGGCCCGGACTACCAGTTGCATAATTATGGAGATCATAGTTCCGTGAGCGAGCAATTCAGGGACTCGGCGAGCATGCACTCCGGCAGGTACGGCTACGGCTACAATGGCATGGATCTCAGCGTCGGCCGCTCGGGCTCCGGCCACTTTGGCTCCGGAGAGCGCGCCCGCAGCTACGCTGCCAGCGCCAGCGCGGCGCCCGCCGAGCCCAGGTACAGCCAGCCGGCCACGTCCACGCACTCTCCTCAGCCCGATCCGCTGCCCTGCTCCGCCGTGGCCCCCTCGCCCGGCAGCGACAGCCACCACGGCGGGAAAAACTCCCTAAGCAACTCCAGCGGCGCCTCGGCCGACGCCGGCAGCACCCACATCAGCAGCAGAGAGGGGGTTGGCACGGCGTCCGGAGCCGAGGAGGACGCCCCTGCCAGCAGCGAGCAGGCGAGTGCGCAGAGCGAGCCGAGCCCGGCGCCGCCCGCCCAACCCCAGATCTACCCCTGGATGCGCAAGCTGCACATAAGTCATG"; + std::string header1 = ">hg18_knownGene_NM_019102_1\n"; + std::string str1 = "ACAACATAGGCGGCCCGGAAGGCAAAAGGGCCCGGACGGCCTACACGCGCTACCAGACCCTGGAGCTGGAGAAGGAGTTCCACTTCAACCGTTACCTGACCCGCAGAAGGAGGATTGAAATAGCACATGCTCTTTGCCTCTCCGAGAGACAAATTAAAATCTGGTTCCAAAACCGGAGAATGAAGTGGAAAAAAGATAATAAGCTGAAAAGCATGAGCATGGCCGCGGCAGGAGGGGCCTTCCGTCCCTGAGTATCTGAGCGTTTAAAGTACTGAGCAGTATTAGCGGATCCCGCGTAGTGTCAGTACTAAGGTGACTTTCTGAAACTCCCTTGTGTTCCTTCTGTGAAGAAGCCCTGTTCTCGTTGCCCTAATTCATCTTTTAATCATGAGCCTGTTTATTGCCATTATAGCGCCTGTATAAGTAGATCTGCTTTCTGTTCATCTCTTTGTCCTGAATGGCTTTGTCTTGAAAAAAAATAGATGTTTTAACTTATTTATATGAAGCAAGCTGTGTTACTTGAAGTAACTATAACAAAAAAAGAAAAGAGAAAAAAAAACACACAAAAAGTCCCCCTTCAATCTCGTTTAGTGCCAATGTTGTGTGTTGCACTCAAGTTGTTTAACTGTGCATGTGCGTGGAAGTGTTCCTGTCTCAATAGCTCCAAGCTGTTAAAGATATTTTTATTCAAACTACCTATATTCCTTGT"; + stringstream annot; + annot << species + << header0 + << str0 << std::endl + << std::endl + << header1 + << str1; + // need to convert strings to sequences for reverse complementing + Sequence seq0(str0, reduced_dna_alphabet); + Sequence seq1(str1, reduced_dna_alphabet); + + Sequence annotated_seq(s, reduced_dna_alphabet); + annotated_seq.load_annot(annot, 0, 0); + + SeqSpanRefList annots_list = annotated_seq.annotations(); + // both sequences were found + BOOST_REQUIRE_EQUAL( annots_list.size(), 2 ); + + std::vector annots(annots_list.begin(), annots_list.end()); + // are they the same sequence? + BOOST_CHECK_EQUAL( annots[0]->size(), seq0.size()); + BOOST_CHECK_EQUAL( annots[0]->sequence(), seq0.rev_comp() ); + // this should hopefully catch the case when my hack in + // sequence.cpp::push_back_seq::operator() is no longer needed. + // spirit (or my grammar was duplicating the last char, + // the hack removes the duplicate. but if what ever's causing + // the dup gets fixed actual meaningful data will be being removed. + // see mussa ticket:265 for more information + BOOST_CHECK_EQUAL( annots[1]->size(), seq1.size()); + BOOST_CHECK_EQUAL( annots[1]->sequence(), seq1.rev_comp() ); + +} + BOOST_AUTO_TEST_CASE( subseq_annotation_test ) { string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC" @@ -816,29 +889,36 @@ BOOST_AUTO_TEST_CASE( subseq_annotation_test ) "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT"); Sequence seq(s, reduced_dna_alphabet); - - seq.add_annotation(annot(0, 10, "0-10", "0-10")); - seq.add_annotation(annot(10, 20, "10-20", "10-20")); - seq.add_annotation(annot(0, 20, "0-20", "0-20")); - seq.add_annotation(annot(8, 12, "8-12", "8-12")); - seq.add_annotation(annot(100, 5000, "100-5000", "100-5000")); + seq.add_annotation("0-10", "0-10", 0, 10); + seq.add_annotation("10-20", "10-20", 10, 20); + seq.add_annotation("0-20", "0-20", 0, 20); + seq.add_annotation("8-12", "8-12", 8, 12); + seq.add_annotation("100-5000", "100-5000", 100, 5000); Sequence subseq = seq.subseq(5, 10); - const list annots = subseq.annotations(); - // generate some ground truth - list correct; - correct.push_back(annot(0, 5, "0-10", "0-10")); - correct.push_back(annot(5,10, "10-20", "10-20")); - correct.push_back(annot(0,10, "0-20", "0-20")); - correct.push_back(annot(3, 7, "8-12", "8-12")); - BOOST_REQUIRE_EQUAL( annots.size(), correct.size() ); - - list::iterator correct_i = correct.begin(); - list::const_iterator annot_i = annots.begin(); - for(; annot_i != annots.end(); ++annot_i, ++correct_i) - { - BOOST_CHECK( *annot_i == *correct_i ); - } + SeqSpanRefList annots_list = subseq.annotations(); + BOOST_REQUIRE_EQUAL( annots_list.size(), 4 ); + + std::vector annots(annots_list.begin(), annots_list.end()); + BOOST_CHECK_EQUAL( annots[0]->parentStart(), 0); + BOOST_CHECK_EQUAL( annots[0]->size(), 5); + BOOST_REQUIRE( annots[0]->annotations() ); + BOOST_CHECK_EQUAL( annots[0]->annotations()->name(), "0-10"); + + BOOST_CHECK_EQUAL( annots[1]->parentStart(), 5); + BOOST_CHECK_EQUAL( annots[1]->size(), 5); + BOOST_REQUIRE( annots[1]->annotations() ); + BOOST_CHECK_EQUAL( annots[1]->annotations()->name(), "10-20"); + + BOOST_CHECK_EQUAL( annots[2]->parentStart(), 0); + BOOST_CHECK_EQUAL( annots[2]->size(), 10); + BOOST_REQUIRE( annots[2]->annotations() ); + BOOST_CHECK_EQUAL( annots[2]->annotations()->name(), "0-20"); + + BOOST_CHECK_EQUAL( annots[3]->parentStart(), 3); + BOOST_CHECK_EQUAL( annots[3]->size(), 7); + BOOST_REQUIRE( annots[3]->annotations() ); + BOOST_CHECK_EQUAL( annots[3]->annotations()->name(), "8-12"); } BOOST_AUTO_TEST_CASE( motif_annotation_update ) @@ -856,9 +936,9 @@ BOOST_AUTO_TEST_CASE( motif_annotation_update ) // starting conditions BOOST_CHECK_EQUAL(seq.annotations().size(), 0); BOOST_CHECK_EQUAL(seq.motifs().size(), 0); - seq.add_annotation(annot(0, 10, "0-10", "0-10")); - seq.add_annotation(annot(10, 20, "10-20", "10-20")); - seq.add_annotation(annot(0, 20, "0-20", "0-20")); + seq.add_annotation("0-10", "0-10", 0, 10); + seq.add_annotation("10-20", "10-20", 10, 20); + seq.add_annotation("0-20", "0-20", 0, 20); BOOST_CHECK_EQUAL(seq.annotations().size(), 3); BOOST_CHECK_EQUAL(seq.motifs().size(), 0); seq.add_motif("CCGTCCC"); @@ -889,7 +969,7 @@ BOOST_AUTO_TEST_CASE( get_name ) seq.set_fasta_header("fasta human"); BOOST_CHECK_EQUAL( seq.get_name(), "fasta human"); } - +/* BOOST_AUTO_TEST_CASE( serialize_simple ) { std::string seq_string = "AAGGCCTT"; @@ -920,8 +1000,7 @@ BOOST_AUTO_TEST_CASE( serialize_tree ) seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); - annot a1(6,7,"t","t"); - seq.add_annotation(a1); + seq.add_annotation("t", "t", 6, 7); std::ostringstream oss; // allocate/deallocate serialization components @@ -950,8 +1029,7 @@ BOOST_AUTO_TEST_CASE( serialize_xml_sequence ) seq.set_species("ribbet"); seq.add_motif("AA"); seq.add_motif("GC"); - annot a1(6,7,"t","t"); - seq.add_annotation(a1); + seq.add_annotation("t", "t", 6, 7); std::ostringstream oss; // allocate/deallocate serialization components @@ -999,3 +1077,4 @@ BOOST_AUTO_TEST_CASE( serialize_xml_two ) // test if our pointers are the same BOOST_CHECK_EQUAL(seq1_loaded.data(), seq2_loaded.data()); } +*/