BOOST_AUTO_TEST_CASE( annotation_load )
{
string annot_data = "human\n"
- "0 10 name type\n"
- "10 20 myf7\n"
- "20 30 myod\n"
- "50\t55 anothername\n"
- "60 50 backward\n"
- ">ident3 asdf\n"
+ "0 10 name type\n" //0
+ "10 20 myf7\n" //1
+ "20 30 myod\n" //2
+ "50\t55 anothername\n" //3
+ "60 50 backward\n" //4
+ ">ident3 asdf\n" //7 (as these are added last)
"GCT\n"
"gCTn\n"
- "75\t90\tname2\ttype2\n"
- "100 120 name-asdf type!@#$%\n"
+ "75\t90\tname2\ttype2\n" //5
+ "100 120 name-asdf type!@#$%\n" //6
;
string s(100, 'A');
s += "GCTGCTAATT";
//istringstream annot_stream(annot_data);
seq.parse_annot(annot_data, 0, 0);
- std::list<annot> annots_list = seq.annotations();
- std::vector<annot> annots(annots_list.begin(), annots_list.end());
+ SeqSpanRefList annots_list(seq.annotations());
+ std::vector<SeqSpanRef> annots(annots_list.begin(), annots_list.end());
BOOST_REQUIRE_EQUAL( annots.size(), 8);
- BOOST_CHECK_EQUAL( annots[0].begin, 0 );
- BOOST_CHECK_EQUAL( annots[0].end, 10 );
- BOOST_CHECK_EQUAL( annots[0].type, "type");
- BOOST_CHECK_EQUAL( annots[0].name, "name");
- BOOST_CHECK_EQUAL( annots[1].name, "myf7");
- BOOST_CHECK_EQUAL( annots[2].name, "myod");
- BOOST_CHECK_EQUAL( annots[3].name, "anothername");
- BOOST_CHECK_EQUAL( annots[4].name, "backward");
- BOOST_CHECK_EQUAL( annots[5].name, "name2");
- BOOST_CHECK_EQUAL( annots[5].end, 90);
- BOOST_CHECK_EQUAL( annots[6].begin, 100);
- BOOST_CHECK_EQUAL( annots[6].end, 120);
- BOOST_CHECK_EQUAL( annots[6].name, "name-asdf");
- BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%");
+ BOOST_CHECK_EQUAL( annots[0]->start(), 0 );
+ BOOST_CHECK_EQUAL( annots[0]->stop(), 10 );
+ BOOST_REQUIRE( annots[0]->annotations() );
+ BOOST_CHECK_EQUAL( annots[0]->annotations()->get("type"), "type");
+ BOOST_CHECK_EQUAL( annots[0]->annotations()->name(), "name");
+ BOOST_REQUIRE( annots[1]->annotations() );
+ BOOST_CHECK_EQUAL( annots[1]->annotations()->name(), "myf7");
+ BOOST_REQUIRE( annots[2]->annotations() );
+ BOOST_CHECK_EQUAL( annots[2]->annotations()->name(), "myod");
+ BOOST_REQUIRE( annots[3]->annotations() );
+ BOOST_CHECK_EQUAL( annots[3]->annotations()->name(), "anothername");
+ BOOST_REQUIRE( annots[4]->annotations() );
+ BOOST_CHECK_EQUAL( annots[4]->annotations()->name(), "backward");
+ BOOST_REQUIRE( annots[5]->annotations() );
+ BOOST_CHECK_EQUAL( annots[5]->annotations()->name(), "name2");
+ BOOST_CHECK_EQUAL( annots[5]->start(), 75);
+ BOOST_CHECK_EQUAL( annots[5]->stop(), 90);
+ BOOST_CHECK_EQUAL( annots[6]->start(), 100);
+ BOOST_CHECK_EQUAL( annots[6]->stop(), 110);
+ BOOST_REQUIRE( annots[6]->annotations() );
+ BOOST_CHECK_EQUAL( annots[6]->annotations()->name(), "name-asdf");
+ BOOST_CHECK_EQUAL( annots[6]->annotations()->get("type"), "type!@#$%");
// sequence defined annotations will always be after the
// absolute positions
- BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf");
- BOOST_CHECK_EQUAL( annots[7].begin, 100);
+ BOOST_REQUIRE( annots[7]->annotations() );
+ BOOST_CHECK_EQUAL( annots[7]->annotations()->name(), "ident3 asdf");
+ BOOST_CHECK_EQUAL( annots[7]->start(), 100);
+ BOOST_CHECK_EQUAL( annots[7]->stop(), 107);
//BOOST_CHECK_EQUAL( annots
}
"GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
Sequence seq(s, reduced_dna_alphabet);
seq.parse_annot(annot_data);
- std::list<annot> annots = seq.annotations();
+ SeqSpanRefList annots(seq.annotations());
BOOST_CHECK_EQUAL( annots.size(), 2);
}
//istringstream annot_stream(annot_data);
seq.parse_annot(annot_data, 0, 0);
- std::list<annot> annots_list = seq.annotations();
- std::vector<annot> annots(annots_list.begin(), annots_list.end());
+ SeqSpanRefList annots_list(seq.annotations());
+ std::vector<SeqSpanRef> annots(annots_list.begin(), annots_list.end());
BOOST_REQUIRE_EQUAL( annots.size(), 8);
- BOOST_CHECK_EQUAL( annots[0].begin, 0 );
- BOOST_CHECK_EQUAL( annots[0].end, 10 );
- BOOST_CHECK_EQUAL( annots[0].type, "type");
+ BOOST_CHECK_EQUAL( annots[0]->start(), 0 );
+ BOOST_CHECK_EQUAL( annots[0]->stop(), 10 );
+ BOOST_CHECK_EQUAL( annots[0]->annotations()->get("type"), "type");
+}
+
+// when we do a subsequence (or something that calls copy_children)
+// the annotations need to be updated to have the right parent
+BOOST_AUTO_TEST_CASE( update_annotations_seqref )
+{
+ Sequence s1("AAAAGGGG");
+ s1.add_annotation("A", "A", 0, 4);
+ BOOST_CHECK_EQUAL(s1.annotations().size(), 1);
+ BOOST_CHECK_EQUAL(s1.seqspan(), s1.annotations().front()->parent() );
+
+ Sequence subseq1(s1.subseq(2,4));
+ BOOST_CHECK_EQUAL(subseq1.annotations().size(), 1);
+ BOOST_CHECK_EQUAL(subseq1.annotations().front()->parentStart(), 0 );
+ BOOST_CHECK_EQUAL(subseq1.annotations().front()->parentStop(), 2 );
+ BOOST_CHECK_EQUAL(subseq1.seqspan(), subseq1.annotations().front()->parent() );
}
// ticket:83 when you try to load a sequence from a file that doesn't
BOOST_AUTO_TEST_CASE( annot_test )
{
- annot a(0, 10, "test", "thing");
-
- BOOST_CHECK_EQUAL( a.begin, 0 );
- BOOST_CHECK_EQUAL( a.end, 10 );
- BOOST_CHECK_EQUAL( a.type, "test" );
- BOOST_CHECK_EQUAL( a.name, "thing" );
+ Sequence s("AAAAAAAAAA");
+ s.add_annotation("test", "thing", 0, 10);
+ SeqSpanRef a(s.annotations().front());
+
+ BOOST_CHECK_EQUAL( a->start(), 0 );
+ BOOST_CHECK_EQUAL( a->stop(), 10 );
+ BOOST_CHECK_EQUAL( a->annotations()->get("name"), "test" );
+ BOOST_CHECK_EQUAL( a->annotations()->get("type"), "thing" );
motif m(10, "AAGGCC");
BOOST_CHECK_EQUAL( m.begin, 10 );
}
}
BOOST_CHECK_EQUAL(seq.annotations().size(), count);
- const std::list<annot> &a = seq.annotations();
- for (std::list<annot>::const_iterator annot_i = a.begin();
+ const SeqSpanRefList& a = seq.annotations();
+ for (SeqSpanRefList::const_iterator annot_i = a.begin();
annot_i != a.end();
++annot_i)
{
- int count = annot_i->end - annot_i->begin ;
+ //FIXME: was I doing something here?
+ int count = (*annot_i)->stop() - (*annot_i)->start();
}
}
+BOOST_AUTO_TEST_CASE( sequence_no_trailing_newline )
+{
+ // sorry about the long string...
+ string s = "AATTACACAAGGAATATAGGTAGTTTGAATAAAAATATCTTTAACAGCTTGGAGCTATTGAGACAGGAACACTTCCACGCACATGCACAGTTAAACAACTTGAGTGCAACACACAACATTGGCACTAAACGAGATTGAAGGGGGACTTTTTGTGTGTTTTTTTTTCTCTTTTCTTTTTTTGTTATAGTTACTTCAAGTAACACAGCTTGCTTCATATAAATAAGTTAAAACATCTATTTTTTTTCAAGACAAAGCCATTCAGGACAAAGAGATGAACAGAAAGCAGATCTACTTATACAGGCGCTATAATGGCAATAAACAGGCTCATGATTAAAAGATGAATTAGGGCAACGAGAACAGGGCTTCTTCACAGAAGGAACACAAGGGAGTTTCAGAAAGTCACCTTAGTACTGACACTACGCGGGATCCGCTAATACTGCTCAGTACTTTAAACGCTCAGATACTCAGGGACGGAAGGCCCCTCCTGCCGCGGCCATGCTCATGCTTTTCAGCTTATTATCTTTTTTCCACTTCATTCTCCGGTTTTGGAACCAGATTTTAATTTGTCTCTCGGAGAGGCAAAGAGCATGTGCTATTTCAATCCTCCTTCTGCGGGTCAGGTAACGGTTGAAGTGGAACTCCTTCTCCAGCTCCAGGGTCTGGTAGCGCGTGTAGGCCGTCCGGGCCCTTTTGCCTTCCGGGCCGCCTATGTTGTCTGCAATAGAAAAGTCAGCGGTTTAGCCACCAACTCCTGTCTTCCAAAGTCCGCCAGGGGGACAAGCTTGGGTCATGAGCAGGGAACCCAGGCGAAAAGCTCAACAAGTTCTGCCTACCAGCCCGCACACCCCTCCCGAATTTCCTTCTCTCTTCCTTTCTAGAAAGAAAACAATACGATTTGGACCCTGGGAACAATCTGCCCATCTGAGGCTGGGGCCGTGTCCCGGCGGACTCCGGCTTTCCCTGGCCCCTCTCCTGCCCCCTCCGCCCTGCCCCGGGCGCCCCGATCGGGAGGCACAGCCCTCCCAGGCTGCCCACCGCACAGAAACCCAGGAAGCAAGGCCCTTTCCTGAGCGCCCAAGTGGCCTTCGGGTCACCCTCCCTCAAAGTTCCAGCCCCGAGAGCCGCCTCCCGTTTCCAGCCTGCAGGGTTGGGGAGCCTGTTTTCTTTTTCTTCCCTTTCCTTCTCTCTCCCTCCTGCCCCCAAAATTCAGAATCCTGCAGGCTCTCGCCTCGATTCTTTCCCCCAAGCCCCTTTTCGGGGGCTGTAATTAGTAACGCTGTTTCCCCAGCGTAGCCCTCCTCATAAATTATCCGCCGTGACAAGCCCGATTCACGGCTGCTACAGCCATCCTCTACCTCTCTGCGCCTTGCTCGGCTGGCCTGACCCGGGAGCGCGTCCCAAGGCGTGGGGTTCCAGAGGGGTTTTTTGCTTCCTCCCCCTTCCAACGTCTAAACTGTCCCAGAGAACGCCCATTTCCCCCACTATTTGTGAGCGCAGGGTGCTCGCAAAGAAGAGGAGGAAGGAGGAAGGCAGGGGAGGGAGAACGGCAAGGAGAGCTCCGCAGGGCTGGGAGAAATGAGACCAAGAGAGACTGGGAGAGGGCGGCAGAGAAGAGAGGGGGGACCGAGAGCCGCGTCCCCGCGGTCGCGTGGATTTAGAAAAAGGCTGGCTTTACCATGACTTATGTGCAGCTTGCGCATCCAGGGGTAGATCTGGGGTTGGGCGGGCGGCGCCGGGCTCGGCTCGCTCTGCGCACTCGCCTGCTCGCTGCTGGCAGGGGCGTCCTCCTCGGCTCCGGACGCCGTGCCAACCCCCTCTCTGCTGCTGATGTGGGTGCTGCCGGCGTCGGCCGAGGCGCCGCTGGAGTTGCTTAGGGAGTTTTTCCCGCCGTGGTGGCTGTCGCTGCCGGGCGAGGGGGCCACGGCGGAGCAGGGCAGCGGATCGGGCTGAGGAGAGTGCGTGGACGTGGCCGGCTGGCTGTACCTGGGCTCGGCGGGCGCCGCGCTGGCGCTGGCAGCGTAGCTGCGGGCGCGCTCTCCGGAGCCAAAGTGGCCGGAGCCCGAGCGGCCGACGCTGAGATCCATGCCATTGTAGCCGTAGCCGTACCTGCCGGAGTGCATGCTCGCCGAGTCCCTGAATTGCTCGCTCACGGAACTATGATCTCCATAATTATGCAACTGGTAGTCCGGGCCATTTGGATAGCGACCGCAAAATGAGTTTACAAAATAAGAGCTCATTTGTTTTTTGATATGTGTGCTTGATTTGTGGCTCGCGGTCGTTTGTGCGTCTATAGCACCCTT";
+ std::string species = "HumanHXA5\n";
+ std::string header0 = ">hg18_knownGene_NM_019102_0\n";
+ std::string str0 = "GGGTGCTATAGACGCACAAACGACCGCGAGCCACAAATCAAGCACACATATCAAAAAACAAATGAGCTCTTATTTTGTAAACTCATTTTGCGGTCGCTATCCAAATGGCCCGGACTACCAGTTGCATAATTATGGAGATCATAGTTCCGTGAGCGAGCAATTCAGGGACTCGGCGAGCATGCACTCCGGCAGGTACGGCTACGGCTACAATGGCATGGATCTCAGCGTCGGCCGCTCGGGCTCCGGCCACTTTGGCTCCGGAGAGCGCGCCCGCAGCTACGCTGCCAGCGCCAGCGCGGCGCCCGCCGAGCCCAGGTACAGCCAGCCGGCCACGTCCACGCACTCTCCTCAGCCCGATCCGCTGCCCTGCTCCGCCGTGGCCCCCTCGCCCGGCAGCGACAGCCACCACGGCGGGAAAAACTCCCTAAGCAACTCCAGCGGCGCCTCGGCCGACGCCGGCAGCACCCACATCAGCAGCAGAGAGGGGGTTGGCACGGCGTCCGGAGCCGAGGAGGACGCCCCTGCCAGCAGCGAGCAGGCGAGTGCGCAGAGCGAGCCGAGCCCGGCGCCGCCCGCCCAACCCCAGATCTACCCCTGGATGCGCAAGCTGCACATAAGTCATG";
+ std::string header1 = ">hg18_knownGene_NM_019102_1\n";
+ std::string str1 = "ACAACATAGGCGGCCCGGAAGGCAAAAGGGCCCGGACGGCCTACACGCGCTACCAGACCCTGGAGCTGGAGAAGGAGTTCCACTTCAACCGTTACCTGACCCGCAGAAGGAGGATTGAAATAGCACATGCTCTTTGCCTCTCCGAGAGACAAATTAAAATCTGGTTCCAAAACCGGAGAATGAAGTGGAAAAAAGATAATAAGCTGAAAAGCATGAGCATGGCCGCGGCAGGAGGGGCCTTCCGTCCCTGAGTATCTGAGCGTTTAAAGTACTGAGCAGTATTAGCGGATCCCGCGTAGTGTCAGTACTAAGGTGACTTTCTGAAACTCCCTTGTGTTCCTTCTGTGAAGAAGCCCTGTTCTCGTTGCCCTAATTCATCTTTTAATCATGAGCCTGTTTATTGCCATTATAGCGCCTGTATAAGTAGATCTGCTTTCTGTTCATCTCTTTGTCCTGAATGGCTTTGTCTTGAAAAAAAATAGATGTTTTAACTTATTTATATGAAGCAAGCTGTGTTACTTGAAGTAACTATAACAAAAAAAGAAAAGAGAAAAAAAAACACACAAAAAGTCCCCCTTCAATCTCGTTTAGTGCCAATGTTGTGTGTTGCACTCAAGTTGTTTAACTGTGCATGTGCGTGGAAGTGTTCCTGTCTCAATAGCTCCAAGCTGTTAAAGATATTTTTATTCAAACTACCTATATTCCTTGT";
+ stringstream annot;
+ annot << species
+ << header0
+ << str0 << std::endl
+ << std::endl
+ << header1
+ << str1;
+ // need to convert strings to sequences for reverse complementing
+ Sequence seq0(str0, reduced_dna_alphabet);
+ Sequence seq1(str1, reduced_dna_alphabet);
+
+ Sequence annotated_seq(s, reduced_dna_alphabet);
+ annotated_seq.load_annot(annot, 0, 0);
+
+ SeqSpanRefList annots_list = annotated_seq.annotations();
+ // both sequences were found
+ BOOST_REQUIRE_EQUAL( annots_list.size(), 2 );
+
+ std::vector<SeqSpanRef> annots(annots_list.begin(), annots_list.end());
+ // are they the same sequence?
+ BOOST_CHECK_EQUAL( annots[0]->size(), seq0.size());
+ BOOST_CHECK_EQUAL( annots[0]->sequence(), seq0.rev_comp() );
+ // this should hopefully catch the case when my hack in
+ // sequence.cpp::push_back_seq::operator() is no longer needed.
+ // spirit (or my grammar was duplicating the last char,
+ // the hack removes the duplicate. but if what ever's causing
+ // the dup gets fixed actual meaningful data will be being removed.
+ // see mussa ticket:265 for more information
+ BOOST_CHECK_EQUAL( annots[1]->size(), seq1.size());
+ BOOST_CHECK_EQUAL( annots[1]->sequence(), seq1.rev_comp() );
+
+}
+
BOOST_AUTO_TEST_CASE( subseq_annotation_test )
{
string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
"AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
Sequence seq(s, reduced_dna_alphabet);
-
- seq.add_annotation(annot(0, 10, "0-10", "0-10"));
- seq.add_annotation(annot(10, 20, "10-20", "10-20"));
- seq.add_annotation(annot(0, 20, "0-20", "0-20"));
- seq.add_annotation(annot(8, 12, "8-12", "8-12"));
- seq.add_annotation(annot(100, 5000, "100-5000", "100-5000"));
+ seq.add_annotation("0-10", "0-10", 0, 10);
+ seq.add_annotation("10-20", "10-20", 10, 20);
+ seq.add_annotation("0-20", "0-20", 0, 20);
+ seq.add_annotation("8-12", "8-12", 8, 12);
+ seq.add_annotation("100-5000", "100-5000", 100, 5000);
Sequence subseq = seq.subseq(5, 10);
- const list<annot> annots = subseq.annotations();
- // generate some ground truth
- list<annot> correct;
- correct.push_back(annot(0, 5, "0-10", "0-10"));
- correct.push_back(annot(5,10, "10-20", "10-20"));
- correct.push_back(annot(0,10, "0-20", "0-20"));
- correct.push_back(annot(3, 7, "8-12", "8-12"));
- BOOST_REQUIRE_EQUAL( annots.size(), correct.size() );
-
- list<annot>::iterator correct_i = correct.begin();
- list<annot>::const_iterator annot_i = annots.begin();
- for(; annot_i != annots.end(); ++annot_i, ++correct_i)
- {
- BOOST_CHECK( *annot_i == *correct_i );
- }
+ SeqSpanRefList annots_list = subseq.annotations();
+ BOOST_REQUIRE_EQUAL( annots_list.size(), 4 );
+
+ std::vector<SeqSpanRef> annots(annots_list.begin(), annots_list.end());
+ BOOST_CHECK_EQUAL( annots[0]->parentStart(), 0);
+ BOOST_CHECK_EQUAL( annots[0]->size(), 5);
+ BOOST_REQUIRE( annots[0]->annotations() );
+ BOOST_CHECK_EQUAL( annots[0]->annotations()->name(), "0-10");
+
+ BOOST_CHECK_EQUAL( annots[1]->parentStart(), 5);
+ BOOST_CHECK_EQUAL( annots[1]->size(), 5);
+ BOOST_REQUIRE( annots[1]->annotations() );
+ BOOST_CHECK_EQUAL( annots[1]->annotations()->name(), "10-20");
+
+ BOOST_CHECK_EQUAL( annots[2]->parentStart(), 0);
+ BOOST_CHECK_EQUAL( annots[2]->size(), 10);
+ BOOST_REQUIRE( annots[2]->annotations() );
+ BOOST_CHECK_EQUAL( annots[2]->annotations()->name(), "0-20");
+
+ BOOST_CHECK_EQUAL( annots[3]->parentStart(), 3);
+ BOOST_CHECK_EQUAL( annots[3]->size(), 7);
+ BOOST_REQUIRE( annots[3]->annotations() );
+ BOOST_CHECK_EQUAL( annots[3]->annotations()->name(), "8-12");
}
BOOST_AUTO_TEST_CASE( motif_annotation_update )
// starting conditions
BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
- seq.add_annotation(annot(0, 10, "0-10", "0-10"));
- seq.add_annotation(annot(10, 20, "10-20", "10-20"));
- seq.add_annotation(annot(0, 20, "0-20", "0-20"));
+ seq.add_annotation("0-10", "0-10", 0, 10);
+ seq.add_annotation("10-20", "10-20", 10, 20);
+ seq.add_annotation("0-20", "0-20", 0, 20);
BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
seq.add_motif("CCGTCCC");
seq.set_species("ribbet");
seq.add_motif("AA");
seq.add_motif("GC");
- annot a1(6,7,"t","t");
- seq.add_annotation(a1);
+ seq.add_annotation("t", "t", 6, 7);
std::ostringstream oss;
// allocate/deallocate serialization components
seq.set_species("ribbet");
seq.add_motif("AA");
seq.add_motif("GC");
- annot a1(6,7,"t","t");
- seq.add_annotation(a1);
+ seq.add_annotation("t", "t", 6, 7);
std::ostringstream oss;
// allocate/deallocate serialization components