1 #include <boost/test/auto_unit_test.hpp>
2 #include <boost/filesystem/path.hpp>
3 #include <boost/filesystem/operations.hpp>
4 namespace fs=boost::filesystem;
10 #include <boost/archive/text_oarchive.hpp>
11 #include <boost/archive/text_iarchive.hpp>
12 #include <boost/archive/xml_oarchive.hpp>
13 #include <boost/archive/xml_iarchive.hpp>
15 #include "alg/sequence.hpp"
16 #include "mussa_exceptions.hpp"
20 BOOST_AUTO_TEST_CASE( sequence_get_sequence )
23 // make sure that retrieving the sequence doesn't throw an error
24 BOOST_CHECK_EQUAL(s.get_sequence(), std::string() );
27 //! when we try to load a missing file, do we get an error?
28 BOOST_AUTO_TEST_CASE( sequence_load_exception )
31 // there should be errors when we try to load something that doesn't exist
32 BOOST_CHECK_THROW( s.load_fasta("alkejralk", 1, 0, 0), mussa_load_error);
33 BOOST_CHECK_THROW( s.load_annot("alkejralk", 0, 0), mussa_load_error);
36 BOOST_AUTO_TEST_CASE( sequence_eol_conventions )
38 string header(">Header");
39 string line1("AAAAGGGGCCCCTTTTT");
40 string line2("AAAAGGGGCCCCTTTTT");
41 int seq_len = line1.size() + line2.size();
44 cr << header << "\015" << line1 << "\015" << line2 << "\015";
46 seq_cr.load_fasta(cr);
49 crlf << header << "\015\012" << line1 << "\015\012" << line2 << "\015\012";
51 seq_crlf.load_fasta(crlf);
54 lf << header << "\012" << line1 << "\012" << line2 << "\012";
56 seq_lf.load_fasta(lf);
58 BOOST_CHECK_EQUAL(seq_cr.size(), seq_len);
59 BOOST_CHECK_EQUAL(seq_crlf.size(), seq_len);
60 BOOST_CHECK_EQUAL(seq_cr.size(), seq_len);
64 //! Do simple operations work correctly?
65 BOOST_AUTO_TEST_CASE( sequence_filter )
67 const char *core_seq = "AATTGGCC";
68 Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
69 BOOST_CHECK_EQUAL(s1, core_seq);
71 Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
72 BOOST_CHECK_EQUAL(s2, "AATTGGCC");
73 BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
74 BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
75 BOOST_CHECK_EQUAL(s2.size(), s2.size());
76 BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
78 Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
79 BOOST_CHECK_EQUAL(s3, "ANNNG");
80 BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
82 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2);
83 BOOST_CHECK_EQUAL(s3, "AA");
84 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
85 BOOST_CHECK_EQUAL( s3, "GG");
86 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
87 BOOST_CHECK_EQUAL( s3, "CCTT");
90 BOOST_CHECK_EQUAL(s3, "AAGGNN");
93 BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
95 std::string agct("AGCT");
96 Sequence seq(agct, Sequence::nucleic_alphabet);
97 BOOST_CHECK_EQUAL(seq.size(), agct.size());
98 BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
100 std::string bdv("BDv");
101 Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
102 BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
103 BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
107 BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
109 std::string agct("AGCT");
111 BOOST_CHECK_EQUAL(seq.size(), agct.size());
112 BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
113 BOOST_CHECK_EQUAL(seq[0], agct[0]);
114 BOOST_CHECK_EQUAL(seq[1], agct[1]);
115 BOOST_CHECK_EQUAL(seq[2], agct[2]);
116 BOOST_CHECK_EQUAL(seq[3], agct[3]);
118 std::string bdv("BDv");
119 Sequence seq_bdv(bdv);
120 BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
121 // default alphabet only allows AGCTUN
122 BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");
125 BOOST_AUTO_TEST_CASE( subseq_names )
127 Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
128 s1.set_species("species");
129 s1.set_fasta_header("a fasta header");
130 Sequence s2 = s1.subseq(2,2);
131 BOOST_CHECK_EQUAL(s2, "GG");
132 BOOST_CHECK_EQUAL(s2.get_species(), s1.get_species());
133 BOOST_CHECK_EQUAL(s2.get_fasta_header(), s1.get_fasta_header());
136 BOOST_AUTO_TEST_CASE( sequence_start_stop )
139 BOOST_CHECK_EQUAL( s1.start(), 0 );
140 BOOST_CHECK_EQUAL( s1.stop(), 0 );
142 std::string seq_string("AAGGCCTT");
143 Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
144 BOOST_CHECK_EQUAL( s2.start(), 0 );
145 BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
147 std::string s3seq_string = seq_string.substr(2,3);
148 Sequence s3 = s2.subseq(2,3);
149 BOOST_CHECK_EQUAL( s3.start(), 2);
150 BOOST_CHECK_EQUAL( s3.stop(), 2+3);
151 BOOST_CHECK_EQUAL( s3.size(), 3);
152 BOOST_CHECK_EQUAL( s3, s3seq_string);
154 std::string s4seq_string = s3seq_string.substr(1,1);
155 Sequence s4 = s3.subseq(1,1);
156 BOOST_CHECK_EQUAL( s4.start(), 1 );
157 BOOST_CHECK_EQUAL( s4.stop(), 1+1);
158 BOOST_CHECK_EQUAL( s4.size(), 1);
159 BOOST_CHECK_EQUAL( s4, s4seq_string);
162 //! Can we load data from a file
163 BOOST_AUTO_TEST_CASE( sequence_load )
165 fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
166 seq_path /= "human_mck_pro.fa";
168 s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
169 BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
170 BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
171 BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
172 "muscle creatine kinase gene (CKMM), "
176 BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
178 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
179 std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
180 std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
181 std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
182 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
183 std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
184 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
185 std::stringstream garbage_fasta(garbage_string);
188 s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
189 BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta,
190 Sequence::reduced_dna_alphabet),
191 sequence_invalid_load_error);
192 BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta,
193 Sequence::reduced_dna_alphabet),
194 sequence_invalid_load_error);
195 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
196 Sequence::reduced_dna_alphabet),
197 sequence_invalid_load_error);
201 BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
203 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
204 std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
205 std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
206 std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
207 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
208 std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
209 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
210 std::stringstream garbage_fasta(garbage_string);
213 s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
214 BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta,
215 Sequence::reduced_rna_alphabet),
216 sequence_invalid_load_error);
217 BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta,
218 Sequence::reduced_rna_alphabet),
219 sequence_invalid_load_error);
220 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
221 Sequence::reduced_rna_alphabet),
222 sequence_invalid_load_error);
225 BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
227 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
228 std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
229 std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
230 std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
231 std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
232 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
233 std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
234 std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
235 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
236 std::stringstream garbage_fasta(garbage_string);
240 // there's two copies of reduced_rna_fasta because i didn't feel like
241 // figuring out how to properly reset the read pointer in a stringstream
242 s.load_fasta(reduced_rna_fasta1);
243 specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
244 BOOST_CHECK_EQUAL(s, specific);
246 s.load_fasta(reduced_dna_fasta1);
247 specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
248 BOOST_CHECK_EQUAL(s, specific);
250 BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta),
251 sequence_invalid_load_error);
252 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta),
253 sequence_invalid_load_error);
256 BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
258 std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
259 Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
260 Sequence seqr = seq.rev_comp();
262 BOOST_CHECK( seq != seqr );
263 BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
264 BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
267 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
269 std::string dna_str("AGCTN");
270 Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
271 BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");
272 BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
275 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
277 std::string rna_str("AGCUN");
278 Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
279 BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");
280 BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
283 BOOST_AUTO_TEST_CASE( annotation_load )
285 string annot_data = "human\n"
289 "50\t55 anothername\n"
294 "75\t90\tname2\ttype2\n"
295 "100 120 name-asdf type!@#$%\n"
299 Sequence seq(s, Sequence::reduced_dna_alphabet);
301 //istringstream annot_stream(annot_data);
302 seq.parse_annot(annot_data, 0, 0);
303 std::list<annot> annots_list = seq.annotations();
304 std::vector<annot> annots(annots_list.begin(), annots_list.end());
305 BOOST_REQUIRE_EQUAL( annots.size(), 8);
306 BOOST_CHECK_EQUAL( annots[0].begin, 0 );
307 BOOST_CHECK_EQUAL( annots[0].end, 10 );
308 BOOST_CHECK_EQUAL( annots[0].type, "type");
309 BOOST_CHECK_EQUAL( annots[0].name, "name");
310 BOOST_CHECK_EQUAL( annots[1].name, "myf7");
311 BOOST_CHECK_EQUAL( annots[2].name, "myod");
312 BOOST_CHECK_EQUAL( annots[3].name, "anothername");
313 BOOST_CHECK_EQUAL( annots[4].name, "backward");
314 BOOST_CHECK_EQUAL( annots[5].name, "name2");
315 BOOST_CHECK_EQUAL( annots[5].end, 90);
316 BOOST_CHECK_EQUAL( annots[6].begin, 100);
317 BOOST_CHECK_EQUAL( annots[6].end, 120);
318 BOOST_CHECK_EQUAL( annots[6].name, "name-asdf");
319 BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%");
320 // sequence defined annotations will always be after the
321 // absolute positions
322 BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf");
323 BOOST_CHECK_EQUAL( annots[7].begin, 100);
325 //BOOST_CHECK_EQUAL( annots
328 BOOST_AUTO_TEST_CASE( annotation_broken_load )
330 string annot_data = "human\n"
332 "blah60 50 backward\n"
339 Sequence seq(s, Sequence::reduced_dna_alphabet);
341 BOOST_CHECK_THROW(seq.parse_annot(annot_data, 0, 0), annotation_load_error);
342 BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
345 BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
347 // this actually is basically what's returned by UCSC
348 // (well actually with some of the sequence and copies of fasta blocks
349 // removed to make the example shorter
350 string annot_data = "\n"
352 ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
353 "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
354 ">hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
355 "GGTGGAGACGACCTGGACCCTAACTACGT\n"
363 "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
365 "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
366 Sequence seq(s, Sequence::reduced_dna_alphabet);
367 seq.parse_annot(annot_data);
368 std::list<annot> annots = seq.annotations();
369 BOOST_CHECK_EQUAL( annots.size(), 2);
372 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
374 string annot_data = "0 10 name type\n"
377 "50\t55 anothername\n"
382 "75\t90\tname2\ttype2\n"
383 "100 120 name-asdf type!@#$%\n"
387 Sequence seq(s, Sequence::reduced_dna_alphabet);
389 //istringstream annot_stream(annot_data);
390 seq.parse_annot(annot_data, 0, 0);
391 std::list<annot> annots_list = seq.annotations();
392 std::vector<annot> annots(annots_list.begin(), annots_list.end());
393 BOOST_REQUIRE_EQUAL( annots.size(), 8);
394 BOOST_CHECK_EQUAL( annots[0].begin, 0 );
395 BOOST_CHECK_EQUAL( annots[0].end, 10 );
396 BOOST_CHECK_EQUAL( annots[0].type, "type");
399 // ticket:83 when you try to load a sequence from a file that doesn't
400 // have fasta headers it crashes.
401 BOOST_AUTO_TEST_CASE( sequence_past_end )
403 fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq" );
404 seq_path /= "misformated_seq.fa";
406 BOOST_CHECK_THROW( s.load_fasta(seq_path), mussa_load_error );
409 BOOST_AUTO_TEST_CASE ( sequence_empty )
413 BOOST_CHECK_EQUAL( s.empty(), true );
415 BOOST_CHECK_EQUAL( s.empty(), false );
417 BOOST_CHECK_EQUAL( s.empty(), true);
419 BOOST_CHECK_EQUAL( s.empty(), true);
422 BOOST_AUTO_TEST_CASE ( sequence_size )
426 BOOST_CHECK_EQUAL( s.size(), 0);
427 std::string seq_string("AAAGGG");
429 BOOST_CHECK_EQUAL( s.size(), seq_string.size() );
431 BOOST_CHECK_EQUAL( s.size(), 0);
433 BOOST_CHECK_EQUAL( s.size(), 0);
436 BOOST_AUTO_TEST_CASE( sequence_empty_equality )
438 Sequence szero("", Sequence::reduced_dna_alphabet);
439 BOOST_CHECK_EQUAL(szero.empty(), true);
440 BOOST_CHECK_EQUAL(szero, szero);
441 BOOST_CHECK_EQUAL(szero, "");
443 Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
445 BOOST_CHECK_EQUAL(sclear.empty(), true);
446 BOOST_CHECK_EQUAL(sclear, sclear);
447 BOOST_CHECK_EQUAL(sclear, szero);
448 BOOST_CHECK_EQUAL(sclear, "");
451 BOOST_AUTO_TEST_CASE ( sequence_iterators )
453 std::string seq_string = "AAGGCCTTNNTATA";
454 Sequence s(seq_string, Sequence::reduced_dna_alphabet);
455 const Sequence cs(s);
456 std::string::size_type count = 0;
458 std::string::iterator str_itor;
459 Sequence::const_iterator s_itor;
460 Sequence::const_iterator cs_itor;
462 for( str_itor = seq_string.begin(),
464 cs_itor = cs.begin();
465 str_itor != seq_string.end() and
466 s_itor != s.end() and
468 ++str_itor, ++s_itor, ++cs_itor, ++count)
470 BOOST_CHECK_EQUAL ( *str_itor, *s_itor );
471 BOOST_CHECK_EQUAL ( *s_itor, *cs_itor );
472 BOOST_CHECK_EQUAL ( *cs_itor, *str_itor );
474 BOOST_CHECK_EQUAL( seq_string.size(), count );
475 BOOST_CHECK_EQUAL( s.size(), count );
476 BOOST_CHECK_EQUAL( cs.size(), count );
479 BOOST_AUTO_TEST_CASE( sequence_motifs )
482 string bogus("AATTGGAA");
483 Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
485 list<motif>::const_iterator motif_i = s1.motifs().begin();
486 list<motif>::const_iterator motif_end = s1.motifs().end();
488 // do our iterators work?
489 BOOST_CHECK( motif_i == s1.motifs().begin() );
490 BOOST_CHECK( motif_end == s1.motifs().end() );
491 BOOST_CHECK( motif_i == motif_end );
493 // this shouldn't show up
495 BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
496 BOOST_CHECK_EQUAL( s1.motifs().size(), 0 );
499 BOOST_CHECK( s1.motifs().begin() != s1.motifs().end() );
500 BOOST_CHECK_EQUAL( s1.motifs().size(), 2 );
502 for(motif_i = s1.motifs().begin();
503 motif_i != s1.motifs().end();
506 BOOST_CHECK_EQUAL( motif_i->type, "motif" );
507 BOOST_CHECK_EQUAL( motif_i->name, m);
508 BOOST_CHECK_EQUAL( motif_i->sequence, m);
512 BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
514 /* FIXME: enable this when i find a way of passing storing the motif name
515 // does our annotation travel?
516 Sequence motif_seq(m);
517 motif_seq.set_fasta_header("hi");
518 s1.add_motif(motif_seq);
520 BOOST_CHECK_EQUAL(s1.motifs().size(), 2);
521 for(motif_i = s1.motifs().begin();
522 motif_i != s1.motifs().end();
525 BOOST_CHECK_EQUAL( motif_i->type, "motif" );
526 BOOST_CHECK_EQUAL( motif_i->name, "hi");
527 BOOST_CHECK_EQUAL( motif_i->sequence, m);
532 BOOST_AUTO_TEST_CASE( annot_test )
534 annot a(0, 10, "test", "thing");
536 BOOST_CHECK_EQUAL( a.begin, 0 );
537 BOOST_CHECK_EQUAL( a.end, 10 );
538 BOOST_CHECK_EQUAL( a.type, "test" );
539 BOOST_CHECK_EQUAL( a.name, "thing" );
541 motif m(10, "AAGGCC");
542 BOOST_CHECK_EQUAL( m.begin, 10 );
543 BOOST_CHECK_EQUAL( m.type, "motif" );
544 BOOST_CHECK_EQUAL( m.name, "AAGGCC" );
545 BOOST_CHECK_EQUAL( m.end, 10+6 );
548 BOOST_AUTO_TEST_CASE( annotate_from_sequence )
550 string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
551 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
552 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
553 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
554 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
555 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
556 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
557 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
559 string gga("GGACACCTC");
560 Sequence seq(s, Sequence::reduced_dna_alphabet);
562 std::list<Sequence> query_list;
563 std::list<string> string_list;
564 query_list.push_back(Sequence(gc));
565 string_list.push_back(gc);
566 query_list.push_back(Sequence(gga));
567 string_list.push_back(gga);
569 BOOST_CHECK_EQUAL( seq.annotations().size(), 0 );
570 seq.find_sequences(query_list.begin(), query_list.end());
573 for(list<string>::iterator string_i = string_list.begin();
574 string_i != string_list.end();
577 string::size_type pos=0;
578 while(pos != string::npos) {
579 pos = s.find(*string_i, pos);
580 if (pos != string::npos) {
586 BOOST_CHECK_EQUAL(seq.annotations().size(), count);
587 const std::list<annot> &a = seq.annotations();
588 for (std::list<annot>::const_iterator annot_i = a.begin();
592 int count = annot_i->end - annot_i->begin ;
596 BOOST_AUTO_TEST_CASE( subseq_annotation_test )
598 string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
599 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
600 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
601 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
602 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
603 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
604 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
605 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
606 Sequence seq(s, Sequence::reduced_dna_alphabet);
609 seq.add_annotation(annot(0, 10, "0-10", "0-10"));
610 seq.add_annotation(annot(10, 20, "10-20", "10-20"));
611 seq.add_annotation(annot(0, 20, "0-20", "0-20"));
612 seq.add_annotation(annot(8, 12, "8-12", "8-12"));
613 seq.add_annotation(annot(100, 5000, "100-5000", "100-5000"));
615 Sequence subseq = seq.subseq(5, 10);
616 const list<annot> annots = subseq.annotations();
617 // generate some ground truth
619 correct.push_back(annot(0, 5, "0-10", "0-10"));
620 correct.push_back(annot(5,10, "10-20", "10-20"));
621 correct.push_back(annot(0,10, "0-20", "0-20"));
622 correct.push_back(annot(3, 7, "8-12", "8-12"));
623 BOOST_REQUIRE_EQUAL( annots.size(), correct.size() );
625 list<annot>::iterator correct_i = correct.begin();
626 list<annot>::const_iterator annot_i = annots.begin();
627 for(; annot_i != annots.end(); ++annot_i, ++correct_i)
629 BOOST_CHECK( *annot_i == *correct_i );
633 BOOST_AUTO_TEST_CASE( motif_annotation_update )
635 string s("CCGTCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
636 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
637 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
638 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
639 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
640 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
641 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
642 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
643 Sequence seq(s, Sequence::reduced_dna_alphabet);
645 // starting conditions
646 BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
647 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
648 seq.add_annotation(annot(0, 10, "0-10", "0-10"));
649 seq.add_annotation(annot(10, 20, "10-20", "10-20"));
650 seq.add_annotation(annot(0, 20, "0-20", "0-20"));
651 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
652 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
653 seq.add_motif("CCGTCCC");
654 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
655 BOOST_CHECK_EQUAL(seq.motifs().size(), 1);
657 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
658 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
661 BOOST_AUTO_TEST_CASE( out_operator )
663 string s("AAGGCCTT");
664 Sequence seq(s, Sequence::reduced_dna_alphabet);
668 BOOST_CHECK_EQUAL( s, buf.str() );
671 BOOST_AUTO_TEST_CASE( get_name )
673 Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
675 BOOST_CHECK_EQUAL( seq.get_name(), "" );
676 seq.set_species("hooman"); // anyone remember tradewars?
677 BOOST_CHECK_EQUAL( seq.get_name(), "hooman");
678 seq.set_fasta_header("fasta human");
679 BOOST_CHECK_EQUAL( seq.get_name(), "fasta human");
682 BOOST_AUTO_TEST_CASE( serialize_simple )
684 std::string seq_string = "AAGGCCTT";
685 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
686 seq.set_species("ribbet");
687 std::ostringstream oss;
688 // allocate/deallocate serialization components
690 boost::archive::text_oarchive oarchive(oss);
691 const Sequence& const_seq(seq);
692 BOOST_CHECK_EQUAL(seq, const_seq);
693 oarchive << const_seq;
697 std::istringstream iss(oss.str());
698 boost::archive::text_iarchive iarchive(iss);
699 iarchive >> seq_loaded;
701 BOOST_CHECK_EQUAL(seq_loaded, seq);
702 BOOST_CHECK_EQUAL(seq.get_species(), "ribbet");
705 BOOST_AUTO_TEST_CASE( serialize_tree )
707 std::string seq_string = "AAGGCCTT";
708 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
709 seq.set_species("ribbet");
712 annot a1(6,7,"t","t");
713 seq.add_annotation(a1);
715 std::ostringstream oss;
716 // allocate/deallocate serialization components
718 boost::archive::text_oarchive oarchive(oss);
719 const Sequence& const_seq(seq);
720 BOOST_CHECK_EQUAL(seq, const_seq);
721 oarchive << const_seq;
726 std::istringstream iss(oss.str());
727 boost::archive::text_iarchive iarchive(iss);
728 iarchive >> seq_loaded;
730 BOOST_CHECK_EQUAL(seq_loaded, seq);
733 // this writes out an "old" style annotated sequence
734 // with annotations attached as "motifs" and "annots"
735 BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
737 std::string seq_string = "AAGGCCTT";
738 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
739 seq.set_species("ribbet");
742 annot a1(6,7,"t","t");
743 seq.add_annotation(a1);
745 std::ostringstream oss;
746 // allocate/deallocate serialization components
748 boost::archive::xml_oarchive oarchive(oss);
749 const Sequence& const_seq(seq);
750 BOOST_CHECK_EQUAL(seq, const_seq);
751 oarchive << boost::serialization::make_nvp("root", const_seq);
755 std::istringstream iss(oss.str());
756 boost::archive::xml_iarchive iarchive(iss);
757 iarchive >> boost::serialization::make_nvp("root", seq_loaded);
759 BOOST_CHECK_EQUAL(seq_loaded, seq);
762 BOOST_AUTO_TEST_CASE( serialize_xml_two )
764 std::string seq_string = "AAGGCCTT";
765 Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
768 std::ostringstream oss;
769 // allocate/deallocate serialization components
771 boost::archive::xml_oarchive oarchive(oss);
772 const Sequence& const_seq1(seq1);
773 const Sequence& const_seq2(seq2);
774 oarchive << boost::serialization::make_nvp("seq1", const_seq1);
775 oarchive << boost::serialization::make_nvp("seq2", const_seq2);
777 //std::cout << "xml: " << oss.str() << std::endl;
778 Sequence seq1_loaded;
779 Sequence seq2_loaded;
781 std::istringstream iss(oss.str());
782 boost::archive::xml_iarchive iarchive(iss);
783 iarchive >> boost::serialization::make_nvp("seq1", seq1_loaded);
784 iarchive >> boost::serialization::make_nvp("seq2", seq2_loaded);
786 BOOST_CHECK_EQUAL(seq1_loaded, seq1);
787 BOOST_CHECK_EQUAL(seq2_loaded, seq2);
788 // test if our pointers are the same
789 BOOST_CHECK_EQUAL(seq1_loaded.c_str(), seq2_loaded.c_str());