1 #include <boost/test/auto_unit_test.hpp>
2 #include <boost/filesystem/path.hpp>
3 #include <boost/filesystem/operations.hpp>
4 namespace fs=boost::filesystem;
10 #include <boost/archive/text_oarchive.hpp>
11 #include <boost/archive/text_iarchive.hpp>
12 #include <boost/archive/xml_oarchive.hpp>
13 #include <boost/archive/xml_iarchive.hpp>
15 #include "alg/sequence.hpp"
16 #include "mussa_exceptions.hpp"
20 BOOST_AUTO_TEST_CASE( sequence_get_sequence )
23 // make sure that retrieving the sequence doesn't throw an error
24 BOOST_CHECK_EQUAL(s.get_sequence(), std::string() );
27 //! when we try to load a missing file, do we get an error?
28 BOOST_AUTO_TEST_CASE( sequence_load_exception )
31 // there should be errors when we try to load something that doesn't exist
32 BOOST_CHECK_THROW( s.load_fasta("alkejralk", 1, 0, 0), mussa_load_error);
33 BOOST_CHECK_THROW( s.load_annot("alkejralk", 0, 0), mussa_load_error);
36 BOOST_AUTO_TEST_CASE( sequence_eol_conventions )
38 string header(">Header");
39 string line1("AAAAGGGGCCCCTTTTT");
40 string line2("AAAAGGGGCCCCTTTTT");
41 int seq_len = line1.size() + line2.size();
44 cr << header << "\015" << line1 << "\015" << line2 << "\015";
46 seq_cr.load_fasta(cr);
49 crlf << header << "\015\012" << line1 << "\015\012" << line2 << "\015\012";
51 seq_crlf.load_fasta(crlf);
54 lf << header << "\012" << line1 << "\012" << line2 << "\012";
56 seq_lf.load_fasta(lf);
58 BOOST_CHECK_EQUAL(seq_cr.size(), seq_len);
59 BOOST_CHECK_EQUAL(seq_crlf.size(), seq_len);
60 BOOST_CHECK_EQUAL(seq_cr.size(), seq_len);
64 //! Do simple operations work correctly?
65 BOOST_AUTO_TEST_CASE( sequence_filter )
67 const char *core_seq = "AATTGGCC";
68 Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
69 BOOST_CHECK_EQUAL(s1, core_seq);
71 Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
72 BOOST_CHECK_EQUAL(s2, "AATTGGCC");
73 BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
74 BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
75 BOOST_CHECK_EQUAL(s2.size(), s2.size());
76 BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
78 Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
79 BOOST_CHECK_EQUAL(s3, "ANNNG");
80 BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
82 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2);
83 BOOST_CHECK_EQUAL(s3, "AA");
84 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
85 BOOST_CHECK_EQUAL( s3, "GG");
86 s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
87 BOOST_CHECK_EQUAL( s3, "CCTT");
90 BOOST_CHECK_EQUAL(s3, "AAGGNN");
93 BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
95 std::string agct("AGCT");
96 Sequence seq(agct, Sequence::nucleic_alphabet);
97 BOOST_CHECK_EQUAL(seq.size(), agct.size());
98 BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
100 std::string bdv("BDv");
101 Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
102 BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
103 BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
107 BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
109 std::string agct("AGCT");
111 BOOST_CHECK_EQUAL(seq.size(), agct.size());
112 BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
113 BOOST_CHECK_EQUAL(seq[0], agct[0]);
114 BOOST_CHECK_EQUAL(seq[1], agct[1]);
115 BOOST_CHECK_EQUAL(seq[2], agct[2]);
116 BOOST_CHECK_EQUAL(seq[3], agct[3]);
118 std::string bdv("BDv");
119 Sequence seq_bdv(bdv);
120 BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
121 // default alphabet only allows AGCTUN
122 BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");
125 BOOST_AUTO_TEST_CASE( subseq_names )
127 Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
128 s1.set_species("species");
129 s1.set_fasta_header("a fasta header");
130 Sequence s2 = s1.subseq(2,2);
131 BOOST_CHECK_EQUAL(s2, "GG");
132 BOOST_CHECK_EQUAL(s2.get_species(), s1.get_species());
133 BOOST_CHECK_EQUAL(s2.get_fasta_header(), s1.get_fasta_header());
136 BOOST_AUTO_TEST_CASE( sequence_start_stop )
139 BOOST_CHECK_EQUAL( s1.start(), 0 );
140 BOOST_CHECK_EQUAL( s1.stop(), 0 );
142 std::string seq_string("AAGGCCTT");
143 Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
144 BOOST_CHECK_EQUAL( s2.start(), 0 );
145 BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
147 std::string s3seq_string = seq_string.substr(2,3);
148 Sequence s3 = s2.subseq(2,3);
149 BOOST_CHECK_EQUAL( s3.start(), 2);
150 BOOST_CHECK_EQUAL( s3.stop(), 2+3);
151 BOOST_CHECK_EQUAL( s3.size(), 3);
152 BOOST_CHECK_EQUAL( s3, s3seq_string);
154 std::string s4seq_string = s3seq_string.substr(1,1);
155 Sequence s4 = s3.subseq(1,1);
156 BOOST_CHECK_EQUAL( s4.start(), 1 );
157 BOOST_CHECK_EQUAL( s4.stop(), 1+1);
158 BOOST_CHECK_EQUAL( s4.size(), 1);
159 BOOST_CHECK_EQUAL( s4, s4seq_string);
162 //! Can we load data from a file
163 BOOST_AUTO_TEST_CASE( sequence_load )
165 fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
166 seq_path /= "human_mck_pro.fa";
168 s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
169 BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
170 BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
171 BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
172 "muscle creatine kinase gene (CKMM), "
176 BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
178 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
179 std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
180 std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
181 std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
182 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
183 std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
184 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
185 std::stringstream garbage_fasta(garbage_string);
188 s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
189 BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta,
190 Sequence::reduced_dna_alphabet),
191 sequence_invalid_load_error);
192 BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta,
193 Sequence::reduced_dna_alphabet),
194 sequence_invalid_load_error);
195 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
196 Sequence::reduced_dna_alphabet),
197 sequence_invalid_load_error);
201 BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
203 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
204 std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
205 std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
206 std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
207 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
208 std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
209 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
210 std::stringstream garbage_fasta(garbage_string);
213 s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
214 BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta,
215 Sequence::reduced_rna_alphabet),
216 sequence_invalid_load_error);
217 BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta,
218 Sequence::reduced_rna_alphabet),
219 sequence_invalid_load_error);
220 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta,
221 Sequence::reduced_rna_alphabet),
222 sequence_invalid_load_error);
225 BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
227 std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
228 std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
229 std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
230 std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
231 std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
232 std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
233 std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
234 std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
235 std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
236 std::stringstream garbage_fasta(garbage_string);
240 // there's two copies of reduced_rna_fasta because i didn't feel like
241 // figuring out how to properly reset the read pointer in a stringstream
242 s.load_fasta(reduced_rna_fasta1);
243 specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
244 BOOST_CHECK_EQUAL(s, specific);
246 s.load_fasta(reduced_dna_fasta1);
247 specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
248 BOOST_CHECK_EQUAL(s, specific);
250 BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta),
251 sequence_invalid_load_error);
252 BOOST_CHECK_THROW(s.load_fasta(garbage_fasta),
253 sequence_invalid_load_error);
256 BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
258 std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
259 Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
260 Sequence seqr = seq.rev_comp();
262 BOOST_CHECK( seq != seqr );
263 BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
264 BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
267 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
269 std::string dna_str("AGCTN");
270 Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
271 BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");
272 BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
275 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
277 std::string rna_str("AGCUN");
278 Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
279 BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");
280 BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
283 BOOST_AUTO_TEST_CASE( annotation_load )
285 string annot_data = "human\n"
289 "50\t55 anothername\n"
294 "75\t90\tname2\ttype2\n"
295 "100 120 name-asdf type!@#$%\n"
299 Sequence seq(s, Sequence::reduced_dna_alphabet);
301 //istringstream annot_stream(annot_data);
302 seq.parse_annot(annot_data, 0, 0);
303 std::list<annot> annots_list = seq.annotations();
304 std::vector<annot> annots(annots_list.begin(), annots_list.end());
305 BOOST_REQUIRE_EQUAL( annots.size(), 8);
306 BOOST_CHECK_EQUAL( annots[0].begin, 0 );
307 BOOST_CHECK_EQUAL( annots[0].end, 10 );
308 BOOST_CHECK_EQUAL( annots[0].type, "type");
309 BOOST_CHECK_EQUAL( annots[0].name, "name");
310 BOOST_CHECK_EQUAL( annots[1].name, "myf7");
311 BOOST_CHECK_EQUAL( annots[2].name, "myod");
312 BOOST_CHECK_EQUAL( annots[3].name, "anothername");
313 BOOST_CHECK_EQUAL( annots[4].name, "backward");
314 BOOST_CHECK_EQUAL( annots[5].name, "name2");
315 BOOST_CHECK_EQUAL( annots[5].end, 90);
316 BOOST_CHECK_EQUAL( annots[6].begin, 100);
317 BOOST_CHECK_EQUAL( annots[6].end, 120);
318 BOOST_CHECK_EQUAL( annots[6].name, "name-asdf");
319 BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%");
320 // sequence defined annotations will always be after the
321 // absolute positions
322 BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf");
323 BOOST_CHECK_EQUAL( annots[7].begin, 100);
325 //BOOST_CHECK_EQUAL( annots
329 BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
331 // this actually is basically what's returned by UCSC
332 // (well actually with some of the sequence and copies of fasta blocks
333 // removed to make the example shorter
334 string annot_data = "\n"
336 ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
337 "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
338 ">hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
339 "GGTGGAGACGACCTGGACCCTAACTACGT\n"
347 "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
349 "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
350 Sequence seq(s, Sequence::reduced_dna_alphabet);
351 seq.parse_annot(annot_data);
352 std::list<annot> annots = seq.annotations();
353 BOOST_CHECK_EQUAL( annots.size(), 2);
356 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
358 string annot_data = "0 10 name type\n"
361 "50\t55 anothername\n"
366 "75\t90\tname2\ttype2\n"
367 "100 120 name-asdf type!@#$%\n"
371 Sequence seq(s, Sequence::reduced_dna_alphabet);
373 //istringstream annot_stream(annot_data);
374 seq.parse_annot(annot_data, 0, 0);
375 std::list<annot> annots_list = seq.annotations();
376 std::vector<annot> annots(annots_list.begin(), annots_list.end());
377 BOOST_REQUIRE_EQUAL( annots.size(), 8);
378 BOOST_CHECK_EQUAL( annots[0].begin, 0 );
379 BOOST_CHECK_EQUAL( annots[0].end, 10 );
380 BOOST_CHECK_EQUAL( annots[0].type, "type");
383 // ticket:83 when you try to load a sequence from a file that doesn't
384 // have fasta headers it crashes.
385 BOOST_AUTO_TEST_CASE( sequence_past_end )
387 fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq" );
388 seq_path /= "misformated_seq.fa";
390 BOOST_CHECK_THROW( s.load_fasta(seq_path), mussa_load_error );
393 BOOST_AUTO_TEST_CASE ( sequence_empty )
397 BOOST_CHECK_EQUAL( s.empty(), true );
399 BOOST_CHECK_EQUAL( s.empty(), false );
401 BOOST_CHECK_EQUAL( s.empty(), true);
403 BOOST_CHECK_EQUAL( s.empty(), true);
406 BOOST_AUTO_TEST_CASE ( sequence_size )
410 BOOST_CHECK_EQUAL( s.size(), 0);
411 std::string seq_string("AAAGGG");
413 BOOST_CHECK_EQUAL( s.size(), seq_string.size() );
415 BOOST_CHECK_EQUAL( s.size(), 0);
417 BOOST_CHECK_EQUAL( s.size(), 0);
420 BOOST_AUTO_TEST_CASE( sequence_empty_equality )
422 Sequence szero("", Sequence::reduced_dna_alphabet);
423 BOOST_CHECK_EQUAL(szero.empty(), true);
424 BOOST_CHECK_EQUAL(szero, szero);
425 BOOST_CHECK_EQUAL(szero, "");
427 Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
429 BOOST_CHECK_EQUAL(sclear.empty(), true);
430 BOOST_CHECK_EQUAL(sclear, sclear);
431 BOOST_CHECK_EQUAL(sclear, szero);
432 BOOST_CHECK_EQUAL(sclear, "");
435 BOOST_AUTO_TEST_CASE ( sequence_iterators )
437 std::string seq_string = "AAGGCCTTNNTATA";
438 Sequence s(seq_string, Sequence::reduced_dna_alphabet);
439 const Sequence cs(s);
440 std::string::size_type count = 0;
442 std::string::iterator str_itor;
443 Sequence::const_iterator s_itor;
444 Sequence::const_iterator cs_itor;
446 for( str_itor = seq_string.begin(),
448 cs_itor = cs.begin();
449 str_itor != seq_string.end() and
450 s_itor != s.end() and
452 ++str_itor, ++s_itor, ++cs_itor, ++count)
454 BOOST_CHECK_EQUAL ( *str_itor, *s_itor );
455 BOOST_CHECK_EQUAL ( *s_itor, *cs_itor );
456 BOOST_CHECK_EQUAL ( *cs_itor, *str_itor );
458 BOOST_CHECK_EQUAL( seq_string.size(), count );
459 BOOST_CHECK_EQUAL( s.size(), count );
460 BOOST_CHECK_EQUAL( cs.size(), count );
463 BOOST_AUTO_TEST_CASE( sequence_motifs )
466 string bogus("AATTGGAA");
467 Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
469 list<motif>::const_iterator motif_i = s1.motifs().begin();
470 list<motif>::const_iterator motif_end = s1.motifs().end();
472 // do our iterators work?
473 BOOST_CHECK( motif_i == s1.motifs().begin() );
474 BOOST_CHECK( motif_end == s1.motifs().end() );
475 BOOST_CHECK( motif_i == motif_end );
477 // this shouldn't show up
479 BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
480 BOOST_CHECK_EQUAL( s1.motifs().size(), 0 );
483 BOOST_CHECK( s1.motifs().begin() != s1.motifs().end() );
484 BOOST_CHECK_EQUAL( s1.motifs().size(), 2 );
486 for(motif_i = s1.motifs().begin();
487 motif_i != s1.motifs().end();
490 BOOST_CHECK_EQUAL( motif_i->type, "motif" );
491 BOOST_CHECK_EQUAL( motif_i->name, m);
492 BOOST_CHECK_EQUAL( motif_i->sequence, m);
496 BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
498 /* FIXME: enable this when i find a way of passing storing the motif name
499 // does our annotation travel?
500 Sequence motif_seq(m);
501 motif_seq.set_fasta_header("hi");
502 s1.add_motif(motif_seq);
504 BOOST_CHECK_EQUAL(s1.motifs().size(), 2);
505 for(motif_i = s1.motifs().begin();
506 motif_i != s1.motifs().end();
509 BOOST_CHECK_EQUAL( motif_i->type, "motif" );
510 BOOST_CHECK_EQUAL( motif_i->name, "hi");
511 BOOST_CHECK_EQUAL( motif_i->sequence, m);
516 BOOST_AUTO_TEST_CASE( annot_test )
518 annot a(0, 10, "test", "thing");
520 BOOST_CHECK_EQUAL( a.begin, 0 );
521 BOOST_CHECK_EQUAL( a.end, 10 );
522 BOOST_CHECK_EQUAL( a.type, "test" );
523 BOOST_CHECK_EQUAL( a.name, "thing" );
525 motif m(10, "AAGGCC");
526 BOOST_CHECK_EQUAL( m.begin, 10 );
527 BOOST_CHECK_EQUAL( m.type, "motif" );
528 BOOST_CHECK_EQUAL( m.name, "AAGGCC" );
529 BOOST_CHECK_EQUAL( m.end, 10+6 );
532 BOOST_AUTO_TEST_CASE( annotate_from_sequence )
534 string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
535 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
536 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
537 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
538 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
539 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
540 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
541 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
543 string gga("GGACACCTC");
544 Sequence seq(s, Sequence::reduced_dna_alphabet);
546 std::list<Sequence> query_list;
547 std::list<string> string_list;
548 query_list.push_back(Sequence(gc));
549 string_list.push_back(gc);
550 query_list.push_back(Sequence(gga));
551 string_list.push_back(gga);
553 BOOST_CHECK_EQUAL( seq.annotations().size(), 0 );
554 seq.find_sequences(query_list.begin(), query_list.end());
557 for(list<string>::iterator string_i = string_list.begin();
558 string_i != string_list.end();
561 string::size_type pos=0;
562 while(pos != string::npos) {
563 pos = s.find(*string_i, pos);
564 if (pos != string::npos) {
570 BOOST_CHECK_EQUAL(seq.annotations().size(), count);
571 const std::list<annot> &a = seq.annotations();
572 for (std::list<annot>::const_iterator annot_i = a.begin();
576 int count = annot_i->end - annot_i->begin ;
580 BOOST_AUTO_TEST_CASE( subseq_annotation_test )
582 string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
583 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
584 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
585 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
586 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
587 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
588 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
589 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
590 Sequence seq(s, Sequence::reduced_dna_alphabet);
593 seq.add_annotation(annot(0, 10, "0-10", "0-10"));
594 seq.add_annotation(annot(10, 20, "10-20", "10-20"));
595 seq.add_annotation(annot(0, 20, "0-20", "0-20"));
596 seq.add_annotation(annot(8, 12, "8-12", "8-12"));
597 seq.add_annotation(annot(100, 5000, "100-5000", "100-5000"));
599 Sequence subseq = seq.subseq(5, 10);
600 const list<annot> annots = subseq.annotations();
601 // generate some ground truth
603 correct.push_back(annot(0, 5, "0-10", "0-10"));
604 correct.push_back(annot(5,10, "10-20", "10-20"));
605 correct.push_back(annot(0,10, "0-20", "0-20"));
606 correct.push_back(annot(3, 7, "8-12", "8-12"));
607 BOOST_REQUIRE_EQUAL( annots.size(), correct.size() );
609 list<annot>::iterator correct_i = correct.begin();
610 list<annot>::const_iterator annot_i = annots.begin();
611 for(; annot_i != annots.end(); ++annot_i, ++correct_i)
613 BOOST_CHECK( *annot_i == *correct_i );
617 BOOST_AUTO_TEST_CASE( motif_annotation_update )
619 string s("CCGTCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
620 "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
621 "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
622 "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
623 "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
624 "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
625 "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
626 "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
627 Sequence seq(s, Sequence::reduced_dna_alphabet);
629 // starting conditions
630 BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
631 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
632 seq.add_annotation(annot(0, 10, "0-10", "0-10"));
633 seq.add_annotation(annot(10, 20, "10-20", "10-20"));
634 seq.add_annotation(annot(0, 20, "0-20", "0-20"));
635 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
636 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
637 seq.add_motif("CCGTCCC");
638 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
639 BOOST_CHECK_EQUAL(seq.motifs().size(), 1);
641 BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
642 BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
645 BOOST_AUTO_TEST_CASE( out_operator )
647 string s("AAGGCCTT");
648 Sequence seq(s, Sequence::reduced_dna_alphabet);
652 BOOST_CHECK_EQUAL( s, buf.str() );
655 BOOST_AUTO_TEST_CASE( get_name )
657 Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
659 BOOST_CHECK_EQUAL( seq.get_name(), "" );
660 seq.set_species("hooman"); // anyone remember tradewars?
661 BOOST_CHECK_EQUAL( seq.get_name(), "hooman");
662 seq.set_fasta_header("fasta human");
663 BOOST_CHECK_EQUAL( seq.get_name(), "fasta human");
666 BOOST_AUTO_TEST_CASE( serialize_simple )
668 std::string seq_string = "AAGGCCTT";
669 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
670 seq.set_species("ribbet");
671 std::ostringstream oss;
672 // allocate/deallocate serialization components
674 boost::archive::text_oarchive oarchive(oss);
675 const Sequence& const_seq(seq);
676 BOOST_CHECK_EQUAL(seq, const_seq);
677 oarchive << const_seq;
681 std::istringstream iss(oss.str());
682 boost::archive::text_iarchive iarchive(iss);
683 iarchive >> seq_loaded;
685 BOOST_CHECK_EQUAL(seq_loaded, seq);
686 BOOST_CHECK_EQUAL(seq.get_species(), "ribbet");
689 BOOST_AUTO_TEST_CASE( serialize_tree )
691 std::string seq_string = "AAGGCCTT";
692 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
693 seq.set_species("ribbet");
696 annot a1(6,7,"t","t");
697 seq.add_annotation(a1);
699 std::ostringstream oss;
700 // allocate/deallocate serialization components
702 boost::archive::text_oarchive oarchive(oss);
703 const Sequence& const_seq(seq);
704 BOOST_CHECK_EQUAL(seq, const_seq);
705 oarchive << const_seq;
710 std::istringstream iss(oss.str());
711 boost::archive::text_iarchive iarchive(iss);
712 iarchive >> seq_loaded;
714 BOOST_CHECK_EQUAL(seq_loaded, seq);
717 // this writes out an "old" style annotated sequence
718 // with annotations attached as "motifs" and "annots"
719 BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
721 std::string seq_string = "AAGGCCTT";
722 Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
723 seq.set_species("ribbet");
726 annot a1(6,7,"t","t");
727 seq.add_annotation(a1);
729 std::ostringstream oss;
730 // allocate/deallocate serialization components
732 boost::archive::xml_oarchive oarchive(oss);
733 const Sequence& const_seq(seq);
734 BOOST_CHECK_EQUAL(seq, const_seq);
735 oarchive << boost::serialization::make_nvp("root", const_seq);
739 std::istringstream iss(oss.str());
740 boost::archive::xml_iarchive iarchive(iss);
741 iarchive >> boost::serialization::make_nvp("root", seq_loaded);
743 BOOST_CHECK_EQUAL(seq_loaded, seq);
746 BOOST_AUTO_TEST_CASE( serialize_xml_two )
748 std::string seq_string = "AAGGCCTT";
749 Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
752 std::ostringstream oss;
753 // allocate/deallocate serialization components
755 boost::archive::xml_oarchive oarchive(oss);
756 const Sequence& const_seq1(seq1);
757 const Sequence& const_seq2(seq2);
758 oarchive << boost::serialization::make_nvp("seq1", const_seq1);
759 oarchive << boost::serialization::make_nvp("seq2", const_seq2);
761 //std::cout << "xml: " << oss.str() << std::endl;
762 Sequence seq1_loaded;
763 Sequence seq2_loaded;
765 std::istringstream iss(oss.str());
766 boost::archive::xml_iarchive iarchive(iss);
767 iarchive >> boost::serialization::make_nvp("seq1", seq1_loaded);
768 iarchive >> boost::serialization::make_nvp("seq2", seq2_loaded);
770 BOOST_CHECK_EQUAL(seq1_loaded, seq1);
771 BOOST_CHECK_EQUAL(seq2_loaded, seq2);
772 // test if our pointers are the same
773 BOOST_CHECK_EQUAL(seq1_loaded.c_str(), seq2_loaded.c_str());