implement alphabet for sequence
[mussa.git] / alg / test / test_sequence.cpp
1 #include <boost/test/auto_unit_test.hpp>
2 #include <boost/filesystem/path.hpp>
3 #include <boost/filesystem/operations.hpp>
4 namespace fs=boost::filesystem;
5
6 #include <list>
7 #include <iostream>
8 #include <sstream>
9
10 #include <boost/archive/text_oarchive.hpp>
11 #include <boost/archive/text_iarchive.hpp>
12 #include <boost/archive/xml_oarchive.hpp>
13 #include <boost/archive/xml_iarchive.hpp>
14
15 #include "alg/sequence.hpp"
16 #include "mussa_exceptions.hpp"
17
18 using namespace std;
19
20 BOOST_AUTO_TEST_CASE( sequence_get_sequence )
21 {
22         Sequence s;
23         // make sure that retrieving the sequence doesn't throw an error
24         BOOST_CHECK_EQUAL(s.get_sequence(), std::string() );
25 }
26
27 //! when we try to load a missing file, do we get an error?
28 BOOST_AUTO_TEST_CASE( sequence_load_exception )
29 {
30   Sequence s;
31   // there should be errors when we try to load something that doesn't exist
32   BOOST_CHECK_THROW( s.load_fasta("alkejralk", 1, 0, 0), mussa_load_error);
33   BOOST_CHECK_THROW( s.load_annot("alkejralk", 0, 0), mussa_load_error);
34 }
35
36 BOOST_AUTO_TEST_CASE( sequence_eol_conventions )
37 {
38   string header(">Header");
39   string line1("AAAAGGGGCCCCTTTTT");
40   string line2("AAAAGGGGCCCCTTTTT");
41   int seq_len = line1.size() + line2.size();
42
43   stringstream cr;
44   cr << header << "\015" << line1 << "\015" << line2 << "\015";
45   Sequence seq_cr;
46   seq_cr.load_fasta(cr);
47
48   stringstream crlf;
49   crlf << header << "\015\012" << line1 << "\015\012" << line2 << "\015\012";
50   Sequence seq_crlf;
51   seq_crlf.load_fasta(crlf);
52
53   stringstream lf;
54   lf << header << "\012" << line1 << "\012" << line2 << "\012";
55   Sequence seq_lf;
56   seq_lf.load_fasta(lf);
57
58   BOOST_CHECK_EQUAL(seq_cr.size(),   seq_len);
59   BOOST_CHECK_EQUAL(seq_crlf.size(), seq_len);
60   BOOST_CHECK_EQUAL(seq_cr.size(),   seq_len);
61 }
62
63
64 //! Do simple operations work correctly?
65 BOOST_AUTO_TEST_CASE( sequence_filter )
66 {
67   const char *core_seq = "AATTGGCC";
68   Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
69   BOOST_CHECK_EQUAL(s1, core_seq);
70
71   Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
72   BOOST_CHECK_EQUAL(s2, "AATTGGCC");
73   BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
74   BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
75   BOOST_CHECK_EQUAL(s2.size(), s2.size());
76   BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
77
78   Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
79   BOOST_CHECK_EQUAL(s3, "ANNNG");
80   BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
81
82   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2); 
83   BOOST_CHECK_EQUAL(s3, "AA");
84   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
85   BOOST_CHECK_EQUAL( s3, "GG");
86   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
87   BOOST_CHECK_EQUAL( s3, "CCTT");
88
89   s3 = "AAGGFF";
90   BOOST_CHECK_EQUAL(s3, "AAGGNN");
91 }
92
93 BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
94 {
95   std::string agct("AGCT");
96   Sequence seq(agct, Sequence::nucleic_alphabet);
97   BOOST_CHECK_EQUAL(seq.size(), agct.size());
98   BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
99   
100   std::string bdv("BDv");
101   Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
102   BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
103   BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
104   
105 }
106
107 BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
108 {
109   std::string agct("AGCT");
110   Sequence seq(agct);
111   BOOST_CHECK_EQUAL(seq.size(), agct.size());
112   BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
113   BOOST_CHECK_EQUAL(seq[0], agct[0]);
114   BOOST_CHECK_EQUAL(seq[1], agct[1]);
115   BOOST_CHECK_EQUAL(seq[2], agct[2]);
116   BOOST_CHECK_EQUAL(seq[3], agct[3]);
117   
118   std::string bdv("BDv");
119   Sequence seq_bdv(bdv);
120   BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
121   // default alphabet only allows AGCTUN
122   BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");  
123 }
124
125 BOOST_AUTO_TEST_CASE( subseq_names )
126 {
127   Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
128   s1.set_species("species");
129   s1.set_fasta_header("a fasta header");
130   Sequence s2 = s1.subseq(2,2);
131   BOOST_CHECK_EQUAL(s2, "GG");
132   BOOST_CHECK_EQUAL(s2.get_species(), s1.get_species());
133   BOOST_CHECK_EQUAL(s2.get_fasta_header(), s1.get_fasta_header());
134 }
135
136 BOOST_AUTO_TEST_CASE( sequence_start_stop )
137 {
138   Sequence s1;
139   BOOST_CHECK_EQUAL( s1.start(), 0 );
140   BOOST_CHECK_EQUAL( s1.stop(), 0 );
141
142   std::string seq_string("AAGGCCTT");
143   Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
144   BOOST_CHECK_EQUAL( s2.start(), 0 );
145   BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
146
147   std::string s3seq_string = seq_string.substr(2,3);
148   Sequence s3 = s2.subseq(2,3);
149   BOOST_CHECK_EQUAL( s3.start(), 2);
150   BOOST_CHECK_EQUAL( s3.stop(), 2+3);
151   BOOST_CHECK_EQUAL( s3.size(), 3);
152   BOOST_CHECK_EQUAL( s3, s3seq_string);
153   
154   std::string s4seq_string = s3seq_string.substr(1,1);
155   Sequence s4 = s3.subseq(1,1);
156   BOOST_CHECK_EQUAL( s4.start(), 1 );
157   BOOST_CHECK_EQUAL( s4.stop(), 1+1);
158   BOOST_CHECK_EQUAL( s4.size(), 1);
159   BOOST_CHECK_EQUAL( s4, s4seq_string);
160 }
161
162 //! Can we load data from a file
163 BOOST_AUTO_TEST_CASE( sequence_load )
164 {
165   fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
166   seq_path /=  "human_mck_pro.fa";
167   Sequence s;
168   s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
169   BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
170   BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
171   BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
172                                     "muscle creatine kinase gene (CKMM), "
173                                     "5' flank");
174 }
175
176 BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
177 {
178   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
179   std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
180   std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
181   std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
182   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
183   std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
184   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
185   std::stringstream garbage_fasta(garbage_string);
186   
187   Sequence s;
188   s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
189   BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta, 
190                                  Sequence::reduced_dna_alphabet),
191                     sequence_invalid_load_error);
192   BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta, 
193                                  Sequence::reduced_dna_alphabet),
194                     sequence_invalid_load_error);
195   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
196                                  Sequence::reduced_dna_alphabet),
197                     sequence_invalid_load_error);
198
199 }
200
201 BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
202 {
203   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
204   std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
205   std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
206   std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
207   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
208   std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
209   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
210   std::stringstream garbage_fasta(garbage_string);
211   
212   Sequence s;
213   s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
214   BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta, 
215                                  Sequence::reduced_rna_alphabet),
216                     sequence_invalid_load_error);
217   BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta, 
218                                  Sequence::reduced_rna_alphabet),
219                     sequence_invalid_load_error);
220   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
221                                  Sequence::reduced_rna_alphabet),
222                     sequence_invalid_load_error);
223 }
224
225 BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
226 {
227   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
228   std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
229   std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
230   std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
231   std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
232   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
233   std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
234   std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
235   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
236   std::stringstream garbage_fasta(garbage_string);
237   
238   Sequence s;
239   Sequence specific;
240   // there's two copies of reduced_rna_fasta because i didn't feel like
241   // figuring out how to properly reset the read pointer in a stringstream
242   s.load_fasta(reduced_rna_fasta1);
243   specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
244   BOOST_CHECK_EQUAL(s, specific);
245   
246   s.load_fasta(reduced_dna_fasta1);
247   specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
248   BOOST_CHECK_EQUAL(s, specific);
249   
250   BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta), 
251                     sequence_invalid_load_error);
252   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta), 
253                     sequence_invalid_load_error);
254 }
255
256 BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
257 {
258   std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
259   Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
260   Sequence seqr = seq.rev_comp();
261   
262   BOOST_CHECK( seq != seqr );
263   BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
264   BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
265 }
266
267 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
268 {
269   std::string dna_str("AGCTN");
270   Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
271   BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");  
272   BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
273 }
274
275 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
276 {
277   std::string rna_str("AGCUN");
278   Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
279   BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");  
280   BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
281 }
282
283 BOOST_AUTO_TEST_CASE( annotation_load )
284 {
285   string annot_data = "human\n"
286                       "0 10 name   type\n"
287                       "10 20 myf7\n"
288                       "20 30 myod\n"
289                       "50\t55 anothername\n"
290                       "60 50 backward\n"
291                       ">ident3 asdf\n"
292                       "GCT\n"
293                       "gCTn\n"
294                       "75\t90\tname2\ttype2\n"
295                       "100 120 name-asdf type!@#$%\n"
296                       ;
297   string s(100, 'A');
298   s += "GCTGCTAATT";
299   Sequence seq(s, Sequence::reduced_dna_alphabet);
300                      
301   //istringstream annot_stream(annot_data);
302   seq.parse_annot(annot_data, 0, 0);
303   std::list<annot> annots_list = seq.annotations();
304   std::vector<annot> annots(annots_list.begin(), annots_list.end());
305   BOOST_REQUIRE_EQUAL( annots.size(), 8);
306   BOOST_CHECK_EQUAL( annots[0].begin, 0 );
307   BOOST_CHECK_EQUAL( annots[0].end, 10 );
308   BOOST_CHECK_EQUAL( annots[0].type, "type");
309   BOOST_CHECK_EQUAL( annots[0].name, "name");
310   BOOST_CHECK_EQUAL( annots[1].name, "myf7");
311   BOOST_CHECK_EQUAL( annots[2].name, "myod");
312   BOOST_CHECK_EQUAL( annots[3].name, "anothername");
313   BOOST_CHECK_EQUAL( annots[4].name, "backward");
314   BOOST_CHECK_EQUAL( annots[5].name, "name2");
315   BOOST_CHECK_EQUAL( annots[5].end, 90);
316   BOOST_CHECK_EQUAL( annots[6].begin, 100);
317   BOOST_CHECK_EQUAL( annots[6].end, 120);
318   BOOST_CHECK_EQUAL( annots[6].name, "name-asdf");
319   BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%");
320   // sequence defined annotations will always be after the
321   // absolute positions
322   BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf");
323   BOOST_CHECK_EQUAL( annots[7].begin, 100);
324
325   //BOOST_CHECK_EQUAL( annots
326 }
327
328
329 BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
330 {
331   // this actually is basically what's returned by UCSC
332   // (well actually with some of the sequence and copies of fasta blocks
333   // removed to make the example shorter
334   string annot_data = "\n"
335     "<PRE>\n"
336     ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
337     "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
338     "&gt;hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
339     "GGTGGAGACGACCTGGACCCTAACTACGT\n"
340     "</PRE>\n"
341     "\n"
342     "</BODY>\n"
343     "</HTML>\n"
344     ;
345
346   string s = 
347     "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
348     "AAAAA"
349     "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
350   Sequence seq(s, Sequence::reduced_dna_alphabet);
351   seq.parse_annot(annot_data);
352   std::list<annot> annots = seq.annotations();
353   BOOST_CHECK_EQUAL( annots.size(), 2);
354 }
355
356 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
357 {
358   string annot_data = "0 10 name   type\n"
359                       "10 20 myf7\n"
360                       "20 30 myod\n"
361                       "50\t55 anothername\n"
362                       "60 50 backward\n"
363                       ">ident3 asdf\n"
364                       "GCT\n"
365                       "gCTn\n"
366                       "75\t90\tname2\ttype2\n"
367                       "100 120 name-asdf type!@#$%\n"
368                       ;
369   string s(100, 'A');
370   s += "GCTGCTAATT";
371   Sequence seq(s, Sequence::reduced_dna_alphabet);
372                      
373   //istringstream annot_stream(annot_data);
374   seq.parse_annot(annot_data, 0, 0);
375   std::list<annot> annots_list = seq.annotations();
376   std::vector<annot> annots(annots_list.begin(), annots_list.end());
377   BOOST_REQUIRE_EQUAL( annots.size(), 8);
378   BOOST_CHECK_EQUAL( annots[0].begin, 0 );
379   BOOST_CHECK_EQUAL( annots[0].end, 10 );
380   BOOST_CHECK_EQUAL( annots[0].type, "type");
381 }
382
383 // ticket:83 when you try to load a sequence from a file that doesn't
384 // have fasta headers it crashes. 
385 BOOST_AUTO_TEST_CASE( sequence_past_end ) 
386 {
387   fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq" );
388   seq_path /=  "misformated_seq.fa";
389   Sequence s;
390   BOOST_CHECK_THROW( s.load_fasta(seq_path), mussa_load_error );
391 }
392
393 BOOST_AUTO_TEST_CASE ( sequence_empty )
394 {
395   
396   Sequence s;
397   BOOST_CHECK_EQUAL( s.empty(), true );
398   s = "AAAGGG";
399   BOOST_CHECK_EQUAL( s.empty(), false );
400   s.clear();
401   BOOST_CHECK_EQUAL( s.empty(), true);
402   s = "";
403   BOOST_CHECK_EQUAL( s.empty(), true);
404 }
405
406 BOOST_AUTO_TEST_CASE ( sequence_size )
407 {
408   
409   Sequence s;
410   BOOST_CHECK_EQUAL( s.size(), 0);
411   std::string seq_string("AAAGGG");
412   s = seq_string;
413   BOOST_CHECK_EQUAL( s.size(), seq_string.size() );
414   s.clear();
415   BOOST_CHECK_EQUAL( s.size(), 0);
416   s = "";
417   BOOST_CHECK_EQUAL( s.size(), 0);
418 }
419
420 BOOST_AUTO_TEST_CASE( sequence_empty_equality )
421 {
422   Sequence szero("", Sequence::reduced_dna_alphabet);
423   BOOST_CHECK_EQUAL(szero.empty(), true);
424   BOOST_CHECK_EQUAL(szero, szero);
425   BOOST_CHECK_EQUAL(szero, "");
426
427   Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
428   sclear.clear();
429   BOOST_CHECK_EQUAL(sclear.empty(), true);
430   BOOST_CHECK_EQUAL(sclear, sclear);
431   BOOST_CHECK_EQUAL(sclear, szero);
432   BOOST_CHECK_EQUAL(sclear, "");
433
434 }
435 BOOST_AUTO_TEST_CASE ( sequence_iterators )
436 {
437   std::string seq_string = "AAGGCCTTNNTATA";
438   Sequence s(seq_string, Sequence::reduced_dna_alphabet);
439   const Sequence cs(s);
440   std::string::size_type count = 0;
441
442   std::string::iterator str_itor;
443   Sequence::const_iterator s_itor;
444   Sequence::const_iterator cs_itor;
445
446   for( str_itor = seq_string.begin(),
447        s_itor   = s.begin(),
448        cs_itor  = cs.begin();
449        str_itor != seq_string.end() and
450        s_itor   != s.end() and
451        cs_itor  != cs.end();
452        ++str_itor, ++s_itor, ++cs_itor, ++count)
453   {
454     BOOST_CHECK_EQUAL ( *str_itor, *s_itor );
455     BOOST_CHECK_EQUAL ( *s_itor, *cs_itor );
456     BOOST_CHECK_EQUAL ( *cs_itor, *str_itor );
457   }
458   BOOST_CHECK_EQUAL( seq_string.size(), count );
459   BOOST_CHECK_EQUAL( s.size(), count );
460   BOOST_CHECK_EQUAL( cs.size(), count );
461 }
462
463 BOOST_AUTO_TEST_CASE( sequence_motifs )
464 {
465   string m("AAAA");
466   string bogus("AATTGGAA");
467   Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
468
469   list<motif>::const_iterator motif_i = s1.motifs().begin();
470   list<motif>::const_iterator motif_end = s1.motifs().end();
471
472   // do our iterators work?
473   BOOST_CHECK( motif_i == s1.motifs().begin() );
474   BOOST_CHECK( motif_end == s1.motifs().end() );
475   BOOST_CHECK( motif_i == motif_end );
476
477   // this shouldn't show up
478   s1.add_motif(bogus);
479   BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
480   BOOST_CHECK_EQUAL( s1.motifs().size(), 0 );
481
482   s1.add_motif(m);
483   BOOST_CHECK( s1.motifs().begin() != s1.motifs().end() );
484   BOOST_CHECK_EQUAL( s1.motifs().size(), 2 );
485
486   for(motif_i = s1.motifs().begin(); 
487       motif_i != s1.motifs().end(); 
488       ++motif_i)
489   {
490     BOOST_CHECK_EQUAL( motif_i->type, "motif" );
491     BOOST_CHECK_EQUAL( motif_i->name, m);
492     BOOST_CHECK_EQUAL( motif_i->sequence, m);
493   }
494
495   s1.clear_motifs();
496   BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
497
498   /* FIXME: enable this when i find a way of passing storing the motif name
499   // does our annotation travel?
500   Sequence motif_seq(m);
501   motif_seq.set_fasta_header("hi");
502   s1.add_motif(motif_seq);
503
504   BOOST_CHECK_EQUAL(s1.motifs().size(), 2);
505   for(motif_i = s1.motifs().begin(); 
506       motif_i != s1.motifs().end(); 
507       ++motif_i)
508   {
509     BOOST_CHECK_EQUAL( motif_i->type, "motif" );
510     BOOST_CHECK_EQUAL( motif_i->name, "hi");
511     BOOST_CHECK_EQUAL( motif_i->sequence, m);
512   }
513   */
514 }
515
516 BOOST_AUTO_TEST_CASE( annot_test )
517 {
518   annot a(0, 10, "test", "thing");
519
520   BOOST_CHECK_EQUAL( a.begin, 0 );
521   BOOST_CHECK_EQUAL( a.end,   10 );
522   BOOST_CHECK_EQUAL( a.type,  "test" );
523   BOOST_CHECK_EQUAL( a.name,  "thing" );
524
525   motif m(10, "AAGGCC");
526   BOOST_CHECK_EQUAL( m.begin, 10 );
527   BOOST_CHECK_EQUAL( m.type, "motif" );
528   BOOST_CHECK_EQUAL( m.name, "AAGGCC" );
529   BOOST_CHECK_EQUAL( m.end,  10+6 );
530 }
531
532 BOOST_AUTO_TEST_CASE( annotate_from_sequence )
533 {
534   string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
535            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
536            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
537            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
538            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
539            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
540            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
541            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
542   string gc("GCCCCC");
543   string gga("GGACACCTC");
544   Sequence seq(s, Sequence::reduced_dna_alphabet);
545
546   std::list<Sequence> query_list;
547   std::list<string> string_list;
548   query_list.push_back(Sequence(gc));
549   string_list.push_back(gc);
550   query_list.push_back(Sequence(gga));
551   string_list.push_back(gga);
552
553   BOOST_CHECK_EQUAL( seq.annotations().size(), 0 );
554   seq.find_sequences(query_list.begin(), query_list.end());
555   
556   int count = 0;
557   for(list<string>::iterator string_i = string_list.begin();
558       string_i != string_list.end();
559       ++string_i)
560   {
561     string::size_type pos=0;
562     while(pos != string::npos) {
563       pos = s.find(*string_i, pos);
564       if (pos != string::npos) {
565         ++count;
566         ++pos;
567       }
568     }
569   }
570   BOOST_CHECK_EQUAL(seq.annotations().size(), count);
571   const std::list<annot> &a = seq.annotations();
572   for (std::list<annot>::const_iterator annot_i = a.begin();
573        annot_i != a.end();
574        ++annot_i)
575   {
576     int count = annot_i->end - annot_i->begin ;
577   }
578 }
579
580 BOOST_AUTO_TEST_CASE( subseq_annotation_test )
581 {
582   string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
583            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
584            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
585            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
586            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
587            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
588            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
589            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
590   Sequence seq(s, Sequence::reduced_dna_alphabet);
591
592
593   seq.add_annotation(annot(0, 10, "0-10", "0-10"));
594   seq.add_annotation(annot(10, 20, "10-20", "10-20"));
595   seq.add_annotation(annot(0, 20, "0-20", "0-20"));
596   seq.add_annotation(annot(8, 12, "8-12", "8-12"));
597   seq.add_annotation(annot(100, 5000, "100-5000", "100-5000"));
598
599   Sequence subseq = seq.subseq(5, 10);
600   const list<annot> annots = subseq.annotations();
601   // generate some ground truth
602   list<annot> correct;
603   correct.push_back(annot(0, 5, "0-10",  "0-10"));
604   correct.push_back(annot(5,10, "10-20", "10-20"));
605   correct.push_back(annot(0,10, "0-20",  "0-20"));
606   correct.push_back(annot(3, 7, "8-12",  "8-12"));
607   BOOST_REQUIRE_EQUAL( annots.size(), correct.size() );
608
609   list<annot>::iterator correct_i = correct.begin();
610   list<annot>::const_iterator annot_i = annots.begin();
611   for(; annot_i != annots.end(); ++annot_i, ++correct_i)
612   {
613     BOOST_CHECK( *annot_i == *correct_i );
614   }
615 }
616
617 BOOST_AUTO_TEST_CASE( motif_annotation_update )
618 {
619   string s("CCGTCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
620            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
621            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
622            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
623            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
624            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
625            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
626            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
627   Sequence seq(s, Sequence::reduced_dna_alphabet);
628
629   // starting conditions
630   BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
631   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
632   seq.add_annotation(annot(0, 10, "0-10", "0-10"));
633   seq.add_annotation(annot(10, 20, "10-20", "10-20"));
634   seq.add_annotation(annot(0, 20, "0-20", "0-20"));
635   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
636   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
637   seq.add_motif("CCGTCCC");
638   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
639   BOOST_CHECK_EQUAL(seq.motifs().size(), 1);
640   seq.clear_motifs();
641   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
642   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
643 }
644
645 BOOST_AUTO_TEST_CASE( out_operator )
646 {
647   string s("AAGGCCTT");
648   Sequence seq(s, Sequence::reduced_dna_alphabet);
649
650   ostringstream buf;
651   buf << s;
652   BOOST_CHECK_EQUAL( s, buf.str() );
653 }
654
655 BOOST_AUTO_TEST_CASE( get_name )
656 {
657   Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
658
659   BOOST_CHECK_EQUAL( seq.get_name(), "" );
660   seq.set_species("hooman"); // anyone remember tradewars?
661   BOOST_CHECK_EQUAL( seq.get_name(), "hooman");
662   seq.set_fasta_header("fasta human");
663   BOOST_CHECK_EQUAL( seq.get_name(), "fasta human");
664 }
665
666 BOOST_AUTO_TEST_CASE( serialize_simple )
667 {
668   std::string seq_string = "AAGGCCTT";
669   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
670   seq.set_species("ribbet");
671   std::ostringstream oss;
672   // allocate/deallocate serialization components
673   {
674     boost::archive::text_oarchive oarchive(oss);
675     const Sequence& const_seq(seq);
676     BOOST_CHECK_EQUAL(seq, const_seq);
677     oarchive << const_seq;
678   }
679   Sequence seq_loaded;
680   {
681     std::istringstream iss(oss.str());
682     boost::archive::text_iarchive iarchive(iss);
683     iarchive >> seq_loaded;
684   }
685   BOOST_CHECK_EQUAL(seq_loaded, seq);
686   BOOST_CHECK_EQUAL(seq.get_species(), "ribbet");
687 }  
688
689 BOOST_AUTO_TEST_CASE( serialize_tree )
690 {
691   std::string seq_string = "AAGGCCTT";
692   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
693   seq.set_species("ribbet");
694   seq.add_motif("AA");
695   seq.add_motif("GC");
696   annot a1(6,7,"t","t");
697   seq.add_annotation(a1);
698
699   std::ostringstream oss;
700   // allocate/deallocate serialization components
701   {
702     boost::archive::text_oarchive oarchive(oss);
703     const Sequence& const_seq(seq);
704     BOOST_CHECK_EQUAL(seq, const_seq);
705     oarchive << const_seq;
706   }
707
708   Sequence seq_loaded;
709   {
710     std::istringstream iss(oss.str());
711     boost::archive::text_iarchive iarchive(iss);
712     iarchive >> seq_loaded;
713   }
714   BOOST_CHECK_EQUAL(seq_loaded, seq);
715 }  
716
717 // this writes out an "old" style annotated sequence
718 // with annotations attached as "motifs" and "annots"
719 BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
720 {
721   std::string seq_string = "AAGGCCTT";
722   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
723   seq.set_species("ribbet");
724   seq.add_motif("AA");
725   seq.add_motif("GC");
726   annot a1(6,7,"t","t");
727   seq.add_annotation(a1);
728
729   std::ostringstream oss;
730   // allocate/deallocate serialization components
731   {
732     boost::archive::xml_oarchive oarchive(oss);
733     const Sequence& const_seq(seq);
734     BOOST_CHECK_EQUAL(seq, const_seq);
735     oarchive << boost::serialization::make_nvp("root", const_seq);
736   }
737   Sequence seq_loaded;
738   {
739     std::istringstream iss(oss.str());
740     boost::archive::xml_iarchive iarchive(iss);
741     iarchive >> boost::serialization::make_nvp("root", seq_loaded);
742   }
743   BOOST_CHECK_EQUAL(seq_loaded, seq);
744 }
745
746 BOOST_AUTO_TEST_CASE( serialize_xml_two )
747 {
748   std::string seq_string = "AAGGCCTT";
749   Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
750   Sequence seq2(seq1);
751
752   std::ostringstream oss;
753   // allocate/deallocate serialization components
754   {
755     boost::archive::xml_oarchive oarchive(oss);
756     const Sequence& const_seq1(seq1);
757     const Sequence& const_seq2(seq2);
758     oarchive << boost::serialization::make_nvp("seq1", const_seq1);
759     oarchive << boost::serialization::make_nvp("seq2", const_seq2);
760   }
761   //std::cout << "xml: " << oss.str() << std::endl;
762   Sequence seq1_loaded;
763   Sequence seq2_loaded;
764   {
765     std::istringstream iss(oss.str());
766     boost::archive::xml_iarchive iarchive(iss);
767     iarchive >> boost::serialization::make_nvp("seq1", seq1_loaded);
768     iarchive >> boost::serialization::make_nvp("seq2", seq2_loaded);
769   }
770   BOOST_CHECK_EQUAL(seq1_loaded, seq1);
771   BOOST_CHECK_EQUAL(seq2_loaded, seq2);
772   // test if our pointers are the same
773   BOOST_CHECK_EQUAL(seq1_loaded.c_str(), seq2_loaded.c_str());
774 }