throw errors when spirit parsing fails
[mussa.git] / alg / test / test_sequence.cpp
1 #include <boost/test/auto_unit_test.hpp>
2 #include <boost/filesystem/path.hpp>
3 #include <boost/filesystem/operations.hpp>
4 namespace fs=boost::filesystem;
5
6 #include <list>
7 #include <iostream>
8 #include <sstream>
9
10 #include <boost/archive/text_oarchive.hpp>
11 #include <boost/archive/text_iarchive.hpp>
12 #include <boost/archive/xml_oarchive.hpp>
13 #include <boost/archive/xml_iarchive.hpp>
14
15 #include "alg/sequence.hpp"
16 #include "mussa_exceptions.hpp"
17
18 using namespace std;
19
20 BOOST_AUTO_TEST_CASE( sequence_get_sequence )
21 {
22         Sequence s;
23         // make sure that retrieving the sequence doesn't throw an error
24         BOOST_CHECK_EQUAL(s.get_sequence(), std::string() );
25 }
26
27 //! when we try to load a missing file, do we get an error?
28 BOOST_AUTO_TEST_CASE( sequence_load_exception )
29 {
30   Sequence s;
31   // there should be errors when we try to load something that doesn't exist
32   BOOST_CHECK_THROW( s.load_fasta("alkejralk", 1, 0, 0), mussa_load_error);
33   BOOST_CHECK_THROW( s.load_annot("alkejralk", 0, 0), mussa_load_error);
34 }
35
36 BOOST_AUTO_TEST_CASE( sequence_eol_conventions )
37 {
38   string header(">Header");
39   string line1("AAAAGGGGCCCCTTTTT");
40   string line2("AAAAGGGGCCCCTTTTT");
41   int seq_len = line1.size() + line2.size();
42
43   stringstream cr;
44   cr << header << "\015" << line1 << "\015" << line2 << "\015";
45   Sequence seq_cr;
46   seq_cr.load_fasta(cr);
47
48   stringstream crlf;
49   crlf << header << "\015\012" << line1 << "\015\012" << line2 << "\015\012";
50   Sequence seq_crlf;
51   seq_crlf.load_fasta(crlf);
52
53   stringstream lf;
54   lf << header << "\012" << line1 << "\012" << line2 << "\012";
55   Sequence seq_lf;
56   seq_lf.load_fasta(lf);
57
58   BOOST_CHECK_EQUAL(seq_cr.size(),   seq_len);
59   BOOST_CHECK_EQUAL(seq_crlf.size(), seq_len);
60   BOOST_CHECK_EQUAL(seq_cr.size(),   seq_len);
61 }
62
63
64 //! Do simple operations work correctly?
65 BOOST_AUTO_TEST_CASE( sequence_filter )
66 {
67   const char *core_seq = "AATTGGCC";
68   Sequence s1(core_seq, Sequence::reduced_dna_alphabet);
69   BOOST_CHECK_EQUAL(s1, core_seq);
70
71   Sequence s2("aattggcc", Sequence::reduced_dna_alphabet);
72   BOOST_CHECK_EQUAL(s2, "AATTGGCC");
73   BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT");
74   BOOST_CHECK_EQUAL(s2.rev_comp(), "ggccaatt"); // we should be case insensitive now
75   BOOST_CHECK_EQUAL(s2.size(), s2.size());
76   BOOST_CHECK_EQUAL(s2.get_sequence(), "aattggcc");
77
78   Sequence s3("asdfg", Sequence::reduced_dna_alphabet);
79   BOOST_CHECK_EQUAL(s3, "ANNNG");
80   BOOST_CHECK_EQUAL(s3.subseq(0,2), "AN");
81
82   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 0, 2); 
83   BOOST_CHECK_EQUAL(s3, "AA");
84   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 2, 2);
85   BOOST_CHECK_EQUAL( s3, "GG");
86   s3.set_filtered_sequence("AAGGCCTT", Sequence::reduced_dna_alphabet, 4);
87   BOOST_CHECK_EQUAL( s3, "CCTT");
88
89   s3 = "AAGGFF";
90   BOOST_CHECK_EQUAL(s3, "AAGGNN");
91 }
92
93 BOOST_AUTO_TEST_CASE( sequence_nucleic_alphabet )
94 {
95   std::string agct("AGCT");
96   Sequence seq(agct, Sequence::nucleic_alphabet);
97   BOOST_CHECK_EQUAL(seq.size(), agct.size());
98   BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
99   
100   std::string bdv("BDv");
101   Sequence seq_bdv(bdv, Sequence::nucleic_alphabet);
102   BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
103   BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), bdv);
104   
105 }
106
107 BOOST_AUTO_TEST_CASE( sequence_default_alphabet )
108 {
109   std::string agct("AGCT");
110   Sequence seq(agct);
111   BOOST_CHECK_EQUAL(seq.size(), agct.size());
112   BOOST_CHECK_EQUAL(seq.get_sequence(), agct);
113   BOOST_CHECK_EQUAL(seq[0], agct[0]);
114   BOOST_CHECK_EQUAL(seq[1], agct[1]);
115   BOOST_CHECK_EQUAL(seq[2], agct[2]);
116   BOOST_CHECK_EQUAL(seq[3], agct[3]);
117   
118   std::string bdv("BDv");
119   Sequence seq_bdv(bdv);
120   BOOST_CHECK_EQUAL(seq_bdv.size(), bdv.size());
121   // default alphabet only allows AGCTUN
122   BOOST_CHECK_EQUAL(seq_bdv.get_sequence(), "NNN");  
123 }
124
125 BOOST_AUTO_TEST_CASE( subseq_names )
126 {
127   Sequence s1("AAGGCCTT", Sequence::reduced_dna_alphabet);
128   s1.set_species("species");
129   s1.set_fasta_header("a fasta header");
130   Sequence s2 = s1.subseq(2,2);
131   BOOST_CHECK_EQUAL(s2, "GG");
132   BOOST_CHECK_EQUAL(s2.get_species(), s1.get_species());
133   BOOST_CHECK_EQUAL(s2.get_fasta_header(), s1.get_fasta_header());
134 }
135
136 BOOST_AUTO_TEST_CASE( sequence_start_stop )
137 {
138   Sequence s1;
139   BOOST_CHECK_EQUAL( s1.start(), 0 );
140   BOOST_CHECK_EQUAL( s1.stop(), 0 );
141
142   std::string seq_string("AAGGCCTT");
143   Sequence s2(seq_string, Sequence::reduced_dna_alphabet);
144   BOOST_CHECK_EQUAL( s2.start(), 0 );
145   BOOST_CHECK_EQUAL( s2.stop(), seq_string.size() );
146
147   std::string s3seq_string = seq_string.substr(2,3);
148   Sequence s3 = s2.subseq(2,3);
149   BOOST_CHECK_EQUAL( s3.start(), 2);
150   BOOST_CHECK_EQUAL( s3.stop(), 2+3);
151   BOOST_CHECK_EQUAL( s3.size(), 3);
152   BOOST_CHECK_EQUAL( s3, s3seq_string);
153   
154   std::string s4seq_string = s3seq_string.substr(1,1);
155   Sequence s4 = s3.subseq(1,1);
156   BOOST_CHECK_EQUAL( s4.start(), 1 );
157   BOOST_CHECK_EQUAL( s4.stop(), 1+1);
158   BOOST_CHECK_EQUAL( s4.size(), 1);
159   BOOST_CHECK_EQUAL( s4, s4seq_string);
160 }
161
162 //! Can we load data from a file
163 BOOST_AUTO_TEST_CASE( sequence_load )
164 {
165   fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq");
166   seq_path /=  "human_mck_pro.fa";
167   Sequence s;
168   s.load_fasta(seq_path, Sequence::reduced_dna_alphabet);
169   BOOST_CHECK_EQUAL(s.subseq(0, 5), "GGATC"); // first few chars of fasta file
170   BOOST_CHECK_EQUAL(s.subseq(2, 3), "ATC");
171   BOOST_CHECK_EQUAL(s.get_fasta_header(), "gi|180579|gb|M21487.1|HUMCKMM1 Human "
172                                     "muscle creatine kinase gene (CKMM), "
173                                     "5' flank");
174 }
175
176 BOOST_AUTO_TEST_CASE( sequence_load_dna_reduced )
177 {
178   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
179   std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
180   std::string invalid_dna_fasta_string(">wrong\nAUSSI\n");
181   std::stringstream invalid_dna_fasta(invalid_dna_fasta_string);
182   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN-\n");
183   std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
184   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
185   std::stringstream garbage_fasta(garbage_string);
186   
187   Sequence s;
188   s.load_fasta(reduced_dna_fasta, Sequence::reduced_dna_alphabet);
189   BOOST_CHECK_THROW(s.load_fasta(invalid_dna_fasta, 
190                                  Sequence::reduced_dna_alphabet),
191                     sequence_invalid_load_error);
192   BOOST_CHECK_THROW(s.load_fasta(reduced_rna_fasta, 
193                                  Sequence::reduced_dna_alphabet),
194                     sequence_invalid_load_error);
195   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
196                                  Sequence::reduced_dna_alphabet),
197                     sequence_invalid_load_error);
198
199 }
200
201 BOOST_AUTO_TEST_CASE( sequence_load_rna_reduced )
202 {
203   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
204   std::stringstream reduced_rna_fasta(reduced_rna_fasta_string);
205   std::string invalid_rna_fasta_string(">wrong\nATSSI\n");
206   std::stringstream invalid_rna_fasta(invalid_rna_fasta_string);
207   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
208   std::stringstream reduced_dna_fasta(reduced_dna_fasta_string);
209   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
210   std::stringstream garbage_fasta(garbage_string);
211   
212   Sequence s;
213   s.load_fasta(reduced_rna_fasta, Sequence::reduced_rna_alphabet);
214   BOOST_CHECK_THROW(s.load_fasta(invalid_rna_fasta, 
215                                  Sequence::reduced_rna_alphabet),
216                     sequence_invalid_load_error);
217   BOOST_CHECK_THROW(s.load_fasta(reduced_dna_fasta, 
218                                  Sequence::reduced_rna_alphabet),
219                     sequence_invalid_load_error);
220   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta, 
221                                  Sequence::reduced_rna_alphabet),
222                     sequence_invalid_load_error);
223 }
224
225 BOOST_AUTO_TEST_CASE( sequence_load_fasta_default )
226 {
227   std::string reduced_rna_fasta_string(">foo\nAAGGCCUUNN\n");
228   std::stringstream reduced_rna_fasta1(reduced_rna_fasta_string);
229   std::stringstream reduced_rna_fasta2(reduced_rna_fasta_string);
230   std::string invalid_nucleotide_fasta_string(">wrong\nATSSI\n");
231   std::stringstream invalid_nucleotide_fasta(invalid_nucleotide_fasta_string);
232   std::string reduced_dna_fasta_string(">foo\nAAGGCCTTNN\n");
233   std::stringstream reduced_dna_fasta1(reduced_dna_fasta_string);
234   std::stringstream reduced_dna_fasta2(reduced_dna_fasta_string);
235   std::string garbage_string(">foo\na34ralk3547oilk,jual;(*&^#%-\n");
236   std::stringstream garbage_fasta(garbage_string);
237   
238   Sequence s;
239   Sequence specific;
240   // there's two copies of reduced_rna_fasta because i didn't feel like
241   // figuring out how to properly reset the read pointer in a stringstream
242   s.load_fasta(reduced_rna_fasta1);
243   specific.load_fasta(reduced_rna_fasta2, Sequence::reduced_nucleic_alphabet);
244   BOOST_CHECK_EQUAL(s, specific);
245   
246   s.load_fasta(reduced_dna_fasta1);
247   specific.load_fasta(reduced_dna_fasta2, Sequence::reduced_nucleic_alphabet);
248   BOOST_CHECK_EQUAL(s, specific);
249   
250   BOOST_CHECK_THROW(s.load_fasta(invalid_nucleotide_fasta), 
251                     sequence_invalid_load_error);
252   BOOST_CHECK_THROW(s.load_fasta(garbage_fasta), 
253                     sequence_invalid_load_error);
254 }
255
256 BOOST_AUTO_TEST_CASE( sequence_reverse_complement )
257 {
258   std::string iupac_symbols("AaCcGgTtRrYySsWwKkMmBbDdHhVvNn-~.?");
259   Sequence seq(iupac_symbols, Sequence::nucleic_alphabet);
260   Sequence seqr = seq.rev_comp();
261   
262   BOOST_CHECK( seq != seqr );
263   BOOST_CHECK_EQUAL( seq, seqr.rev_comp() );
264   BOOST_CHECK_EQUAL( seq.get_sequence(), iupac_symbols );
265 }
266
267 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_dna )
268 {
269   std::string dna_str("AGCTN");
270   Sequence dna_seq(dna_str, Sequence::reduced_dna_alphabet);
271   BOOST_CHECK_EQUAL(dna_seq.rev_comp(), "NAGCT");  
272   BOOST_CHECK_EQUAL(dna_seq, dna_seq.rev_comp().rev_comp());
273 }
274
275 BOOST_AUTO_TEST_CASE( sequence_reverse_complement_rna )
276 {
277   std::string rna_str("AGCUN");
278   Sequence rna_seq(rna_str, Sequence::reduced_rna_alphabet);
279   BOOST_CHECK_EQUAL(rna_seq.rev_comp(), "NAGCU");  
280   BOOST_CHECK_EQUAL(rna_seq, rna_seq.rev_comp().rev_comp());
281 }
282
283 BOOST_AUTO_TEST_CASE( annotation_load )
284 {
285   string annot_data = "human\n"
286                       "0 10 name   type\n"
287                       "10 20 myf7\n"
288                       "20 30 myod\n"
289                       "50\t55 anothername\n"
290                       "60 50 backward\n"
291                       ">ident3 asdf\n"
292                       "GCT\n"
293                       "gCTn\n"
294                       "75\t90\tname2\ttype2\n"
295                       "100 120 name-asdf type!@#$%\n"
296                       ;
297   string s(100, 'A');
298   s += "GCTGCTAATT";
299   Sequence seq(s, Sequence::reduced_dna_alphabet);
300                      
301   //istringstream annot_stream(annot_data);
302   seq.parse_annot(annot_data, 0, 0);
303   std::list<annot> annots_list = seq.annotations();
304   std::vector<annot> annots(annots_list.begin(), annots_list.end());
305   BOOST_REQUIRE_EQUAL( annots.size(), 8);
306   BOOST_CHECK_EQUAL( annots[0].begin, 0 );
307   BOOST_CHECK_EQUAL( annots[0].end, 10 );
308   BOOST_CHECK_EQUAL( annots[0].type, "type");
309   BOOST_CHECK_EQUAL( annots[0].name, "name");
310   BOOST_CHECK_EQUAL( annots[1].name, "myf7");
311   BOOST_CHECK_EQUAL( annots[2].name, "myod");
312   BOOST_CHECK_EQUAL( annots[3].name, "anothername");
313   BOOST_CHECK_EQUAL( annots[4].name, "backward");
314   BOOST_CHECK_EQUAL( annots[5].name, "name2");
315   BOOST_CHECK_EQUAL( annots[5].end, 90);
316   BOOST_CHECK_EQUAL( annots[6].begin, 100);
317   BOOST_CHECK_EQUAL( annots[6].end, 120);
318   BOOST_CHECK_EQUAL( annots[6].name, "name-asdf");
319   BOOST_CHECK_EQUAL( annots[6].type, "type!@#$%");
320   // sequence defined annotations will always be after the
321   // absolute positions
322   BOOST_CHECK_EQUAL( annots[7].name, "ident3 asdf");
323   BOOST_CHECK_EQUAL( annots[7].begin, 100);
324
325   //BOOST_CHECK_EQUAL( annots
326 }
327
328 BOOST_AUTO_TEST_CASE( annotation_broken_load )
329 {
330   string annot_data = "human\n"
331                       "0 10 name   type\n"
332                       "blah60 50 backward\n"
333                       ">ident3 asdf\n"
334                       "GCT\n"
335                       "gCTn\n"
336                       ;
337   string s(100, 'A');
338   s += "GCTGCTAATT";
339   Sequence seq(s, Sequence::reduced_dna_alphabet);
340                      
341   BOOST_CHECK_THROW(seq.parse_annot(annot_data, 0, 0), annotation_load_error);
342   BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
343   }
344
345 BOOST_AUTO_TEST_CASE(annotation_ucsc_html_load)
346 {
347   // this actually is basically what's returned by UCSC
348   // (well actually with some of the sequence and copies of fasta blocks
349   // removed to make the example shorter
350   string annot_data = "\n"
351     "<PRE>\n"
352     ">hg17_knownGene_NM_001824_0 range=chr19:50517919-50517974 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
353     "GGGTCAGTGTCACCTCCAGGATACAGACAG\n"
354     "&gt;hg17_knownGene_NM_001824_3 range=chr19:50510563-50510695 5'pad=0 3'pad=0 revComp=TRUE strand=- repeatMasking=none\n"
355     "GGTGGAGACGACCTGGACCCTAACTACGT\n"
356     "</PRE>\n"
357     "\n"
358     "</BODY>\n"
359     "</HTML>\n"
360     ;
361
362   string s = 
363     "TGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAG"
364     "AAAAA"
365     "GGTGGAGACGACCTGGACCCTAACTACGTGCTCAGCAGCCGCGTCCGCAC";
366   Sequence seq(s, Sequence::reduced_dna_alphabet);
367   seq.parse_annot(annot_data);
368   std::list<annot> annots = seq.annotations();
369   BOOST_CHECK_EQUAL( annots.size(), 2);
370 }
371
372 BOOST_AUTO_TEST_CASE( annotation_load_no_species_name )
373 {
374   string annot_data = "0 10 name   type\n"
375                       "10 20 myf7\n"
376                       "20 30 myod\n"
377                       "50\t55 anothername\n"
378                       "60 50 backward\n"
379                       ">ident3 asdf\n"
380                       "GCT\n"
381                       "gCTn\n"
382                       "75\t90\tname2\ttype2\n"
383                       "100 120 name-asdf type!@#$%\n"
384                       ;
385   string s(100, 'A');
386   s += "GCTGCTAATT";
387   Sequence seq(s, Sequence::reduced_dna_alphabet);
388                      
389   //istringstream annot_stream(annot_data);
390   seq.parse_annot(annot_data, 0, 0);
391   std::list<annot> annots_list = seq.annotations();
392   std::vector<annot> annots(annots_list.begin(), annots_list.end());
393   BOOST_REQUIRE_EQUAL( annots.size(), 8);
394   BOOST_CHECK_EQUAL( annots[0].begin, 0 );
395   BOOST_CHECK_EQUAL( annots[0].end, 10 );
396   BOOST_CHECK_EQUAL( annots[0].type, "type");
397 }
398
399 // ticket:83 when you try to load a sequence from a file that doesn't
400 // have fasta headers it crashes. 
401 BOOST_AUTO_TEST_CASE( sequence_past_end ) 
402 {
403   fs::path seq_path(fs::path(EXAMPLE_DIR, fs::native)/ "seq" );
404   seq_path /=  "misformated_seq.fa";
405   Sequence s;
406   BOOST_CHECK_THROW( s.load_fasta(seq_path), mussa_load_error );
407 }
408
409 BOOST_AUTO_TEST_CASE ( sequence_empty )
410 {
411   
412   Sequence s;
413   BOOST_CHECK_EQUAL( s.empty(), true );
414   s = "AAAGGG";
415   BOOST_CHECK_EQUAL( s.empty(), false );
416   s.clear();
417   BOOST_CHECK_EQUAL( s.empty(), true);
418   s = "";
419   BOOST_CHECK_EQUAL( s.empty(), true);
420 }
421
422 BOOST_AUTO_TEST_CASE ( sequence_size )
423 {
424   
425   Sequence s;
426   BOOST_CHECK_EQUAL( s.size(), 0);
427   std::string seq_string("AAAGGG");
428   s = seq_string;
429   BOOST_CHECK_EQUAL( s.size(), seq_string.size() );
430   s.clear();
431   BOOST_CHECK_EQUAL( s.size(), 0);
432   s = "";
433   BOOST_CHECK_EQUAL( s.size(), 0);
434 }
435
436 BOOST_AUTO_TEST_CASE( sequence_empty_equality )
437 {
438   Sequence szero("", Sequence::reduced_dna_alphabet);
439   BOOST_CHECK_EQUAL(szero.empty(), true);
440   BOOST_CHECK_EQUAL(szero, szero);
441   BOOST_CHECK_EQUAL(szero, "");
442
443   Sequence sclear("AGCT", Sequence::reduced_dna_alphabet);
444   sclear.clear();
445   BOOST_CHECK_EQUAL(sclear.empty(), true);
446   BOOST_CHECK_EQUAL(sclear, sclear);
447   BOOST_CHECK_EQUAL(sclear, szero);
448   BOOST_CHECK_EQUAL(sclear, "");
449
450 }
451 BOOST_AUTO_TEST_CASE ( sequence_iterators )
452 {
453   std::string seq_string = "AAGGCCTTNNTATA";
454   Sequence s(seq_string, Sequence::reduced_dna_alphabet);
455   const Sequence cs(s);
456   std::string::size_type count = 0;
457
458   std::string::iterator str_itor;
459   Sequence::const_iterator s_itor;
460   Sequence::const_iterator cs_itor;
461
462   for( str_itor = seq_string.begin(),
463        s_itor   = s.begin(),
464        cs_itor  = cs.begin();
465        str_itor != seq_string.end() and
466        s_itor   != s.end() and
467        cs_itor  != cs.end();
468        ++str_itor, ++s_itor, ++cs_itor, ++count)
469   {
470     BOOST_CHECK_EQUAL ( *str_itor, *s_itor );
471     BOOST_CHECK_EQUAL ( *s_itor, *cs_itor );
472     BOOST_CHECK_EQUAL ( *cs_itor, *str_itor );
473   }
474   BOOST_CHECK_EQUAL( seq_string.size(), count );
475   BOOST_CHECK_EQUAL( s.size(), count );
476   BOOST_CHECK_EQUAL( cs.size(), count );
477 }
478
479 BOOST_AUTO_TEST_CASE( sequence_motifs )
480 {
481   string m("AAAA");
482   string bogus("AATTGGAA");
483   Sequence s1("AAAAGGGGCCCCTTTT", Sequence::reduced_dna_alphabet);
484
485   list<motif>::const_iterator motif_i = s1.motifs().begin();
486   list<motif>::const_iterator motif_end = s1.motifs().end();
487
488   // do our iterators work?
489   BOOST_CHECK( motif_i == s1.motifs().begin() );
490   BOOST_CHECK( motif_end == s1.motifs().end() );
491   BOOST_CHECK( motif_i == motif_end );
492
493   // this shouldn't show up
494   s1.add_motif(bogus);
495   BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
496   BOOST_CHECK_EQUAL( s1.motifs().size(), 0 );
497
498   s1.add_motif(m);
499   BOOST_CHECK( s1.motifs().begin() != s1.motifs().end() );
500   BOOST_CHECK_EQUAL( s1.motifs().size(), 2 );
501
502   for(motif_i = s1.motifs().begin(); 
503       motif_i != s1.motifs().end(); 
504       ++motif_i)
505   {
506     BOOST_CHECK_EQUAL( motif_i->type, "motif" );
507     BOOST_CHECK_EQUAL( motif_i->name, m);
508     BOOST_CHECK_EQUAL( motif_i->sequence, m);
509   }
510
511   s1.clear_motifs();
512   BOOST_CHECK( s1.motifs().begin() == s1.motifs().end() );
513
514   /* FIXME: enable this when i find a way of passing storing the motif name
515   // does our annotation travel?
516   Sequence motif_seq(m);
517   motif_seq.set_fasta_header("hi");
518   s1.add_motif(motif_seq);
519
520   BOOST_CHECK_EQUAL(s1.motifs().size(), 2);
521   for(motif_i = s1.motifs().begin(); 
522       motif_i != s1.motifs().end(); 
523       ++motif_i)
524   {
525     BOOST_CHECK_EQUAL( motif_i->type, "motif" );
526     BOOST_CHECK_EQUAL( motif_i->name, "hi");
527     BOOST_CHECK_EQUAL( motif_i->sequence, m);
528   }
529   */
530 }
531
532 BOOST_AUTO_TEST_CASE( annot_test )
533 {
534   annot a(0, 10, "test", "thing");
535
536   BOOST_CHECK_EQUAL( a.begin, 0 );
537   BOOST_CHECK_EQUAL( a.end,   10 );
538   BOOST_CHECK_EQUAL( a.type,  "test" );
539   BOOST_CHECK_EQUAL( a.name,  "thing" );
540
541   motif m(10, "AAGGCC");
542   BOOST_CHECK_EQUAL( m.begin, 10 );
543   BOOST_CHECK_EQUAL( m.type, "motif" );
544   BOOST_CHECK_EQUAL( m.name, "AAGGCC" );
545   BOOST_CHECK_EQUAL( m.end,  10+6 );
546 }
547
548 BOOST_AUTO_TEST_CASE( annotate_from_sequence )
549 {
550   string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
551            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
552            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
553            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
554            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
555            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
556            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
557            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
558   string gc("GCCCCC");
559   string gga("GGACACCTC");
560   Sequence seq(s, Sequence::reduced_dna_alphabet);
561
562   std::list<Sequence> query_list;
563   std::list<string> string_list;
564   query_list.push_back(Sequence(gc));
565   string_list.push_back(gc);
566   query_list.push_back(Sequence(gga));
567   string_list.push_back(gga);
568
569   BOOST_CHECK_EQUAL( seq.annotations().size(), 0 );
570   seq.find_sequences(query_list.begin(), query_list.end());
571   
572   int count = 0;
573   for(list<string>::iterator string_i = string_list.begin();
574       string_i != string_list.end();
575       ++string_i)
576   {
577     string::size_type pos=0;
578     while(pos != string::npos) {
579       pos = s.find(*string_i, pos);
580       if (pos != string::npos) {
581         ++count;
582         ++pos;
583       }
584     }
585   }
586   BOOST_CHECK_EQUAL(seq.annotations().size(), count);
587   const std::list<annot> &a = seq.annotations();
588   for (std::list<annot>::const_iterator annot_i = a.begin();
589        annot_i != a.end();
590        ++annot_i)
591   {
592     int count = annot_i->end - annot_i->begin ;
593   }
594 }
595
596 BOOST_AUTO_TEST_CASE( subseq_annotation_test )
597 {
598   string s("CCGCCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
599            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
600            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
601            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
602            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
603            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
604            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
605            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
606   Sequence seq(s, Sequence::reduced_dna_alphabet);
607
608
609   seq.add_annotation(annot(0, 10, "0-10", "0-10"));
610   seq.add_annotation(annot(10, 20, "10-20", "10-20"));
611   seq.add_annotation(annot(0, 20, "0-20", "0-20"));
612   seq.add_annotation(annot(8, 12, "8-12", "8-12"));
613   seq.add_annotation(annot(100, 5000, "100-5000", "100-5000"));
614
615   Sequence subseq = seq.subseq(5, 10);
616   const list<annot> annots = subseq.annotations();
617   // generate some ground truth
618   list<annot> correct;
619   correct.push_back(annot(0, 5, "0-10",  "0-10"));
620   correct.push_back(annot(5,10, "10-20", "10-20"));
621   correct.push_back(annot(0,10, "0-20",  "0-20"));
622   correct.push_back(annot(3, 7, "8-12",  "8-12"));
623   BOOST_REQUIRE_EQUAL( annots.size(), correct.size() );
624
625   list<annot>::iterator correct_i = correct.begin();
626   list<annot>::const_iterator annot_i = annots.begin();
627   for(; annot_i != annots.end(); ++annot_i, ++correct_i)
628   {
629     BOOST_CHECK( *annot_i == *correct_i );
630   }
631 }
632
633 BOOST_AUTO_TEST_CASE( motif_annotation_update )
634 {
635   string s("CCGTCCCCCATCATCGCGGCTCTCCGAGAGTCCCGCGCCCCACTCCCGGC"
636            "ACCCACCTGACCGCGGGCGGCTCCGGCCCCGCTTCGCCCCACTGCGATCA"
637            "GTCGCGTCCCGCAGGCCAGGCACGCCCCGCCGCTCCCGCTGCGCCGGGCG"
638            "TCTGGGACCTCGGGCGGCTCCTCCGAGGGGCGGGGCAGCCGGGAGCCACG"
639            "CCCCCGCAGGTGAGCCGGCCACGCCCACCGCCCGTGGGAAGTTCAGCCTC"
640            "GGGGCTCCAGCCCCGCGGGAATGGCAGAACTTCGCACGCGGAACTGGTAA"
641            "CCTCCAGGACACCTCGAATCAGGGTGATTGTAGCGCAGGGGCCTTGGCCA"
642            "AGCTAAAACTTTGGAAACTTTAGATCCCAGACAGGTGGCTTTCTTGCAGT");
643   Sequence seq(s, Sequence::reduced_dna_alphabet);
644
645   // starting conditions
646   BOOST_CHECK_EQUAL(seq.annotations().size(), 0);
647   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
648   seq.add_annotation(annot(0, 10, "0-10", "0-10"));
649   seq.add_annotation(annot(10, 20, "10-20", "10-20"));
650   seq.add_annotation(annot(0, 20, "0-20", "0-20"));
651   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
652   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
653   seq.add_motif("CCGTCCC");
654   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
655   BOOST_CHECK_EQUAL(seq.motifs().size(), 1);
656   seq.clear_motifs();
657   BOOST_CHECK_EQUAL(seq.annotations().size(), 3);
658   BOOST_CHECK_EQUAL(seq.motifs().size(), 0);
659 }
660
661 BOOST_AUTO_TEST_CASE( out_operator )
662 {
663   string s("AAGGCCTT");
664   Sequence seq(s, Sequence::reduced_dna_alphabet);
665
666   ostringstream buf;
667   buf << s;
668   BOOST_CHECK_EQUAL( s, buf.str() );
669 }
670
671 BOOST_AUTO_TEST_CASE( get_name )
672 {
673   Sequence seq("AAGGCCTT", Sequence::reduced_dna_alphabet);
674
675   BOOST_CHECK_EQUAL( seq.get_name(), "" );
676   seq.set_species("hooman"); // anyone remember tradewars?
677   BOOST_CHECK_EQUAL( seq.get_name(), "hooman");
678   seq.set_fasta_header("fasta human");
679   BOOST_CHECK_EQUAL( seq.get_name(), "fasta human");
680 }
681
682 BOOST_AUTO_TEST_CASE( serialize_simple )
683 {
684   std::string seq_string = "AAGGCCTT";
685   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
686   seq.set_species("ribbet");
687   std::ostringstream oss;
688   // allocate/deallocate serialization components
689   {
690     boost::archive::text_oarchive oarchive(oss);
691     const Sequence& const_seq(seq);
692     BOOST_CHECK_EQUAL(seq, const_seq);
693     oarchive << const_seq;
694   }
695   Sequence seq_loaded;
696   {
697     std::istringstream iss(oss.str());
698     boost::archive::text_iarchive iarchive(iss);
699     iarchive >> seq_loaded;
700   }
701   BOOST_CHECK_EQUAL(seq_loaded, seq);
702   BOOST_CHECK_EQUAL(seq.get_species(), "ribbet");
703 }  
704
705 BOOST_AUTO_TEST_CASE( serialize_tree )
706 {
707   std::string seq_string = "AAGGCCTT";
708   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
709   seq.set_species("ribbet");
710   seq.add_motif("AA");
711   seq.add_motif("GC");
712   annot a1(6,7,"t","t");
713   seq.add_annotation(a1);
714
715   std::ostringstream oss;
716   // allocate/deallocate serialization components
717   {
718     boost::archive::text_oarchive oarchive(oss);
719     const Sequence& const_seq(seq);
720     BOOST_CHECK_EQUAL(seq, const_seq);
721     oarchive << const_seq;
722   }
723
724   Sequence seq_loaded;
725   {
726     std::istringstream iss(oss.str());
727     boost::archive::text_iarchive iarchive(iss);
728     iarchive >> seq_loaded;
729   }
730   BOOST_CHECK_EQUAL(seq_loaded, seq);
731 }  
732
733 // this writes out an "old" style annotated sequence
734 // with annotations attached as "motifs" and "annots"
735 BOOST_AUTO_TEST_CASE( serialize_xml_sequence )
736 {
737   std::string seq_string = "AAGGCCTT";
738   Sequence seq(seq_string, Sequence::reduced_dna_alphabet);
739   seq.set_species("ribbet");
740   seq.add_motif("AA");
741   seq.add_motif("GC");
742   annot a1(6,7,"t","t");
743   seq.add_annotation(a1);
744
745   std::ostringstream oss;
746   // allocate/deallocate serialization components
747   {
748     boost::archive::xml_oarchive oarchive(oss);
749     const Sequence& const_seq(seq);
750     BOOST_CHECK_EQUAL(seq, const_seq);
751     oarchive << boost::serialization::make_nvp("root", const_seq);
752   }
753   Sequence seq_loaded;
754   {
755     std::istringstream iss(oss.str());
756     boost::archive::xml_iarchive iarchive(iss);
757     iarchive >> boost::serialization::make_nvp("root", seq_loaded);
758   }
759   BOOST_CHECK_EQUAL(seq_loaded, seq);
760 }
761
762 BOOST_AUTO_TEST_CASE( serialize_xml_two )
763 {
764   std::string seq_string = "AAGGCCTT";
765   Sequence seq1(seq_string, Sequence::reduced_dna_alphabet);
766   Sequence seq2(seq1);
767
768   std::ostringstream oss;
769   // allocate/deallocate serialization components
770   {
771     boost::archive::xml_oarchive oarchive(oss);
772     const Sequence& const_seq1(seq1);
773     const Sequence& const_seq2(seq2);
774     oarchive << boost::serialization::make_nvp("seq1", const_seq1);
775     oarchive << boost::serialization::make_nvp("seq2", const_seq2);
776   }
777   //std::cout << "xml: " << oss.str() << std::endl;
778   Sequence seq1_loaded;
779   Sequence seq2_loaded;
780   {
781     std::istringstream iss(oss.str());
782     boost::archive::xml_iarchive iarchive(iss);
783     iarchive >> boost::serialization::make_nvp("seq1", seq1_loaded);
784     iarchive >> boost::serialization::make_nvp("seq2", seq2_loaded);
785   }
786   BOOST_CHECK_EQUAL(seq1_loaded, seq1);
787   BOOST_CHECK_EQUAL(seq2_loaded, seq2);
788   // test if our pointers are the same
789   BOOST_CHECK_EQUAL(seq1_loaded.c_str(), seq2_loaded.c_str());
790 }