len(mussa.paths()) should equal the number of found paths
[mussa.git] / alg / test / test_nway.cpp
1 #include <boost/test/auto_unit_test.hpp>
2 #include <boost/filesystem/path.hpp>
3 namespace fs = boost::filesystem;
4
5 #include <string>
6 #include <iostream>
7
8 #include "alg/mussa.hpp"
9 #include "alg/nway_paths.hpp"
10 #include "alg/sequence.hpp"
11
12 using namespace std;
13
14 //! there should be no matches
15 BOOST_AUTO_TEST_CASE( nway_null )
16 {
17   string s0("AAAANNNN");
18   string s1("GGGGNNNN");
19   string s2("TTTTNNNN");
20
21   Mussa analysis;
22   analysis.append_sequence(s0);
23   analysis.append_sequence(s1);
24   analysis.append_sequence(s2);
25   analysis.set_window(4);
26   analysis.set_threshold(3);
27   analysis.analyze();
28   NwayPaths npath = analysis.paths();
29   // we added 3 sequences, but none-matched
30   BOOST_CHECK_EQUAL( npath.sequence_count(), 0); 
31   BOOST_CHECK_EQUAL( npath.size(), 0 );
32   BOOST_CHECK_EQUAL( npath.path_size(), 0 );
33   BOOST_CHECK_EQUAL( npath.refined_path_size(), 0 );
34   // there should be no paths for these sequences
35   for (std::list<ConservedPath >::iterator pathz_i = npath.pathz.begin();
36        pathz_i != npath.pathz.end();
37        ++pathz_i)
38   {
39     BOOST_CHECK_EQUAL( pathz_i->size(), 0);
40   }
41 }
42
43 BOOST_AUTO_TEST_CASE( nway_test )
44 {
45   string s0("ATATGCGC");
46   string s1("GGGGGGGC");
47   Sequence seq1(s1);
48
49   Mussa analysis;
50   analysis.append_sequence(s0);
51   analysis.append_sequence(s1);
52   analysis.set_window(4);
53   analysis.set_threshold(3);
54   analysis.analyze();
55   NwayPaths npath = analysis.paths();
56   BOOST_CHECK_EQUAL( npath.sequence_count(), 2); 
57   BOOST_CHECK_EQUAL( npath.size(), 2 );
58   BOOST_CHECK_EQUAL( npath.path_size(), 2 );
59   BOOST_CHECK_EQUAL( npath.refined_path_size(), 2 );
60   for (std::list<ConservedPath >::iterator pathz_i = npath.pathz.begin();
61        pathz_i != npath.pathz.end();
62        ++pathz_i)
63   {
64     for( ConservedPath::iterator path_i = pathz_i->begin();
65          path_i != pathz_i->end();
66          ++path_i)
67     {      
68       BOOST_CHECK( *path_i == 4 || *path_i == -4);
69     }
70   }
71 }
72
73 BOOST_AUTO_TEST_CASE( nway_refine )
74 {
75   fs::path mupa_path( EXAMPLE_DIR, fs::native );
76   mupa_path /= "mck3test.mupa";
77   Mussa m1;
78   m1.load_mupa_file( mupa_path );
79   m1.analyze();
80   const NwayPaths& npath = m1.paths();
81   //BOOST_CHECK_EQUAL (npath.path_size(), npath.refined_path_size());
82   // FIXME: shouldn't these be equal to start with?
83   BOOST_CHECK(npath.path_size() > npath.refined_path_size());
84   size_t first_refined_size = npath.refined_path_size();
85   BOOST_CHECK( first_refined_size > 0 );
86
87   // we're using a window size 30 (threshold 20) example
88   m1.set_soft_threshold(22);
89   m1.nway();
90   BOOST_CHECK_EQUAL( npath.path_size(), npath.size() );
91   BOOST_CHECK( npath.path_size() > 0 );
92   BOOST_CHECK( npath.refined_path_size() > 0);
93   BOOST_CHECK( npath.refined_path_size() < first_refined_size);
94
95   m1.set_soft_threshold(20);
96   m1.nway();
97   BOOST_CHECK_EQUAL(npath.refined_path_size(), first_refined_size);
98 }
99
100 // The following data causes a crash...
101 // ticket:85 is the user report 
102 // ticket:83 provided the sample data
103 // ticket:64 was my version where I didn't have a consistent way of 
104 // duplicating.
105 BOOST_AUTO_TEST_CASE( nway_threshold_crash )
106 {
107   string seq1 = "CACTCCCTCGAAGCTGCTGT\
108 TCTCTTGTCTGTCTGTCTCTGTCTTGAAGCTCAGCCAAGAAACTTTCCCGTGTCACGCCT\
109 GCGTCCCACCGTGGGGCTCTCTTGGAGCACCCAGGGACACCCAGCGTGCAACAGCCACGG\
110 GAAGCCTTTCTGCCGCCCAGGCCCACAGGTCTCGAGACGCACATGCACGCCTGGGCGTGG\
111 CAGCCTCACAGGGAACACGGGACAGACGCCGGCGACGCGCAGACACACGGACACGCGGAA\
112 GCCAAGCACACTCTGGCGGGTCCCGCAAGGGACGCCGTGGAAGAAAGGAGCCTGTGGCAA\
113 CAGGCGGCCGAGCTGCCGAATTCAGTTGACACGAGGCACAGAAAACAAATATCAAAGATC\
114 TAATAATACAAAACAAACTTGATTAAAACTGGTGCTTAAAGTTTATTACCCACAACTCCA\
115 CAGTCTCTGTGTAAACCACTCGACTCATCTTGTAGCTTATTTTTTTTTAAAGAGGACGTT\
116 TTCTACGGCTGTGGCCCGCCTCTGTGAACCATAGCGGTGTGCGGCGGGGGGTCTGCACCC\
117 GGGTGGGGGACAGAGGGACCTTTAAAGAAAACAAAACTGGACAGAAACAGGAATGTGAGC\
118 TGGGGGAGCTGGCTTGAGTTTCTCAAAAGCCATCGGAAGATGCGAGTTTGTGCCTTTTTT\
119 TTTATTGCTCTGGTGGATTTTTGTGGCTGGGTTTTCTGAAGTCTGAGGAACAATGCCTTA\
120 AGAAAAAACAAACAGCAGGAATCGGTGGGACAGTTTCCTGTGGCCAGCCGAGCCTGGCAG\
121 TGCTGGCACCGCGAGCTGGCCTGACGCCTCAAGCACGGGCACCAGCCGTCATCTCCGGGG\
122 CCAGGGGCTGCAGCCCGGCGGTCCCTGTTTTGCTTTATTGCTGTTTAAGAAAAATGGAGG\
123 TAGTTCCAAAAAAGTGGCAAATCCCGTTGGAGGTTTTGAAGTCCAACAAATTTTAAACGA\
124 ATCCAAAGTGTTCTCACACGTCACATACGATTGAGCATCTCCATCTGGTCGTGAAGCATG\
125 TGGTAGGCACACTTGCAGTGTTACGATCGGAATGCTTTTTATTAAAAGCAAGTAGCATGA\
126 AGTATTGCTTAAATTTTAGGTATAAATAAATATATATATGTATAATATATATTCCAATGT\
127 ATTCCAAGCTAAGAAACTTACTTGATTCTTATGAAATCTTGATAAAATATTTATAATGCA\
128 TTTATAGAAAAAGTATATATATATATATAAAATGAATGCAGATTGCGAAGGTCCCTGCAA\
129 ATGGATGGCTTGTGAATTTGCTCTCAAGGTGCTTATGGAAAGGGATCCTGATTGATTGAA\
130 ATTCATGTTTTCTCAAGCTCCAGATTGGCTAGATTTCAGATCGCCAACACATTCGCCACT\
131 GGGCAACTACCCTACAAGTTTGTACTTTCATTTTAATTATTTTCTAACAGAACCGCTCCC\
132 GTCTCCAAGCCTTCATGCACATATGTACCTAATGAGTTTTTATAGCAAAGAATATAAATT\
133 TGCTGTTGATTTTTGTATGAATTTTTTCACAAAAAGATCCTGAATAAGCATTGTTTTATG\
134 AATTTTACATTTTTCCTCACCATTTAGCAATTTTCTGAATGGTAATAATGTCTAAATCTT\
135 TTTCCTTTCTGAATTCTTGCTTGTACATTTTTTTTTACCTTTCAAAGGTTTTTAATTATT\
136 TTTGTTTTTATTTTTGTACGATGAGTTTTCTGCAGCGTACAGAATTGTTGCTGTCAGATT\
137 CTATTTTCAGAAAGTGAGAGGAGGGACCGTAGGTCTTTTCGGAGTGACACCAACGATTGT\
138 GTCTTTCCTGGTCTGTCCTAGGAGCTGTATAAAGAAGCCCAGGGGCTCTTTTTAACTTTC\
139 AACACTAGTAGTATTACGAGGGGTGGTGTGTTTTTCCCCTCCGTGGCAAGGGCAGGGAGG\
140 GTTGCTTAGGATGCCCGGCCACCCTGGGAGGCTTGCCAGATGCCGGGGGCAGTCAGCATT\
141 AATGAAACTCATGTTTAAACTTCTCTGACCACATCGTCAGGATAGAATTCTAACTTGAGT\
142 TTTCCAAAGACCTTTTGAGCATGTCAGCAATGCATGGGGCACACGTGGGGCTCTTTACCC\
143 ACTTGGGTTTTTCCACTGCAGCCACGTGGCCAGCCCTGGATTTTGGAGCCTGTGGCTGCA\
144 AGGAACCCAGGGACCCTTGTTGCCTGGTGAACCTGCAGGGAGGGTATGATTGCCTGACCA\
145 GGACAGCCAGTCTTTACTCTTTTTCTCTTCAACAGTAACTGACAGTCACGTTTTACTGGT\
146 AACTTATTTTCCAGCACATGAAGCCACCAGTTTCATTCCAAAGTGTATATTGGGTTCAGA\
147 CTTGGGGGCAGAAGTTCAGACACACCGTGCTCAGGAGGGACCCAGAGCCGAGTTTCGGAG\
148 TTTGGTAAAGTTTACAGGGTAGCTTCTGAAATTAACTCAAACTTTTGACCAAATGAGTGC\
149 AGATTCTTGGATTCACTTGGTCACTGGGCTGCTGATGGTCAGCTCTGAGACAGTGGTTTG\
150 AGAGCAGGCAGAACGGTCTTGGGACTTGTTTGACTTTCCCCTCCCTGGTGGCCACTCTTT\
151 GCTCTGAAGCCCAGATTGGCAAGAGGAGCTGGTCCATTCCCCATTCATGGCACAGAGCAG\
152 TGGCAGGGCCCAGCTAGCAGGCTCTTCTGGCCTCCTTGGCCTCATTCTCTGCATAGCCCT\
153 CTGGGGATCCTGCCACCTGCCCTCTTACCCCGCCGTGGCTTATGGGGAGGAATGCATCAT\
154 CTCACTTTTTTTTTTTAAGCAGATGATGGGATAACATGGACTGCTCAGTGGCCAGGTTAT\
155 CAGTGGGGGGACTTAATTCTAATCTCATTCAAATGGAGACGCCCTCTGCAAAGGCCTGGC\
156 AGGGGGAGGCACGTTTCATCTGTCAGCTCACTCCAGCTTCACAAATGTGCTGAGAGCATT\
157 ACTGTGTAGCCTTTTCTTTGAAGACACACTCGGCTCTTCTCCACAGCAAGCGTCCAGGGC\
158 AGATGGCAGAGGATCTGCCTCGGCGTCTGCAGGCGGGACCACGTCAGGGAGGGTTCCTTC\
159 ATGTGTTCTCCCTGTGGGTCCTTGGACCTTTAGCCTTTTTCTTCCTTTGCAAAGGCCTTG\
160 GGGGCACTGGCTGGGAGTCAGCAAGCGAGCACTTTATATCCCTTTGAGGGAAACCCTGAT\
161 GACGCCACTGGGCCTCTTGGCGTCTGCCCTGCCCTCGCGGCTTCCCGCCGTGCCGCAGCG\
162 TGCCCACGTGCCCACGCCCCACCAGCAGGCGGCTGTCCCGGAGGCCGTGGCCCGCTGGGA\
163 CTGGCCGCCCCTCCCCAGCGTCCCAGGGCTCTGGTTCTGGAGGGCCACTTTGTCAAGGTG\
164 TTTCAGTTTTTCTTTACTTCTTTTGAAAATCTGTTTGCAAGGGGAAGGACCATTTCGTAA\
165 TGGTCTGACACAAAAGCAAGTTTGATTTTTGCAGCACTAGCAATGGACTTTGTTGTTTTT\
166 CTTTTTGATCAGAACATTCCTTCTTTACTGGTCACAGCCACGTGCTCATTCCATTCTTCT\
167 TTTTGTAGACTTTGGGCCCACGTGTTTTATGGGCATTGATACATATATAAATATATAGAT\
168 ATAAATATATATGAATATATTTTTTTAAGTTTCCTACACCTGGAGGTTGCATGGACTGTA\
169 CGACCGGCATGACTTTATATTGTATACAGATTTTGCACGCCAAACTCGGCAGCTTTGGGG\
170 AAGAAGAAAAATGCCTTTCTGTTCCCCTCTCATGACATTTGCAGATACAAAAGATGGAAA\
171 TTTTTCTGTAAAACAAAACCTTGAAGGAGAGGAGGGCGGGGAAGTTTGCGTCTTATTGAA\
172 CTTATTCTTAAGAAATTGTACTTTTTATTGTAAGAAAAATAAAAAGGACTACTTAAACAT\
173 TTGTCATATTAAGAAAAAAAGTTTATCTAGCACTTGTGACATACCAATAATAGAGTTTAT\
174 TGTATTTATGTGGAAACAGTGTTTTAGGGAAACTACTCAGAATTCACAGTGAACTGCCTG\
175 TCTCTCTCGAGTTGATTTGGAGGAATTTTGTTTTGTTTTGTTTTGTTTGTTTCCTTTTAT\
176 CTCCTTCCACGGGCCAGGCGAGCGCCGCCCGCCCTCACTGGCCTTGTGACGGTTTATTCT\
177 GATTGAGAACTGGGCGGACTCGAAAGAGTCCCCTTTTCCGCACAGCTGTGTTGACTTTTT\
178 AATTACTTTTAGGTGATGTATGGCTAAGATTTCACTTTAAGCAGTCGTGAACTGTGCGAG\
179 CACTGTGGTTTACAATTATACTTTGCATCGAAAGGAAACCATTTCTTCATTGTAACGAAG\
180 CTGAGCGTGTTCTTAGCTCGGCCTCACTTTGTCTCTGGCATTGATTAAAAGTCTGCTATT";
181   string seq2="ATCCCAGCACATGACAACACTTCAGAAGGG\
182 TCCCCCTGCTGACTGGAGAGCTGGGAATATGGCATTTGGACACTTCATTTGTAAATAGTG\
183 TACATTTTAAACATTGGCTCGAAACTTCAGAGATAAGTCATGGAGAGGACATTGGAGGGG\
184 AGAAATGCAGTTGCTGACTGGGAATTTAAGAATGTGAACTTCTCACTAGAATTGGTATGG\
185 AAAAGCAAAATACTGTAAATAAACTTTTTTTCTAACAATTTGCC";
186   string seq3="GGAGCTGGATGAATGAGAGGCCCCCAGATGCAGAGAGACTGGAGAGGGT\
187 GGGGAGGGGCCCAGCGGCCTTGGTGACAGGCCCAGGGTGGGAGGGGTCGGGGCCCCTGGA\
188 GGGGCAATGGGGAGGTGATGTCTTCTCTCTGCTCAGAGAGCAGGGACTAGGGTAGGACCC\
189 TCACCGCTGCGTCCAGCAGACACTGAACCAGAATTGGAAACGTGCTTGAAACAATCACAC\
190 AGGACACTTTTCTACATTGGTGCAAAATGGAATATTTTGTACATTTTTAAAATGTGATTT\
191 TTGTATATACTTGTATATGTATGCCAATTTGGTGCTTTTTGTAAAGGAACTTTTGTATAA\
192 TAATGCCTGGTCGTTGGGTGACCTGCGATTGTCAGAAAGAGGGGAAGGAAGCCAGGTTGA\
193 TACAGCTGCCCACTTCCTTTCCTGAGCAGGAGGATGGGGTAGCACTCACAGGGACGATGT\
194 GCTGTATTTCAGTGCCTATCCCAGACATACGGGGTGGTAACTGAGTTTGTGTTATATGTT\
195 GTTTTAATAAATGCACAATGCTCTCTTCCTGTTCTTC";
196
197   // now that we've got some data lets see if it crashes
198   Mussa m1;
199   m1.append_sequence(seq1);
200   m1.append_sequence(seq2);
201   m1.append_sequence(seq3);
202
203   m1.set_window(10);
204   m1.set_threshold(8);
205   m1.analyze();
206   m1.set_soft_threshold(10);
207   m1.nway();
208 }