419d5edfbf03441171fb3f32add1fdd8f609aced
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
22
23 #include <iostream>
24 #include <stdexcept>
25
26 using namespace std;
27
28 NwayPaths::NwayPaths()
29   : progress_cb(0)
30 {
31 }
32
33 void
34 NwayPaths::setup(int w, int t)
35 {
36   threshold = t;
37   soft_thres = threshold;
38   win_size = w;
39   progress_cb = 0;
40   pathz.clear();
41
42   //cout << "nway: thres = " << threshold
43   //     << ", soft threo = " << soft_thres << endl;
44 }
45
46 void
47 NwayPaths::set_soft_threshold(int sft_thres)
48 {
49   soft_thres = sft_thres;
50 }
51
52 int NwayPaths::get_threshold() const
53 {
54   return threshold;
55 }
56
57 int NwayPaths::get_window() const
58 {
59   return win_size;
60 }
61
62 void NwayPaths::set_progress_callback(analysis_callback cb)
63 {
64   progress_cb = cb;
65 }
66
67 analysis_callback NwayPaths::get_progress_callback() const
68 {
69   return progress_cb;
70 }
71
72 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
73 // doesn't deal with interleaved adjacency
74 void
75 NwayPaths::simple_refine()
76 {
77   // ext_path remembers the first window set in an extending path
78   ConservedPath ext_path, new_path;
79   list<ConservedPath>::iterator cur_path, next_path;
80   list<ConservedPath>::iterator pathz_i;
81   int win_ext_len = 0;
82   bool extending, end = false;
83
84   refined_pathz.clear();
85
86   //cout << "path number is: " << pathz.size() << endl;
87   pathz_i = pathz.begin();
88   int path_count = 0;
89
90   // only try to extend when pathz isn't empty.
91   if (pathz_i != pathz.end())
92   {
93     ext_path = *pathz_i;
94     ++path_count;
95
96     while(pathz_i != pathz.end())
97     {
98       // keep track of current path and advance to next path
99       cur_path = pathz_i;
100       ++pathz_i;
101       ++path_count;
102
103       if (pathz_i == pathz.end()) {
104         end = true;
105         extending = false;
106       } else {
107         next_path = pathz_i;
108         // if node for each seq is equal to the next node+1 then for all
109         // sequences then we are extending
110         extending = cur_path->nextTo(*next_path);
111       }
112
113       if (extending)
114       {
115         win_ext_len++;
116       }
117       else
118       {
119         // add the extend window length as first element and add as refined
120         // now that we have the path to extend save it
121         new_path = ext_path;
122         new_path.extend(win_ext_len);
123         refined_pathz.push_back(new_path);
124         if (not end) {
125           // reset stuff
126           win_ext_len = 0;
127           ext_path = *next_path;
128         }
129       }
130       if (progress_cb) {
131         progress_cb("refine", path_count-1, pathz.size());
132       }
133     }
134   }
135   //cout << "r_path number is: " << refined_pathz.size() << endl;
136 }
137
138
139 void
140 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
141 {
142   pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
143 }
144
145 void
146 NwayPaths::add_path(ConservedPath loaded_path)
147 {
148   pathz.push_back(loaded_path);
149 }
150
151
152 void
153 NwayPaths::save(fs::path save_file_path)
154 {
155   fs::fstream save_file;
156   list<ConservedPath >::iterator path_i, paths_end;
157
158   save_file.open(save_file_path, ios::out);
159
160   save_file << "<Mussa type=flp seq_count=" << sequence_count();
161   save_file << " win=" << win_size;
162   // add a function para new_thres defaults to -1 to later deal with
163   // reanalysis with higher thres - if statement whether to record base thres
164   // or new thres (ie if -1, then base)
165   save_file << " thres=" << threshold << " >\n";
166
167   path_i = refined_pathz.begin();
168   paths_end = refined_pathz.end();
169   //path_i = pathz.begin();
170   //paths_end = pathz.end();
171   while (path_i != paths_end)
172   {
173     ConservedPath& a_path = *path_i;
174     //cout << a_path.size() << endl;
175     //first entry is the window length of the windows in the path
176     save_file << a_path.window_size << ":";
177     for(size_t i = 0; i != sequence_count(); ++i)
178     {
179       save_file << a_path[i];
180       if (i != sequence_count())
181         save_file << ",";
182     }
183     save_file << endl;
184     ++path_i;
185   }
186
187   save_file << "</Mussa>\n";
188   save_file.close();
189 }
190
191
192 size_t
193 NwayPaths::sequence_count()
194 {
195   if (refined_pathz.begin() == refined_pathz.end() )
196     return 0;
197   else
198     return refined_pathz.begin()->size();
199 }
200
201
202 void
203 NwayPaths::load(fs::path load_file_path)
204 {
205   fs::fstream load_file;
206   string file_data_line, header_data, data, path_node, path_width;
207   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
208   vector<int> loaded_path;
209
210   load_file.open(load_file_path, ios::in);
211
212   if (!load_file)
213   {
214     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
215   }
216   else
217   {
218     // get header data
219     // grab mussa tag - discard for now...maybe check in future...
220     getline(load_file,file_data_line);
221     space_split_i = file_data_line.find(" ");
222     file_data_line = file_data_line.substr(space_split_i+1);
223     // grab type tag - need in future to distinguish between flp and vlp paths
224     space_split_i = file_data_line.find(" ");
225     file_data_line = file_data_line.substr(space_split_i+1);
226     // get species/seq number
227     space_split_i = file_data_line.find(" ");
228     header_data = file_data_line.substr(0,space_split_i); 
229     equal_split_i = header_data.find("=");
230     data = file_data_line.substr(equal_split_i+1); 
231     unsigned int species_num = atoi (data.c_str());
232     file_data_line = file_data_line.substr(space_split_i+1);
233     // get window size
234     space_split_i = file_data_line.find(" ");
235     header_data = file_data_line.substr(0,space_split_i); 
236     equal_split_i = header_data.find("=");
237     data = file_data_line.substr(equal_split_i+1); 
238     win_size = atoi (data.c_str());
239     file_data_line = file_data_line.substr(space_split_i+1);
240     // get threshold size
241     space_split_i = file_data_line.find(" ");
242     header_data = file_data_line.substr(0,space_split_i); 
243     equal_split_i = header_data.find("=");
244     data = file_data_line.substr(equal_split_i+1); 
245     threshold = atoi (data.c_str());
246     file_data_line = file_data_line.substr(space_split_i+1);
247     
248     //cout << "seq_num=" << species_num << " win=" << win_size;
249     //cout << " thres=" << threshold << endl;
250     
251     // clear out the current data
252     refined_pathz.clear();
253     
254     int temp;
255     
256     getline(load_file,file_data_line);
257     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
258     {
259       if (file_data_line != "")
260       {
261         loaded_path.clear();
262         colon_split_i = file_data_line.find(":");
263         // whats our window size?
264         path_width = file_data_line.substr(0,colon_split_i); 
265         file_data_line = file_data_line.substr(colon_split_i+1);
266         for(size_t i = 0; i < species_num; i++)
267         {
268           comma_split_i = file_data_line.find(",");
269           path_node = file_data_line.substr(0, comma_split_i); 
270           temp = atoi (path_node.c_str());
271           loaded_path.push_back(temp);
272           file_data_line = file_data_line.substr(comma_split_i+1);
273         }
274         assert (loaded_path.size() == species_num );
275         refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()), 
276                                                    threshold, 
277                                                    loaded_path));
278       }
279       getline(load_file,file_data_line);
280     }
281     load_file.close();
282   }
283 }
284
285
286 void
287 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_t depth)
288 {
289   list<int> new_nodes, trans_check_nodes;
290   list<int>::iterator new_nodes_i, new_nodes_end;
291   bool trans_check_good;
292
293   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
294   new_nodes_i = new_nodes.begin();
295   new_nodes_end = new_nodes.end();
296   while(new_nodes_i != new_nodes_end)
297   {
298     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
299     // check transitivity with previous nodes in path
300     trans_check_good = true;
301     for(size_t i = 0; i < depth - 1; i++)
302     {
303       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
304       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
305                                             trans_check_nodes.end(),
306                                             *new_nodes_i) ) &&
307            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
308                                            trans_check_nodes.end(),
309                                            *new_nodes_i * -1) ) )
310         trans_check_good = false;
311     }
312
313     if (trans_check_good)
314     {
315       // this makes sure path nodes are recorded with RC status relative to
316       // the base species
317       if ( path[depth-1] >= 0)
318         path.push_back(*new_nodes_i);
319       else
320         path.push_back(*new_nodes_i * -1);
321
322       if (depth < all_comparisons.size() - 1)
323         path_search(all_comparisons, path, depth + 1);
324       else
325         pathz.push_back(path);
326       path.pop_back();
327     } 
328     ++new_nodes_i;
329   }
330 }
331 /* use this if I ever get the friggin seqcomp match lists to sort...
332       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
333                         *new_nodes_i))
334 */
335
336 void
337 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
338 {
339   ConservedPath path;
340   int win_i, window_num;
341   list<int> new_nodes;
342   list<int>::iterator new_nodes_i, new_nodes_end;
343
344   pathz.clear();
345   window_num = all_comparisons[0][1].size();
346   // loop thru all windows in first species
347   for (win_i = 0; win_i < window_num; win_i++)
348   {
349     path.clear();
350     path.push_back(win_i);
351     new_nodes = all_comparisons[0][1].match_locations(path[0]);
352     new_nodes_i = new_nodes.begin();
353     new_nodes_end = new_nodes.end();
354     //if (new_nodes_i != new_nodes_end)
355     //cout << "* species 0 node: " << win_i << endl;
356     path.push_back(0);
357     while(new_nodes_i != new_nodes_end)
358     {
359       //cout << "  * species 1 node: " << *new_nodes_i << endl;
360       path[1] = *new_nodes_i;
361       path_search(all_comparisons, path, 2);
362       ++new_nodes_i;
363     }
364   }
365 }
366
367
368 void
369 NwayPaths::save_old(fs::path save_file_path)
370 {
371   fs::fstream save_file;
372   list<ConservedPath >::iterator path_i, paths_end;
373   size_t i;
374
375   save_file.open(save_file_path, ios::app);
376
377   path_i = pathz.begin();
378   paths_end = pathz.end();
379   while(path_i != paths_end)
380   {
381     ConservedPath& a_path = *path_i;
382     //cout << a_path.size() << endl;
383     for(i = 0; i < path_i->size(); ++i)
384       save_file << i << "," << a_path[i] << " ";
385     save_file << endl;
386     ++path_i;
387   }
388   save_file.close();
389 }
390
391
392 /*
393 void
394 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
395 {
396   int win_i, sp_i;
397   vector<list<list<int> > > path_src_tree;
398   list<int> new_nodes;
399   list<int>::iterator node_i, node_end;
400   <list<list<int> > >::iterator branch_i, branch_end;  
401
402   pathz.clear();
403   path_src_tree.reserve(all_comparisons.size()- 1);
404
405
406   // loop thru all windows in first species
407   for (win_i = 0; win_i < window_num; win_i++)
408   {
409     // clear the path search tree
410     for(i = 0; i < all_comparisons.size(); i++)
411       path_src_tree[i].clear();
412
413     // top level kept empty even tho implicity has one entry of the first
414     // species at this window - why bother, adds a little speed
415
416     // get connection list for first species, creating a list of nodes
417     // of second species connected to the first species at this window
418     new_nodes = all_comparisons[0][1];
419     path_src_tree[1].push_back(new_nodes);
420
421     // loop thru rest of species for this window to see if any paths of matches
422     // go across all species
423     // if path search tree becomes empty, break out of loop, no reason to search further
424     sp_i = 1;
425     while ((sp_i < all_comparisons.size()) && (path tree not empty))
426     {
427       branch_i = path_src_tree[1].begin();
428       branch_end = path_src_tree[1].end();
429       while (branch_i != branch_end)
430       {
431         node_i = branch_i->begin();
432         node_end = branch_i->end();
433       }
434
435
436       // loop over all current nodes
437          // get connection list for each node
438          // loop over each previous node in list
439             // get those nodes connection list
440             // intersect previous node connections with current
441
442       ++sp_i;
443     }
444
445     // insert any of the paths found into the master list of paths
446
447     // add no paths if tmp_pathz is empty...
448   }
449 }
450
451 void NwayPaths::refine()
452 {
453 }
454 */
455
456
457 void NwayPaths::print(list<vector<int> >& dump_path)
458 {
459   list<vector<int> >::iterator pathz_i;
460   vector<int>::iterator path_i;
461
462   cout << "printing list of lists\n";
463   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
464   {
465     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
466       cout << *path_i << " ";
467     cout << endl;
468   }
469 }