fix more warnings
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include "alg/nway_paths.hpp"
16 #include "alg/conserved_path.hpp"
17 #include "mussa_exceptions.hpp"
18
19 #include <fstream>
20 #include <iostream>
21 #include <stdexcept>
22
23 using namespace std;
24
25 NwayPaths::NwayPaths()
26 {
27 }
28
29 void
30 NwayPaths::setup(int w, int t)
31 {
32   threshold = t;
33   soft_thres = threshold;
34   win_size = w;
35   pathz.clear();
36
37   //cout << "nway: thres = " << threshold
38   //     << ", soft threo = " << soft_thres << endl;
39 }
40
41 void
42 NwayPaths::set_soft_thres(int sft_thres)
43 {
44   soft_thres = sft_thres;
45 }
46
47 int NwayPaths::get_threshold() const
48 {
49   return threshold;
50 }
51
52 int NwayPaths::get_window() const
53 {
54   return win_size;
55 }
56
57 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
58 // doesn't deal with interleaved adjacency
59 void
60 NwayPaths::simple_refine()
61 {
62   // ext_path remembers the first window set in an extending path
63   ExtendedConservedPath ext_path, new_path;
64   list<ConservedPath>::iterator cur_path, next_path;
65   list<ConservedPath>::iterator pathz_i;
66   int win_ext_len = 0;
67   bool extending, end = false;
68
69   refined_pathz.clear();
70
71   //cout << "path number is: " << pathz.size() << endl;
72   pathz_i = pathz.begin();
73
74   // only try to extend when pathz isn't empty.
75   if (pathz_i != pathz.end())
76   {
77     ext_path = ExtendedConservedPath( win_size, *pathz_i);
78
79     while(pathz_i != pathz.end())
80     {
81       // keep track of current path and advance to next path
82       cur_path = pathz_i;
83       ++pathz_i;
84       if (pathz_i == pathz.end())
85         end = true;
86       else
87         next_path = pathz_i;
88
89       if (not end)
90       {
91         // if node for each seq is equal to the next node+1 then for all
92         // sequences then we are extending
93         extending = cur_path->nextTo(*next_path);
94       }
95       else
96         extending = false;
97   
98       if (extending)
99       {
100         win_ext_len++;
101       }
102       else
103       {
104         // add the extend window length as first element and add as refined
105         // now that we have the path to extend save it
106         new_path = ext_path;
107         new_path.extend(win_ext_len);
108         refined_pathz.push_back(new_path);
109         // reset stuff
110         win_ext_len = 0;
111         ext_path = ExtendedConservedPath( win_size, *next_path);
112       }
113     }
114   }
115   //cout << "r_path number is: " << refined_pathz.size() << endl;
116 }
117
118
119 void
120 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
121 {
122   pathz.push_back(ConservedPath(threshold, loaded_path));
123 }
124
125 void
126 NwayPaths::add_path(ConservedPath loaded_path)
127 {
128   pathz.push_back(loaded_path);
129 }
130
131
132 void
133 NwayPaths::save(string save_file_path)
134 {
135   fstream save_file;
136   list<ExtendedConservedPath >::iterator path_i, paths_end;
137
138   save_file.open(save_file_path.c_str(), ios::out);
139
140   save_file << "<Mussa type=flp seq_count=" << sequence_count();
141   save_file << " win=" << win_size;
142   // add a function para new_thres defaults to -1 to later deal with
143   // reanalysis with higher thres - if statement whether to record base thres
144   // or new thres (ie if -1, then base)
145   save_file << " thres=" << threshold << " >\n";
146
147   path_i = refined_pathz.begin();
148   paths_end = refined_pathz.end();
149   //path_i = pathz.begin();
150   //paths_end = pathz.end();
151   while (path_i != paths_end)
152   {
153     ExtendedConservedPath& a_path = *path_i;
154     //cout << a_path.size() << endl;
155     //first entry is the window length of the windows in the path
156     save_file << a_path.window_size << ":";
157     for(size_t i = 0; i != sequence_count(); ++i)
158     {
159       save_file << a_path[i];
160       if (i != sequence_count())
161         save_file << ",";
162     }
163     save_file << endl;
164     ++path_i;
165   }
166
167   save_file << "</Mussa>\n";
168   save_file.close();
169 }
170
171
172 size_t
173 NwayPaths::sequence_count()
174 {
175   if (refined_pathz.begin() == refined_pathz.end() )
176     return 0;
177   else
178     return refined_pathz.begin()->size();
179 }
180
181
182 void
183 NwayPaths::load(string load_file_path)
184 {
185   fstream load_file;
186   string file_data_line, header_data, data, path_node, path_width;
187   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
188   vector<int> loaded_path;
189
190   load_file.open(load_file_path.c_str(), ios::in);
191
192   if (!load_file)
193   {
194     throw mussa_load_error("Sequence File: " + load_file_path + " not found");
195   }
196   else
197   {
198     // get header data
199     // grab mussa tag - discard for now...maybe check in future...
200     getline(load_file,file_data_line);
201     space_split_i = file_data_line.find(" ");
202     file_data_line = file_data_line.substr(space_split_i+1);
203     // grab type tag - need in future to distinguish between flp and vlp paths
204     space_split_i = file_data_line.find(" ");
205     file_data_line = file_data_line.substr(space_split_i+1);
206     // get species/seq number
207     space_split_i = file_data_line.find(" ");
208     header_data = file_data_line.substr(0,space_split_i); 
209     equal_split_i = header_data.find("=");
210     data = file_data_line.substr(equal_split_i+1); 
211     unsigned int species_num = atoi (data.c_str());
212     file_data_line = file_data_line.substr(space_split_i+1);
213     // get window size
214     space_split_i = file_data_line.find(" ");
215     header_data = file_data_line.substr(0,space_split_i); 
216     equal_split_i = header_data.find("=");
217     data = file_data_line.substr(equal_split_i+1); 
218     win_size = atoi (data.c_str());
219     file_data_line = file_data_line.substr(space_split_i+1);
220     // get threshold size
221     space_split_i = file_data_line.find(" ");
222     header_data = file_data_line.substr(0,space_split_i); 
223     equal_split_i = header_data.find("=");
224     data = file_data_line.substr(equal_split_i+1); 
225     threshold = atoi (data.c_str());
226     file_data_line = file_data_line.substr(space_split_i+1);
227     
228     //cout << "seq_num=" << species_num << " win=" << win_size;
229     //cout << " thres=" << threshold << endl;
230     
231     // clear out the current data
232     refined_pathz.clear();
233     
234     int temp;
235     
236     getline(load_file,file_data_line);
237     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
238     {
239       if (file_data_line != "")
240       {
241         loaded_path.clear();
242         colon_split_i = file_data_line.find(":");
243         // whats our window size?
244         path_width = file_data_line.substr(0,colon_split_i); 
245         file_data_line = file_data_line.substr(colon_split_i+1);
246         for(size_t i = 0; i < species_num; i++)
247         {
248           comma_split_i = file_data_line.find(",");
249           path_node = file_data_line.substr(0, comma_split_i); 
250           temp = atoi (path_node.c_str());
251           loaded_path.push_back(temp);
252           file_data_line = file_data_line.substr(comma_split_i+1);
253         }
254         assert (loaded_path.size() == species_num );
255         refined_pathz.push_back(ExtendedConservedPath(atoi(path_width.c_str()), 
256                                                       threshold, 
257                                                       loaded_path));
258       }
259       getline(load_file,file_data_line);
260     }
261     load_file.close();
262   }
263 }
264
265
266 void
267 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_t depth)
268 {
269   list<int> new_nodes, trans_check_nodes;
270   list<int>::iterator new_nodes_i, new_nodes_end;
271   bool trans_check_good;
272
273   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
274   new_nodes_i = new_nodes.begin();
275   new_nodes_end = new_nodes.end();
276   while(new_nodes_i != new_nodes_end)
277   {
278     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
279     // check transitivity with previous nodes in path
280     trans_check_good = true;
281     for(size_t i = 0; i < depth - 1; i++)
282     {
283       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
284       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
285                                             trans_check_nodes.end(),
286                                             *new_nodes_i) ) &&
287            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
288                                            trans_check_nodes.end(),
289                                            *new_nodes_i * -1) ) )
290         trans_check_good = false;
291     }
292
293     if (trans_check_good)
294     {
295       // this makes sure path nodes are recorded with RC status relative to
296       // the base species
297       if ( path[depth-1] >= 0)
298         path.push_back(*new_nodes_i);
299       else
300         path.push_back(*new_nodes_i * -1);
301
302       if (depth < all_comparisons.size() - 1)
303         path_search(all_comparisons, path, depth + 1);
304       else
305         pathz.push_back(path);
306       path.pop_back();
307     } 
308     ++new_nodes_i;
309   }
310 }
311 /* use this if I ever get the friggin seqcomp match lists to sort...
312       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
313                         *new_nodes_i))
314 */
315
316 void
317 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
318 {
319   ConservedPath path;
320   int win_i, window_num;
321   list<int> new_nodes;
322   list<int>::iterator new_nodes_i, new_nodes_end;
323
324   pathz.clear();
325   window_num = all_comparisons[0][1].size();
326   // loop thru all windows in first species
327   for (win_i = 0; win_i < window_num; win_i++)
328   {
329     path.clear();
330     path.push_back(win_i);
331     new_nodes = all_comparisons[0][1].match_locations(path[0]);
332     new_nodes_i = new_nodes.begin();
333     new_nodes_end = new_nodes.end();
334     //if (new_nodes_i != new_nodes_end)
335     //cout << "* species 0 node: " << win_i << endl;
336     path.push_back(0);
337     while(new_nodes_i != new_nodes_end)
338     {
339       //cout << "  * species 1 node: " << *new_nodes_i << endl;
340       path[1] = *new_nodes_i;
341       path_search(all_comparisons, path, 2);
342       ++new_nodes_i;
343     }
344   }
345 }
346
347
348 void
349 NwayPaths::save_old(string save_file_path)
350 {
351   fstream save_file;
352   list<ConservedPath >::iterator path_i, paths_end;
353   size_t i;
354
355   save_file.open(save_file_path.c_str(), ios::app);
356
357   path_i = pathz.begin();
358   paths_end = pathz.end();
359   while(path_i != paths_end)
360   {
361     ConservedPath& a_path = *path_i;
362     //cout << a_path.size() << endl;
363     for(i = 0; i < path_i->size(); ++i)
364       save_file << i << "," << a_path[i] << " ";
365     save_file << endl;
366     ++path_i;
367   }
368   save_file.close();
369 }
370
371
372 /*
373 void
374 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
375 {
376   int win_i, sp_i;
377   vector<list<list<int> > > path_src_tree;
378   list<int> new_nodes;
379   list<int>::iterator node_i, node_end;
380   <list<list<int> > >::iterator branch_i, branch_end;  
381
382   pathz.clear();
383   path_src_tree.reserve(all_comparisons.size()- 1);
384
385
386   // loop thru all windows in first species
387   for (win_i = 0; win_i < window_num; win_i++)
388   {
389     // clear the path search tree
390     for(i = 0; i < all_comparisons.size(); i++)
391       path_src_tree[i].clear();
392
393     // top level kept empty even tho implicity has one entry of the first
394     // species at this window - why bother, adds a little speed
395
396     // get connection list for first species, creating a list of nodes
397     // of second species connected to the first species at this window
398     new_nodes = all_comparisons[0][1];
399     path_src_tree[1].push_back(new_nodes);
400
401     // loop thru rest of species for this window to see if any paths of matches
402     // go across all species
403     // if path search tree becomes empty, break out of loop, no reason to search further
404     sp_i = 1;
405     while ((sp_i < all_comparisons.size()) && (path tree not empty))
406     {
407       branch_i = path_src_tree[1].begin();
408       branch_end = path_src_tree[1].end();
409       while (branch_i != branch_end)
410       {
411         node_i = branch_i->begin();
412         node_end = branch_i->end();
413       }
414
415
416       // loop over all current nodes
417          // get connection list for each node
418          // loop over each previous node in list
419             // get those nodes connection list
420             // intersect previous node connections with current
421
422       ++sp_i;
423     }
424
425     // insert any of the paths found into the master list of paths
426
427     // add no paths if tmp_pathz is empty...
428   }
429 }
430
431 void NwayPaths::refine()
432 {
433 }
434 */
435
436
437 void NwayPaths::print(list<vector<int> >& dump_path)
438 {
439   list<vector<int> >::iterator pathz_i;
440   vector<int>::iterator path_i;
441
442   cout << "printing list of lists\n";
443   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
444   {
445     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
446       cout << *path_i << " ";
447     cout << endl;
448   }
449 }