Load saved muway and set to muways soft threshold.
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
22
23 #include <iostream>
24 #include <stdexcept>
25
26 using namespace std;
27
28 NwayPaths::NwayPaths()
29 {
30 }
31
32 NwayPaths::NwayPaths(const NwayPaths &o)
33   : pathz(o.pathz),
34     refined_pathz(o.refined_pathz),
35     threshold(o.threshold),
36     win_size(o.win_size),
37     soft_thres(o.soft_thres),
38     ent_thres(o.ent_thres),
39     c_sequences(o.c_sequences)
40 {
41 }
42
43 void NwayPaths::clear()
44 {
45   c_sequences.clear();
46   pathz.clear();
47   refined_pathz.clear();
48 }
49
50 void
51 NwayPaths::setup(int w, int t)
52 {
53   threshold = t;
54   soft_thres = threshold;
55   win_size = w;
56   clear();
57
58   //cout << "nway: thres = " << threshold
59   //     << ", soft threo = " << soft_thres << endl;
60 }
61
62 void
63 NwayPaths::set_soft_threshold(int sft_thres)
64 {
65   soft_thres = sft_thres;
66 }
67
68 int NwayPaths::get_soft_threshold() const
69 {
70   return soft_thres;
71 }
72
73 int NwayPaths::get_threshold() const
74 {
75   return threshold;
76 }
77
78 int NwayPaths::get_window() const
79 {
80   return win_size;
81 }
82
83 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
84 // doesn't deal with interleaved adjacency
85 void
86 NwayPaths::simple_refine()
87 {
88   // ext_path remembers the first window set in an extending path
89   ConservedPath ext_path, new_path;
90   list<ConservedPath>::iterator cur_path, next_path;
91   list<ConservedPath>::iterator pathz_i;
92   int win_ext_len = 0;
93   bool extending, end = false;
94
95   refined_pathz.clear();
96
97   //cout << "path number is: " << pathz.size() << endl;
98   pathz_i = pathz.begin();
99   int path_count = 0;
100
101   // only try to extend when pathz isn't empty.
102   if (pathz_i != pathz.end())
103   {
104     ext_path = *pathz_i;
105     ++path_count;
106
107     while(pathz_i != pathz.end())
108     {
109       // keep track of current path and advance to next path
110       cur_path = pathz_i;
111       ++pathz_i;
112       ++path_count;
113
114       if (pathz_i == pathz.end()) {
115         end = true;
116         extending = false;
117       } else {
118         next_path = pathz_i;
119         // if node for each seq is equal to the next node+1 then for all
120         // sequences then we are extending
121         extending = cur_path->nextTo(*next_path);
122       }
123
124       if (extending)
125       {
126         win_ext_len++;
127       }
128       else
129       {
130         // add the extend window length as first element and add as refined
131         // now that we have the path to extend save it
132         new_path = ext_path;
133         new_path.extend(win_ext_len);
134         refined_pathz.push_back(new_path);
135         if (not end) {
136           // reset stuff
137           win_ext_len = 0;
138           ext_path = *next_path;
139         }
140       }
141       if ((path_count % 100) == 0)
142         emit progress("refine", path_count-1, pathz.size());
143     }
144   }
145   // this mysterious call tells the dialog box that we're actually done
146   emit progress("refine", pathz.size(), pathz.size());
147   //cout << "r_path number is: " << refined_pathz.size() << endl;
148 }
149
150 void
151 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
152 {
153   pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
154 }
155
156 void
157 NwayPaths::add_path(ConservedPath loaded_path)
158 {
159   pathz.push_back(loaded_path);
160 }
161
162
163 void
164 NwayPaths::save(fs::path save_file_path)
165 {
166   fs::fstream save_file;
167   list<ConservedPath >::iterator path_i, paths_end;
168
169   save_file.open(save_file_path, ios::out);
170
171   save_file << "<Mussa type=flp seq_count=" << sequence_count();
172   save_file << " win=" << win_size;
173   // add a function para new_thres defaults to -1 to later deal with
174   // reanalysis with higher thres - if statement whether to record base thres
175   // or new thres (ie if -1, then base)
176   save_file << " thres=" << threshold << " soft_thres=" << soft_thres << " >\n";
177
178   path_i = refined_pathz.begin();
179   paths_end = refined_pathz.end();
180   //path_i = pathz.begin();
181   //paths_end = pathz.end();
182   while (path_i != paths_end)
183   {
184     ConservedPath& a_path = *path_i;
185     //cout << a_path.size() << endl;
186     //first entry is the window length of the windows in the path
187     save_file << a_path.window_size << ":";
188     for(size_type i = 0; i != sequence_count(); ++i)
189     {
190       save_file << a_path[i];
191       if (i != sequence_count())
192         save_file << ",";
193     }
194     save_file << endl;
195     ++path_i;
196   }
197
198   save_file << "</Mussa>\n";
199   save_file.close();
200 }
201
202
203 NwayPaths::size_type NwayPaths::sequence_count() const
204 {
205   if (refined_pathz.begin() == refined_pathz.end() )
206     return 0;
207   else
208     return refined_pathz.begin()->size();
209 }
210
211 NwayPaths::size_type NwayPaths::size() const
212 {
213   return pathz.size();
214 }  
215
216 void
217 NwayPaths::load(fs::path load_file_path)
218 {
219   fs::fstream load_file;
220   string file_data_line, header_data, data, path_node, path_width;
221   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
222   vector<int> loaded_path;
223
224   load_file.open(load_file_path, ios::in);
225
226   if (!load_file)
227   {
228     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
229   }
230   else
231   {
232     // get header data
233     // grab mussa tag - discard for now...maybe check in future...
234     getline(load_file,file_data_line);
235     space_split_i = file_data_line.find(" ");
236     file_data_line = file_data_line.substr(space_split_i+1);
237     // grab type tag - need in future to distinguish between flp and vlp paths
238     space_split_i = file_data_line.find(" ");
239     file_data_line = file_data_line.substr(space_split_i+1);
240     // get species/seq number
241     space_split_i = file_data_line.find(" ");
242     header_data = file_data_line.substr(0,space_split_i); 
243     equal_split_i = header_data.find("=");
244     data = file_data_line.substr(equal_split_i+1); 
245     unsigned int species_num = atoi (data.c_str());
246     file_data_line = file_data_line.substr(space_split_i+1);
247     // get window size
248     space_split_i = file_data_line.find(" ");
249     header_data = file_data_line.substr(0,space_split_i); 
250     equal_split_i = header_data.find("=");
251     data = file_data_line.substr(equal_split_i+1); 
252     win_size = atoi (data.c_str());
253     file_data_line = file_data_line.substr(space_split_i+1);
254     // get threshold size
255     space_split_i = file_data_line.find(" ");
256     header_data = file_data_line.substr(0,space_split_i); 
257     equal_split_i = header_data.find("=");
258     data = file_data_line.substr(equal_split_i+1); 
259     threshold = atoi (data.c_str());
260     file_data_line = file_data_line.substr(space_split_i+1);
261     // get cur_threshold
262     //std::cout << "file_data_line: " << file_data_line << "\n";
263     //std::cout << "find(\">\"): " << file_data_line.find(">") << "\n";
264     if (file_data_line.find(">") != 0)
265     {
266       space_split_i = file_data_line.find(" ");
267       header_data = file_data_line.substr(0,space_split_i);
268       equal_split_i =  header_data.find("=");
269       data = file_data_line.substr(equal_split_i+1);
270       soft_thres = atoi (data.c_str());
271       file_data_line = file_data_line.substr(space_split_i+1);
272     }
273     else
274     {
275       soft_thres = threshold;
276     }
277     //std::cout << "nway_soft_thres: " << soft_thres << "\n";
278     //cout << "seq_num=" << species_num << " win=" << win_size;
279     //cout << " thres=" << threshold << endl;
280     
281     // clear out the current data
282     refined_pathz.clear();
283     
284     int temp;
285     
286     getline(load_file,file_data_line);
287     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
288     {
289       if (file_data_line != "")
290       {
291         loaded_path.clear();
292         colon_split_i = file_data_line.find(":");
293         // whats our window size?
294         path_width = file_data_line.substr(0,colon_split_i); 
295         file_data_line = file_data_line.substr(colon_split_i+1);
296         for(size_type i = 0; i < species_num; i++)
297         {
298           comma_split_i = file_data_line.find(",");
299           path_node = file_data_line.substr(0, comma_split_i); 
300           temp = atoi (path_node.c_str());
301           loaded_path.push_back(temp);
302           file_data_line = file_data_line.substr(comma_split_i+1);
303         }
304         assert (loaded_path.size() == species_num );
305         refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()), 
306                                                    threshold, 
307                                                    loaded_path));
308       }
309       getline(load_file,file_data_line);
310     }
311     load_file.close();
312   }
313 }
314
315
316 void
317 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_type depth)
318 {
319   list<int> new_nodes, trans_check_nodes;
320   list<int>::iterator new_nodes_i, new_nodes_end;
321   bool trans_check_good;
322
323   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
324   new_nodes_i = new_nodes.begin();
325   new_nodes_end = new_nodes.end();
326   while(new_nodes_i != new_nodes_end)
327   {
328     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
329     // check transitivity with previous nodes in path
330     trans_check_good = true;
331     for(size_type i = 0; i < depth - 1; i++)
332     {
333       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
334       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
335                                             trans_check_nodes.end(),
336                                             *new_nodes_i) ) &&
337            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
338                                            trans_check_nodes.end(),
339                                            *new_nodes_i * -1) ) )
340         trans_check_good = false;
341     }
342
343     if (trans_check_good)
344     {
345       // this makes sure path nodes are recorded with RC status relative to
346       // the base species
347       if ( path[depth-1] >= 0)
348         path.push_back(*new_nodes_i);
349       else
350         path.push_back(*new_nodes_i * -1);
351
352       if (depth < all_comparisons.size() - 1)
353         path_search(all_comparisons, path, depth + 1);
354       else
355         pathz.push_back(path);
356       path.pop_back();
357     } 
358     ++new_nodes_i;
359   }
360 }
361 /* use this if I ever get the friggin seqcomp match lists to sort...
362       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
363                         *new_nodes_i))
364 */
365
366 void
367 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
368 {
369   ConservedPath path;
370   int win_i, window_num;
371   list<int> new_nodes;
372   list<int>::iterator new_nodes_i, new_nodes_end;
373
374   pathz.clear();
375   window_num = all_comparisons[0][1].size();
376   // loop thru all windows in first species
377   for (win_i = 0; win_i < window_num; win_i++)
378   {
379     path.clear();
380     path.push_back(win_i);
381     new_nodes = all_comparisons[0][1].match_locations(path[0]);
382     new_nodes_i = new_nodes.begin();
383     new_nodes_end = new_nodes.end();
384     //if (new_nodes_i != new_nodes_end)
385     //cout << "* species 0 node: " << win_i << endl;
386     path.push_back(0);
387     while(new_nodes_i != new_nodes_end)
388     {
389       //cout << "  * species 1 node: " << *new_nodes_i << endl;
390       path[1] = *new_nodes_i;
391       path_search(all_comparisons, path, 2);
392       ++new_nodes_i;
393     }
394   }
395 }
396
397
398 void
399 NwayPaths::save_old(fs::path save_file_path)
400 {
401   fs::fstream save_file;
402   list<ConservedPath >::iterator path_i, paths_end;
403   size_type i;
404
405   save_file.open(save_file_path, ios::app);
406
407   path_i = pathz.begin();
408   paths_end = pathz.end();
409   while(path_i != paths_end)
410   {
411     ConservedPath& a_path = *path_i;
412     //cout << a_path.size() << endl;
413     for(i = 0; i < path_i->size(); ++i)
414       save_file << i << "," << a_path[i] << " ";
415     save_file << endl;
416     ++path_i;
417   }
418   save_file.close();
419 }
420
421
422 /*
423 void
424 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
425 {
426   int win_i, sp_i;
427   vector<list<list<int> > > path_src_tree;
428   list<int> new_nodes;
429   list<int>::iterator node_i, node_end;
430   <list<list<int> > >::iterator branch_i, branch_end;  
431
432   pathz.clear();
433   path_src_tree.reserve(all_comparisons.size()- 1);
434
435
436   // loop thru all windows in first species
437   for (win_i = 0; win_i < window_num; win_i++)
438   {
439     // clear the path search tree
440     for(i = 0; i < all_comparisons.size(); i++)
441       path_src_tree[i].clear();
442
443     // top level kept empty even tho implicity has one entry of the first
444     // species at this window - why bother, adds a little speed
445
446     // get connection list for first species, creating a list of nodes
447     // of second species connected to the first species at this window
448     new_nodes = all_comparisons[0][1];
449     path_src_tree[1].push_back(new_nodes);
450
451     // loop thru rest of species for this window to see if any paths of matches
452     // go across all species
453     // if path search tree becomes empty, break out of loop, no reason to search further
454     sp_i = 1;
455     while ((sp_i < all_comparisons.size()) && (path tree not empty))
456     {
457       branch_i = path_src_tree[1].begin();
458       branch_end = path_src_tree[1].end();
459       while (branch_i != branch_end)
460       {
461         node_i = branch_i->begin();
462         node_end = branch_i->end();
463       }
464
465
466       // loop over all current nodes
467          // get connection list for each node
468          // loop over each previous node in list
469             // get those nodes connection list
470             // intersect previous node connections with current
471
472       ++sp_i;
473     }
474
475     // insert any of the paths found into the master list of paths
476
477     // add no paths if tmp_pathz is empty...
478   }
479 }
480
481 void NwayPaths::refine()
482 {
483 }
484 */
485
486
487 void NwayPaths::print(list<vector<int> >& dump_path)
488 {
489   list<vector<int> >::iterator pathz_i;
490   vector<int>::iterator path_i;
491
492   cout << "printing list of lists\n";
493   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
494   {
495     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
496       cout << *path_i << " ";
497     cout << endl;
498   }
499 }