e7a8474b9ce062d7cafd47cdbe88855d81f3405b
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
22
23 #include <iostream>
24 #include <stdexcept>
25
26 using namespace std;
27
28 NwayPaths::NwayPaths()
29 {
30 }
31
32 NwayPaths::NwayPaths(const NwayPaths &o)
33   : pathz(o.pathz),
34     refined_pathz(o.refined_pathz),
35     threshold(o.threshold),
36     win_size(o.win_size),
37     soft_thres(o.soft_thres),
38     ent_thres(o.ent_thres),
39     c_sequences(o.c_sequences)
40 {
41 }
42
43 void NwayPaths::clear()
44 {
45   c_sequences.clear();
46   pathz.clear();
47   refined_pathz.clear();
48 }
49
50 void
51 NwayPaths::setup(int w, int t)
52 {
53   threshold = t;
54   soft_thres = threshold;
55   win_size = w;
56   clear();
57
58   //cout << "nway: thres = " << threshold
59   //     << ", soft threo = " << soft_thres << endl;
60 }
61
62 void
63 NwayPaths::set_soft_threshold(int sft_thres)
64 {
65   soft_thres = sft_thres;
66 }
67
68 int NwayPaths::get_soft_threshold() const
69 {
70   return soft_thres;
71 }
72
73 int NwayPaths::get_threshold() const
74 {
75   return threshold;
76 }
77
78 int NwayPaths::get_window() const
79 {
80   return win_size;
81 }
82
83 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
84 // doesn't deal with interleaved adjacency
85 void
86 NwayPaths::simple_refine()
87 {
88   // ext_path remembers the first window set in an extending path
89   ConservedPath ext_path, new_path;
90   list<ConservedPath>::iterator cur_path, next_path;
91   list<ConservedPath>::iterator pathz_i;
92   int win_ext_len = 0;
93   bool extending, end = false;
94
95   refined_pathz.clear();
96
97   //cout << "path number is: " << pathz.size() << endl;
98   pathz_i = pathz.begin();
99   int path_count = 0;
100
101   // only try to extend when pathz isn't empty.
102   if (pathz_i != pathz.end())
103   {
104     ext_path = *pathz_i;
105     ++path_count;
106
107     while(pathz_i != pathz.end())
108     {
109       // keep track of current path and advance to next path
110       cur_path = pathz_i;
111       ++pathz_i;
112       ++path_count;
113
114       if (pathz_i == pathz.end()) {
115         end = true;
116         extending = false;
117       } else {
118         next_path = pathz_i;
119         // if node for each seq is equal to the next node+1 then for all
120         // sequences then we are extending
121         extending = cur_path->nextTo(*next_path);
122       }
123
124       if (extending)
125       {
126         win_ext_len++;
127       }
128       else
129       {
130         // add the extend window length as first element and add as refined
131         // now that we have the path to extend save it
132         new_path = ext_path;
133         new_path.extend(win_ext_len);
134         refined_pathz.push_back(new_path);
135         if (not end) {
136           // reset stuff
137           win_ext_len = 0;
138           ext_path = *next_path;
139         }
140       }
141       if ((path_count % 100) == 0)
142         emit progress("refine", path_count-1, pathz.size());
143     }
144   }
145   // this mysterious call tells the dialog box that we're actually done
146   emit progress("refine", pathz.size(), pathz.size());
147   //cout << "r_path number is: " << refined_pathz.size() << endl;
148 }
149
150 void
151 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
152 {
153   pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
154 }
155
156 void
157 NwayPaths::add_path(ConservedPath loaded_path)
158 {
159   pathz.push_back(loaded_path);
160 }
161
162
163 void
164 NwayPaths::save(fs::path save_file_path)
165 {
166   fs::fstream save_file;
167   list<ConservedPath >::iterator path_i, paths_end;
168
169   save_file.open(save_file_path, ios::out);
170
171   save_file << "<Mussa type=flp seq_count=" << sequence_count();
172   save_file << " win=" << win_size;
173   // add a function para new_thres defaults to -1 to later deal with
174   // reanalysis with higher thres - if statement whether to record base thres
175   // or new thres (ie if -1, then base)
176   save_file << " thres=" << threshold << " >\n";
177
178   path_i = refined_pathz.begin();
179   paths_end = refined_pathz.end();
180   //path_i = pathz.begin();
181   //paths_end = pathz.end();
182   while (path_i != paths_end)
183   {
184     ConservedPath& a_path = *path_i;
185     //cout << a_path.size() << endl;
186     //first entry is the window length of the windows in the path
187     save_file << a_path.window_size << ":";
188     for(size_type i = 0; i != sequence_count(); ++i)
189     {
190       save_file << a_path[i];
191       if (i != sequence_count())
192         save_file << ",";
193     }
194     save_file << endl;
195     ++path_i;
196   }
197
198   save_file << "</Mussa>\n";
199   save_file.close();
200 }
201
202
203 NwayPaths::size_type NwayPaths::sequence_count() const
204 {
205   if (refined_pathz.begin() == refined_pathz.end() )
206     return 0;
207   else
208     return refined_pathz.begin()->size();
209 }
210
211 NwayPaths::size_type NwayPaths::size() const
212 {
213   return pathz.size();
214 }  
215
216 void
217 NwayPaths::load(fs::path load_file_path)
218 {
219   fs::fstream load_file;
220   string file_data_line, header_data, data, path_node, path_width;
221   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
222   vector<int> loaded_path;
223
224   load_file.open(load_file_path, ios::in);
225
226   if (!load_file)
227   {
228     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
229   }
230   else
231   {
232     // get header data
233     // grab mussa tag - discard for now...maybe check in future...
234     getline(load_file,file_data_line);
235     space_split_i = file_data_line.find(" ");
236     file_data_line = file_data_line.substr(space_split_i+1);
237     // grab type tag - need in future to distinguish between flp and vlp paths
238     space_split_i = file_data_line.find(" ");
239     file_data_line = file_data_line.substr(space_split_i+1);
240     // get species/seq number
241     space_split_i = file_data_line.find(" ");
242     header_data = file_data_line.substr(0,space_split_i); 
243     equal_split_i = header_data.find("=");
244     data = file_data_line.substr(equal_split_i+1); 
245     unsigned int species_num = atoi (data.c_str());
246     file_data_line = file_data_line.substr(space_split_i+1);
247     // get window size
248     space_split_i = file_data_line.find(" ");
249     header_data = file_data_line.substr(0,space_split_i); 
250     equal_split_i = header_data.find("=");
251     data = file_data_line.substr(equal_split_i+1); 
252     win_size = atoi (data.c_str());
253     file_data_line = file_data_line.substr(space_split_i+1);
254     // get threshold size
255     space_split_i = file_data_line.find(" ");
256     header_data = file_data_line.substr(0,space_split_i); 
257     equal_split_i = header_data.find("=");
258     data = file_data_line.substr(equal_split_i+1); 
259     threshold = atoi (data.c_str());
260     file_data_line = file_data_line.substr(space_split_i+1);
261     
262     //cout << "seq_num=" << species_num << " win=" << win_size;
263     //cout << " thres=" << threshold << endl;
264     
265     // clear out the current data
266     refined_pathz.clear();
267     
268     int temp;
269     
270     getline(load_file,file_data_line);
271     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
272     {
273       if (file_data_line != "")
274       {
275         loaded_path.clear();
276         colon_split_i = file_data_line.find(":");
277         // whats our window size?
278         path_width = file_data_line.substr(0,colon_split_i); 
279         file_data_line = file_data_line.substr(colon_split_i+1);
280         for(size_type i = 0; i < species_num; i++)
281         {
282           comma_split_i = file_data_line.find(",");
283           path_node = file_data_line.substr(0, comma_split_i); 
284           temp = atoi (path_node.c_str());
285           loaded_path.push_back(temp);
286           file_data_line = file_data_line.substr(comma_split_i+1);
287         }
288         assert (loaded_path.size() == species_num );
289         refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()), 
290                                                    threshold, 
291                                                    loaded_path));
292       }
293       getline(load_file,file_data_line);
294     }
295     load_file.close();
296   }
297 }
298
299
300 void
301 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_type depth)
302 {
303   list<int> new_nodes, trans_check_nodes;
304   list<int>::iterator new_nodes_i, new_nodes_end;
305   bool trans_check_good;
306
307   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
308   new_nodes_i = new_nodes.begin();
309   new_nodes_end = new_nodes.end();
310   while(new_nodes_i != new_nodes_end)
311   {
312     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
313     // check transitivity with previous nodes in path
314     trans_check_good = true;
315     for(size_type i = 0; i < depth - 1; i++)
316     {
317       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
318       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
319                                             trans_check_nodes.end(),
320                                             *new_nodes_i) ) &&
321            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
322                                            trans_check_nodes.end(),
323                                            *new_nodes_i * -1) ) )
324         trans_check_good = false;
325     }
326
327     if (trans_check_good)
328     {
329       // this makes sure path nodes are recorded with RC status relative to
330       // the base species
331       if ( path[depth-1] >= 0)
332         path.push_back(*new_nodes_i);
333       else
334         path.push_back(*new_nodes_i * -1);
335
336       if (depth < all_comparisons.size() - 1)
337         path_search(all_comparisons, path, depth + 1);
338       else
339         pathz.push_back(path);
340       path.pop_back();
341     } 
342     ++new_nodes_i;
343   }
344 }
345 /* use this if I ever get the friggin seqcomp match lists to sort...
346       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
347                         *new_nodes_i))
348 */
349
350 void
351 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
352 {
353   ConservedPath path;
354   int win_i, window_num;
355   list<int> new_nodes;
356   list<int>::iterator new_nodes_i, new_nodes_end;
357
358   pathz.clear();
359   window_num = all_comparisons[0][1].size();
360   // loop thru all windows in first species
361   for (win_i = 0; win_i < window_num; win_i++)
362   {
363     path.clear();
364     path.push_back(win_i);
365     new_nodes = all_comparisons[0][1].match_locations(path[0]);
366     new_nodes_i = new_nodes.begin();
367     new_nodes_end = new_nodes.end();
368     //if (new_nodes_i != new_nodes_end)
369     //cout << "* species 0 node: " << win_i << endl;
370     path.push_back(0);
371     while(new_nodes_i != new_nodes_end)
372     {
373       //cout << "  * species 1 node: " << *new_nodes_i << endl;
374       path[1] = *new_nodes_i;
375       path_search(all_comparisons, path, 2);
376       ++new_nodes_i;
377     }
378   }
379 }
380
381
382 void
383 NwayPaths::save_old(fs::path save_file_path)
384 {
385   fs::fstream save_file;
386   list<ConservedPath >::iterator path_i, paths_end;
387   size_type i;
388
389   save_file.open(save_file_path, ios::app);
390
391   path_i = pathz.begin();
392   paths_end = pathz.end();
393   while(path_i != paths_end)
394   {
395     ConservedPath& a_path = *path_i;
396     //cout << a_path.size() << endl;
397     for(i = 0; i < path_i->size(); ++i)
398       save_file << i << "," << a_path[i] << " ";
399     save_file << endl;
400     ++path_i;
401   }
402   save_file.close();
403 }
404
405
406 /*
407 void
408 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
409 {
410   int win_i, sp_i;
411   vector<list<list<int> > > path_src_tree;
412   list<int> new_nodes;
413   list<int>::iterator node_i, node_end;
414   <list<list<int> > >::iterator branch_i, branch_end;  
415
416   pathz.clear();
417   path_src_tree.reserve(all_comparisons.size()- 1);
418
419
420   // loop thru all windows in first species
421   for (win_i = 0; win_i < window_num; win_i++)
422   {
423     // clear the path search tree
424     for(i = 0; i < all_comparisons.size(); i++)
425       path_src_tree[i].clear();
426
427     // top level kept empty even tho implicity has one entry of the first
428     // species at this window - why bother, adds a little speed
429
430     // get connection list for first species, creating a list of nodes
431     // of second species connected to the first species at this window
432     new_nodes = all_comparisons[0][1];
433     path_src_tree[1].push_back(new_nodes);
434
435     // loop thru rest of species for this window to see if any paths of matches
436     // go across all species
437     // if path search tree becomes empty, break out of loop, no reason to search further
438     sp_i = 1;
439     while ((sp_i < all_comparisons.size()) && (path tree not empty))
440     {
441       branch_i = path_src_tree[1].begin();
442       branch_end = path_src_tree[1].end();
443       while (branch_i != branch_end)
444       {
445         node_i = branch_i->begin();
446         node_end = branch_i->end();
447       }
448
449
450       // loop over all current nodes
451          // get connection list for each node
452          // loop over each previous node in list
453             // get those nodes connection list
454             // intersect previous node connections with current
455
456       ++sp_i;
457     }
458
459     // insert any of the paths found into the master list of paths
460
461     // add no paths if tmp_pathz is empty...
462   }
463 }
464
465 void NwayPaths::refine()
466 {
467 }
468 */
469
470
471 void NwayPaths::print(list<vector<int> >& dump_path)
472 {
473   list<vector<int> >::iterator pathz_i;
474   vector<int>::iterator path_i;
475
476   cout << "printing list of lists\n";
477   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
478   {
479     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
480       cout << *path_i << " ";
481     cout << endl;
482   }
483 }