7b085a0844860164b78ca8520b4294b5ffe14f63
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
22
23 #include <iostream>
24 #include <stdexcept>
25
26 using namespace std;
27
28 NwayPaths::NwayPaths()
29 {
30 }
31
32 NwayPaths::NwayPaths(const NwayPaths &o)
33   : pathz(o.pathz),
34     refined_pathz(o.refined_pathz),
35     threshold(o.threshold),
36     win_size(o.win_size),
37     soft_thres(o.soft_thres),
38     ent_thres(o.ent_thres),
39     c_sequences(o.c_sequences)
40 {
41 }
42
43 void NwayPaths::clear()
44 {
45   c_sequences.clear();
46   pathz.clear();
47   refined_pathz.clear();
48 }
49
50 void
51 NwayPaths::setup(int w, int t)
52 {
53   threshold = t;
54   soft_thres = threshold;
55   win_size = w;
56   clear();
57
58   //cout << "nway: thres = " << threshold
59   //     << ", soft threo = " << soft_thres << endl;
60 }
61
62 void
63 NwayPaths::set_soft_threshold(int sft_thres)
64 {
65   soft_thres = sft_thres;
66 }
67
68 int NwayPaths::get_threshold() const
69 {
70   return threshold;
71 }
72
73 int NwayPaths::get_window() const
74 {
75   return win_size;
76 }
77
78 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
79 // doesn't deal with interleaved adjacency
80 void
81 NwayPaths::simple_refine()
82 {
83   // ext_path remembers the first window set in an extending path
84   ConservedPath ext_path, new_path;
85   list<ConservedPath>::iterator cur_path, next_path;
86   list<ConservedPath>::iterator pathz_i;
87   int win_ext_len = 0;
88   bool extending, end = false;
89
90   refined_pathz.clear();
91
92   //cout << "path number is: " << pathz.size() << endl;
93   pathz_i = pathz.begin();
94   int path_count = 0;
95
96   // only try to extend when pathz isn't empty.
97   if (pathz_i != pathz.end())
98   {
99     ext_path = *pathz_i;
100     ++path_count;
101
102     while(pathz_i != pathz.end())
103     {
104       // keep track of current path and advance to next path
105       cur_path = pathz_i;
106       ++pathz_i;
107       ++path_count;
108
109       if (pathz_i == pathz.end()) {
110         end = true;
111         extending = false;
112       } else {
113         next_path = pathz_i;
114         // if node for each seq is equal to the next node+1 then for all
115         // sequences then we are extending
116         extending = cur_path->nextTo(*next_path);
117       }
118
119       if (extending)
120       {
121         win_ext_len++;
122       }
123       else
124       {
125         // add the extend window length as first element and add as refined
126         // now that we have the path to extend save it
127         new_path = ext_path;
128         new_path.extend(win_ext_len);
129         refined_pathz.push_back(new_path);
130         if (not end) {
131           // reset stuff
132           win_ext_len = 0;
133           ext_path = *next_path;
134         }
135       }
136       if ((path_count % 100) == 0)
137         emit progress("refine", path_count-1, pathz.size());
138     }
139   }
140   // this mysterious call tells the dialog box that we're actually done
141   emit progress("refine", pathz.size(), pathz.size());
142   //cout << "r_path number is: " << refined_pathz.size() << endl;
143 }
144
145 void
146 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
147 {
148   pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
149 }
150
151 void
152 NwayPaths::add_path(ConservedPath loaded_path)
153 {
154   pathz.push_back(loaded_path);
155 }
156
157
158 void
159 NwayPaths::save(fs::path save_file_path)
160 {
161   fs::fstream save_file;
162   list<ConservedPath >::iterator path_i, paths_end;
163
164   save_file.open(save_file_path, ios::out);
165
166   save_file << "<Mussa type=flp seq_count=" << sequence_count();
167   save_file << " win=" << win_size;
168   // add a function para new_thres defaults to -1 to later deal with
169   // reanalysis with higher thres - if statement whether to record base thres
170   // or new thres (ie if -1, then base)
171   save_file << " thres=" << threshold << " >\n";
172
173   path_i = refined_pathz.begin();
174   paths_end = refined_pathz.end();
175   //path_i = pathz.begin();
176   //paths_end = pathz.end();
177   while (path_i != paths_end)
178   {
179     ConservedPath& a_path = *path_i;
180     //cout << a_path.size() << endl;
181     //first entry is the window length of the windows in the path
182     save_file << a_path.window_size << ":";
183     for(size_t i = 0; i != sequence_count(); ++i)
184     {
185       save_file << a_path[i];
186       if (i != sequence_count())
187         save_file << ",";
188     }
189     save_file << endl;
190     ++path_i;
191   }
192
193   save_file << "</Mussa>\n";
194   save_file.close();
195 }
196
197
198 size_t
199 NwayPaths::sequence_count()
200 {
201   if (refined_pathz.begin() == refined_pathz.end() )
202     return 0;
203   else
204     return refined_pathz.begin()->size();
205 }
206
207
208 void
209 NwayPaths::load(fs::path load_file_path)
210 {
211   fs::fstream load_file;
212   string file_data_line, header_data, data, path_node, path_width;
213   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
214   vector<int> loaded_path;
215
216   load_file.open(load_file_path, ios::in);
217
218   if (!load_file)
219   {
220     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
221   }
222   else
223   {
224     // get header data
225     // grab mussa tag - discard for now...maybe check in future...
226     getline(load_file,file_data_line);
227     space_split_i = file_data_line.find(" ");
228     file_data_line = file_data_line.substr(space_split_i+1);
229     // grab type tag - need in future to distinguish between flp and vlp paths
230     space_split_i = file_data_line.find(" ");
231     file_data_line = file_data_line.substr(space_split_i+1);
232     // get species/seq number
233     space_split_i = file_data_line.find(" ");
234     header_data = file_data_line.substr(0,space_split_i); 
235     equal_split_i = header_data.find("=");
236     data = file_data_line.substr(equal_split_i+1); 
237     unsigned int species_num = atoi (data.c_str());
238     file_data_line = file_data_line.substr(space_split_i+1);
239     // get window size
240     space_split_i = file_data_line.find(" ");
241     header_data = file_data_line.substr(0,space_split_i); 
242     equal_split_i = header_data.find("=");
243     data = file_data_line.substr(equal_split_i+1); 
244     win_size = atoi (data.c_str());
245     file_data_line = file_data_line.substr(space_split_i+1);
246     // get threshold size
247     space_split_i = file_data_line.find(" ");
248     header_data = file_data_line.substr(0,space_split_i); 
249     equal_split_i = header_data.find("=");
250     data = file_data_line.substr(equal_split_i+1); 
251     threshold = atoi (data.c_str());
252     file_data_line = file_data_line.substr(space_split_i+1);
253     
254     //cout << "seq_num=" << species_num << " win=" << win_size;
255     //cout << " thres=" << threshold << endl;
256     
257     // clear out the current data
258     refined_pathz.clear();
259     
260     int temp;
261     
262     getline(load_file,file_data_line);
263     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
264     {
265       if (file_data_line != "")
266       {
267         loaded_path.clear();
268         colon_split_i = file_data_line.find(":");
269         // whats our window size?
270         path_width = file_data_line.substr(0,colon_split_i); 
271         file_data_line = file_data_line.substr(colon_split_i+1);
272         for(size_t i = 0; i < species_num; i++)
273         {
274           comma_split_i = file_data_line.find(",");
275           path_node = file_data_line.substr(0, comma_split_i); 
276           temp = atoi (path_node.c_str());
277           loaded_path.push_back(temp);
278           file_data_line = file_data_line.substr(comma_split_i+1);
279         }
280         assert (loaded_path.size() == species_num );
281         refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()), 
282                                                    threshold, 
283                                                    loaded_path));
284       }
285       getline(load_file,file_data_line);
286     }
287     load_file.close();
288   }
289 }
290
291
292 void
293 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_t depth)
294 {
295   list<int> new_nodes, trans_check_nodes;
296   list<int>::iterator new_nodes_i, new_nodes_end;
297   bool trans_check_good;
298
299   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
300   new_nodes_i = new_nodes.begin();
301   new_nodes_end = new_nodes.end();
302   while(new_nodes_i != new_nodes_end)
303   {
304     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
305     // check transitivity with previous nodes in path
306     trans_check_good = true;
307     for(size_t i = 0; i < depth - 1; i++)
308     {
309       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
310       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
311                                             trans_check_nodes.end(),
312                                             *new_nodes_i) ) &&
313            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
314                                            trans_check_nodes.end(),
315                                            *new_nodes_i * -1) ) )
316         trans_check_good = false;
317     }
318
319     if (trans_check_good)
320     {
321       // this makes sure path nodes are recorded with RC status relative to
322       // the base species
323       if ( path[depth-1] >= 0)
324         path.push_back(*new_nodes_i);
325       else
326         path.push_back(*new_nodes_i * -1);
327
328       if (depth < all_comparisons.size() - 1)
329         path_search(all_comparisons, path, depth + 1);
330       else
331         pathz.push_back(path);
332       path.pop_back();
333     } 
334     ++new_nodes_i;
335   }
336 }
337 /* use this if I ever get the friggin seqcomp match lists to sort...
338       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
339                         *new_nodes_i))
340 */
341
342 void
343 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
344 {
345   ConservedPath path;
346   int win_i, window_num;
347   list<int> new_nodes;
348   list<int>::iterator new_nodes_i, new_nodes_end;
349
350   pathz.clear();
351   window_num = all_comparisons[0][1].size();
352   // loop thru all windows in first species
353   for (win_i = 0; win_i < window_num; win_i++)
354   {
355     path.clear();
356     path.push_back(win_i);
357     new_nodes = all_comparisons[0][1].match_locations(path[0]);
358     new_nodes_i = new_nodes.begin();
359     new_nodes_end = new_nodes.end();
360     //if (new_nodes_i != new_nodes_end)
361     //cout << "* species 0 node: " << win_i << endl;
362     path.push_back(0);
363     while(new_nodes_i != new_nodes_end)
364     {
365       //cout << "  * species 1 node: " << *new_nodes_i << endl;
366       path[1] = *new_nodes_i;
367       path_search(all_comparisons, path, 2);
368       ++new_nodes_i;
369     }
370   }
371 }
372
373
374 void
375 NwayPaths::save_old(fs::path save_file_path)
376 {
377   fs::fstream save_file;
378   list<ConservedPath >::iterator path_i, paths_end;
379   size_t i;
380
381   save_file.open(save_file_path, ios::app);
382
383   path_i = pathz.begin();
384   paths_end = pathz.end();
385   while(path_i != paths_end)
386   {
387     ConservedPath& a_path = *path_i;
388     //cout << a_path.size() << endl;
389     for(i = 0; i < path_i->size(); ++i)
390       save_file << i << "," << a_path[i] << " ";
391     save_file << endl;
392     ++path_i;
393   }
394   save_file.close();
395 }
396
397
398 /*
399 void
400 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
401 {
402   int win_i, sp_i;
403   vector<list<list<int> > > path_src_tree;
404   list<int> new_nodes;
405   list<int>::iterator node_i, node_end;
406   <list<list<int> > >::iterator branch_i, branch_end;  
407
408   pathz.clear();
409   path_src_tree.reserve(all_comparisons.size()- 1);
410
411
412   // loop thru all windows in first species
413   for (win_i = 0; win_i < window_num; win_i++)
414   {
415     // clear the path search tree
416     for(i = 0; i < all_comparisons.size(); i++)
417       path_src_tree[i].clear();
418
419     // top level kept empty even tho implicity has one entry of the first
420     // species at this window - why bother, adds a little speed
421
422     // get connection list for first species, creating a list of nodes
423     // of second species connected to the first species at this window
424     new_nodes = all_comparisons[0][1];
425     path_src_tree[1].push_back(new_nodes);
426
427     // loop thru rest of species for this window to see if any paths of matches
428     // go across all species
429     // if path search tree becomes empty, break out of loop, no reason to search further
430     sp_i = 1;
431     while ((sp_i < all_comparisons.size()) && (path tree not empty))
432     {
433       branch_i = path_src_tree[1].begin();
434       branch_end = path_src_tree[1].end();
435       while (branch_i != branch_end)
436       {
437         node_i = branch_i->begin();
438         node_end = branch_i->end();
439       }
440
441
442       // loop over all current nodes
443          // get connection list for each node
444          // loop over each previous node in list
445             // get those nodes connection list
446             // intersect previous node connections with current
447
448       ++sp_i;
449     }
450
451     // insert any of the paths found into the master list of paths
452
453     // add no paths if tmp_pathz is empty...
454   }
455 }
456
457 void NwayPaths::refine()
458 {
459 }
460 */
461
462
463 void NwayPaths::print(list<vector<int> >& dump_path)
464 {
465   list<vector<int> >::iterator pathz_i;
466   vector<int>::iterator path_i;
467
468   cout << "printing list of lists\n";
469   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
470   {
471     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
472       cout << *path_i << " ";
473     cout << endl;
474   }
475 }