provide python interpreter for mussa qui via a seperate thread
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
22
23 #include <iostream>
24 #include <stdexcept>
25
26 using namespace std;
27
28 NwayPaths::NwayPaths()
29 {
30 }
31
32 NwayPaths::NwayPaths(const NwayPaths &o)
33   : pathz(o.pathz),
34     refined_pathz(o.refined_pathz),
35     threshold(o.threshold),
36     win_size(o.win_size),
37     soft_thres(o.soft_thres),
38     ent_thres(o.ent_thres),
39     c_sequences(o.c_sequences)
40 {
41 }
42
43 void NwayPaths::clear()
44 {
45   c_sequences.clear();
46   pathz.clear();
47   refined_pathz.clear();
48 }
49
50 void
51 NwayPaths::setup(int w, int t)
52 {
53   threshold = t;
54   soft_thres = threshold;
55   win_size = w;
56   clear();
57
58   //cout << "nway: thres = " << threshold
59   //     << ", soft threo = " << soft_thres << endl;
60 }
61
62 void
63 NwayPaths::set_soft_threshold(int sft_thres)
64 {
65   soft_thres = sft_thres;
66 }
67
68 int NwayPaths::get_soft_threshold() const
69 {
70   return soft_thres;
71 }
72
73 int NwayPaths::get_threshold() const
74 {
75   return threshold;
76 }
77
78 int NwayPaths::get_window() const
79 {
80   return win_size;
81 }
82
83 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
84 // doesn't deal with interleaved adjacency
85 void
86 NwayPaths::simple_refine()
87 {
88   // ext_path remembers the first window set in an extending path
89   ConservedPath ext_path, new_path;
90   list<ConservedPath>::iterator cur_path, next_path;
91   list<ConservedPath>::iterator pathz_i;
92   int win_ext_len = 0;
93   bool extending, end = false;
94
95   refined_pathz.clear();
96
97   //cout << "path number is: " << pathz.size() << endl;
98   pathz_i = pathz.begin();
99   int path_count = 0;
100
101   // only try to extend when pathz isn't empty.
102   if (pathz_i != pathz.end())
103   {
104     ext_path = *pathz_i;
105     ++path_count;
106
107     while(pathz_i != pathz.end())
108     {
109       // keep track of current path and advance to next path
110       cur_path = pathz_i;
111       ++pathz_i;
112       ++path_count;
113
114       if (pathz_i == pathz.end()) {
115         end = true;
116         extending = false;
117       } else {
118         next_path = pathz_i;
119         // if node for each seq is equal to the next node+1 then for all
120         // sequences then we are extending
121         extending = cur_path->nextTo(*next_path);
122       }
123
124       if (extending)
125       {
126         win_ext_len++;
127       }
128       else
129       {
130         // add the extend window length as first element and add as refined
131         // now that we have the path to extend save it
132         new_path = ext_path;
133         new_path.extend(win_ext_len);
134         refined_pathz.push_back(new_path);
135         if (not end) {
136           // reset stuff
137           win_ext_len = 0;
138           ext_path = *next_path;
139         }
140       }
141       if ((path_count % 100) == 0)
142         emit progress("refine", path_count-1, pathz.size());
143     }
144   }
145   // this mysterious call tells the dialog box that we're actually done
146   emit progress("refine", pathz.size(), pathz.size());
147   //cout << "r_path number is: " << refined_pathz.size() << endl;
148 }
149
150 void
151 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
152 {
153   pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
154 }
155
156 void
157 NwayPaths::add_path(ConservedPath loaded_path)
158 {
159   pathz.push_back(loaded_path);
160 }
161
162
163 void
164 NwayPaths::save(fs::path save_file_path)
165 {
166   fs::fstream save_file;
167   list<ConservedPath >::iterator path_i, paths_end;
168
169   save_file.open(save_file_path, ios::out);
170
171   save_file << "<Mussa type=flp seq_count=" << sequence_count();
172   save_file << " win=" << win_size;
173   // add a function para new_thres defaults to -1 to later deal with
174   // reanalysis with higher thres - if statement whether to record base thres
175   // or new thres (ie if -1, then base)
176   save_file << " thres=" << threshold << " >\n";
177
178   path_i = refined_pathz.begin();
179   paths_end = refined_pathz.end();
180   //path_i = pathz.begin();
181   //paths_end = pathz.end();
182   while (path_i != paths_end)
183   {
184     ConservedPath& a_path = *path_i;
185     //cout << a_path.size() << endl;
186     //first entry is the window length of the windows in the path
187     save_file << a_path.window_size << ":";
188     for(size_t i = 0; i != sequence_count(); ++i)
189     {
190       save_file << a_path[i];
191       if (i != sequence_count())
192         save_file << ",";
193     }
194     save_file << endl;
195     ++path_i;
196   }
197
198   save_file << "</Mussa>\n";
199   save_file.close();
200 }
201
202
203 size_t
204 NwayPaths::sequence_count()
205 {
206   if (refined_pathz.begin() == refined_pathz.end() )
207     return 0;
208   else
209     return refined_pathz.begin()->size();
210 }
211
212
213 void
214 NwayPaths::load(fs::path load_file_path)
215 {
216   fs::fstream load_file;
217   string file_data_line, header_data, data, path_node, path_width;
218   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
219   vector<int> loaded_path;
220
221   load_file.open(load_file_path, ios::in);
222
223   if (!load_file)
224   {
225     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
226   }
227   else
228   {
229     // get header data
230     // grab mussa tag - discard for now...maybe check in future...
231     getline(load_file,file_data_line);
232     space_split_i = file_data_line.find(" ");
233     file_data_line = file_data_line.substr(space_split_i+1);
234     // grab type tag - need in future to distinguish between flp and vlp paths
235     space_split_i = file_data_line.find(" ");
236     file_data_line = file_data_line.substr(space_split_i+1);
237     // get species/seq number
238     space_split_i = file_data_line.find(" ");
239     header_data = file_data_line.substr(0,space_split_i); 
240     equal_split_i = header_data.find("=");
241     data = file_data_line.substr(equal_split_i+1); 
242     unsigned int species_num = atoi (data.c_str());
243     file_data_line = file_data_line.substr(space_split_i+1);
244     // get window size
245     space_split_i = file_data_line.find(" ");
246     header_data = file_data_line.substr(0,space_split_i); 
247     equal_split_i = header_data.find("=");
248     data = file_data_line.substr(equal_split_i+1); 
249     win_size = atoi (data.c_str());
250     file_data_line = file_data_line.substr(space_split_i+1);
251     // get threshold size
252     space_split_i = file_data_line.find(" ");
253     header_data = file_data_line.substr(0,space_split_i); 
254     equal_split_i = header_data.find("=");
255     data = file_data_line.substr(equal_split_i+1); 
256     threshold = atoi (data.c_str());
257     file_data_line = file_data_line.substr(space_split_i+1);
258     
259     //cout << "seq_num=" << species_num << " win=" << win_size;
260     //cout << " thres=" << threshold << endl;
261     
262     // clear out the current data
263     refined_pathz.clear();
264     
265     int temp;
266     
267     getline(load_file,file_data_line);
268     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
269     {
270       if (file_data_line != "")
271       {
272         loaded_path.clear();
273         colon_split_i = file_data_line.find(":");
274         // whats our window size?
275         path_width = file_data_line.substr(0,colon_split_i); 
276         file_data_line = file_data_line.substr(colon_split_i+1);
277         for(size_t i = 0; i < species_num; i++)
278         {
279           comma_split_i = file_data_line.find(",");
280           path_node = file_data_line.substr(0, comma_split_i); 
281           temp = atoi (path_node.c_str());
282           loaded_path.push_back(temp);
283           file_data_line = file_data_line.substr(comma_split_i+1);
284         }
285         assert (loaded_path.size() == species_num );
286         refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()), 
287                                                    threshold, 
288                                                    loaded_path));
289       }
290       getline(load_file,file_data_line);
291     }
292     load_file.close();
293   }
294 }
295
296
297 void
298 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_t depth)
299 {
300   list<int> new_nodes, trans_check_nodes;
301   list<int>::iterator new_nodes_i, new_nodes_end;
302   bool trans_check_good;
303
304   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
305   new_nodes_i = new_nodes.begin();
306   new_nodes_end = new_nodes.end();
307   while(new_nodes_i != new_nodes_end)
308   {
309     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
310     // check transitivity with previous nodes in path
311     trans_check_good = true;
312     for(size_t i = 0; i < depth - 1; i++)
313     {
314       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
315       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
316                                             trans_check_nodes.end(),
317                                             *new_nodes_i) ) &&
318            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
319                                            trans_check_nodes.end(),
320                                            *new_nodes_i * -1) ) )
321         trans_check_good = false;
322     }
323
324     if (trans_check_good)
325     {
326       // this makes sure path nodes are recorded with RC status relative to
327       // the base species
328       if ( path[depth-1] >= 0)
329         path.push_back(*new_nodes_i);
330       else
331         path.push_back(*new_nodes_i * -1);
332
333       if (depth < all_comparisons.size() - 1)
334         path_search(all_comparisons, path, depth + 1);
335       else
336         pathz.push_back(path);
337       path.pop_back();
338     } 
339     ++new_nodes_i;
340   }
341 }
342 /* use this if I ever get the friggin seqcomp match lists to sort...
343       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
344                         *new_nodes_i))
345 */
346
347 void
348 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
349 {
350   ConservedPath path;
351   int win_i, window_num;
352   list<int> new_nodes;
353   list<int>::iterator new_nodes_i, new_nodes_end;
354
355   pathz.clear();
356   window_num = all_comparisons[0][1].size();
357   // loop thru all windows in first species
358   for (win_i = 0; win_i < window_num; win_i++)
359   {
360     path.clear();
361     path.push_back(win_i);
362     new_nodes = all_comparisons[0][1].match_locations(path[0]);
363     new_nodes_i = new_nodes.begin();
364     new_nodes_end = new_nodes.end();
365     //if (new_nodes_i != new_nodes_end)
366     //cout << "* species 0 node: " << win_i << endl;
367     path.push_back(0);
368     while(new_nodes_i != new_nodes_end)
369     {
370       //cout << "  * species 1 node: " << *new_nodes_i << endl;
371       path[1] = *new_nodes_i;
372       path_search(all_comparisons, path, 2);
373       ++new_nodes_i;
374     }
375   }
376 }
377
378
379 void
380 NwayPaths::save_old(fs::path save_file_path)
381 {
382   fs::fstream save_file;
383   list<ConservedPath >::iterator path_i, paths_end;
384   size_t i;
385
386   save_file.open(save_file_path, ios::app);
387
388   path_i = pathz.begin();
389   paths_end = pathz.end();
390   while(path_i != paths_end)
391   {
392     ConservedPath& a_path = *path_i;
393     //cout << a_path.size() << endl;
394     for(i = 0; i < path_i->size(); ++i)
395       save_file << i << "," << a_path[i] << " ";
396     save_file << endl;
397     ++path_i;
398   }
399   save_file.close();
400 }
401
402
403 /*
404 void
405 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
406 {
407   int win_i, sp_i;
408   vector<list<list<int> > > path_src_tree;
409   list<int> new_nodes;
410   list<int>::iterator node_i, node_end;
411   <list<list<int> > >::iterator branch_i, branch_end;  
412
413   pathz.clear();
414   path_src_tree.reserve(all_comparisons.size()- 1);
415
416
417   // loop thru all windows in first species
418   for (win_i = 0; win_i < window_num; win_i++)
419   {
420     // clear the path search tree
421     for(i = 0; i < all_comparisons.size(); i++)
422       path_src_tree[i].clear();
423
424     // top level kept empty even tho implicity has one entry of the first
425     // species at this window - why bother, adds a little speed
426
427     // get connection list for first species, creating a list of nodes
428     // of second species connected to the first species at this window
429     new_nodes = all_comparisons[0][1];
430     path_src_tree[1].push_back(new_nodes);
431
432     // loop thru rest of species for this window to see if any paths of matches
433     // go across all species
434     // if path search tree becomes empty, break out of loop, no reason to search further
435     sp_i = 1;
436     while ((sp_i < all_comparisons.size()) && (path tree not empty))
437     {
438       branch_i = path_src_tree[1].begin();
439       branch_end = path_src_tree[1].end();
440       while (branch_i != branch_end)
441       {
442         node_i = branch_i->begin();
443         node_end = branch_i->end();
444       }
445
446
447       // loop over all current nodes
448          // get connection list for each node
449          // loop over each previous node in list
450             // get those nodes connection list
451             // intersect previous node connections with current
452
453       ++sp_i;
454     }
455
456     // insert any of the paths found into the master list of paths
457
458     // add no paths if tmp_pathz is empty...
459   }
460 }
461
462 void NwayPaths::refine()
463 {
464 }
465 */
466
467
468 void NwayPaths::print(list<vector<int> >& dump_path)
469 {
470   list<vector<int> >::iterator pathz_i;
471   vector<int>::iterator path_i;
472
473   cout << "printing list of lists\n";
474   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
475   {
476     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
477       cout << *path_i << " ";
478     cout << endl;
479   }
480 }