switch to using boost::filesystem
[mussa.git] / alg / nway_paths.cpp
1 //  This file is part of the Mussa source distribution.
2 //  http://mussa.caltech.edu/
3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
4
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
9
10
11 //                        ----------------------------------------
12 //                         ---------- mussa_nway.cc  -----------
13 //                        ----------------------------------------
14
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
17
18 #include "alg/nway_paths.hpp"
19 #include "alg/conserved_path.hpp"
20 #include "mussa_exceptions.hpp"
21
22 #include <iostream>
23 #include <stdexcept>
24
25 using namespace std;
26
27 NwayPaths::NwayPaths()
28 {
29 }
30
31 void
32 NwayPaths::setup(int w, int t)
33 {
34   threshold = t;
35   soft_thres = threshold;
36   win_size = w;
37   pathz.clear();
38
39   //cout << "nway: thres = " << threshold
40   //     << ", soft threo = " << soft_thres << endl;
41 }
42
43 void
44 NwayPaths::set_soft_thres(int sft_thres)
45 {
46   soft_thres = sft_thres;
47 }
48
49 int NwayPaths::get_threshold() const
50 {
51   return threshold;
52 }
53
54 int NwayPaths::get_window() const
55 {
56   return win_size;
57 }
58
59 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
60 // doesn't deal with interleaved adjacency
61 void
62 NwayPaths::simple_refine()
63 {
64   // ext_path remembers the first window set in an extending path
65   ExtendedConservedPath ext_path, new_path;
66   list<ConservedPath>::iterator cur_path, next_path;
67   list<ConservedPath>::iterator pathz_i;
68   int win_ext_len = 0;
69   bool extending, end = false;
70
71   refined_pathz.clear();
72
73   //cout << "path number is: " << pathz.size() << endl;
74   pathz_i = pathz.begin();
75
76   // only try to extend when pathz isn't empty.
77   if (pathz_i != pathz.end())
78   {
79     ext_path = ExtendedConservedPath( win_size, *pathz_i);
80
81     while(pathz_i != pathz.end())
82     {
83       // keep track of current path and advance to next path
84       cur_path = pathz_i;
85       ++pathz_i;
86       if (pathz_i == pathz.end())
87         end = true;
88       else
89         next_path = pathz_i;
90
91       if (not end)
92       {
93         // if node for each seq is equal to the next node+1 then for all
94         // sequences then we are extending
95         extending = cur_path->nextTo(*next_path);
96       }
97       else
98         extending = false;
99   
100       if (extending)
101       {
102         win_ext_len++;
103       }
104       else
105       {
106         // add the extend window length as first element and add as refined
107         // now that we have the path to extend save it
108         new_path = ext_path;
109         new_path.extend(win_ext_len);
110         refined_pathz.push_back(new_path);
111         // reset stuff
112         win_ext_len = 0;
113         ext_path = ExtendedConservedPath( win_size, *next_path);
114       }
115     }
116   }
117   //cout << "r_path number is: " << refined_pathz.size() << endl;
118 }
119
120
121 void
122 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
123 {
124   pathz.push_back(ConservedPath(threshold, loaded_path));
125 }
126
127 void
128 NwayPaths::add_path(ConservedPath loaded_path)
129 {
130   pathz.push_back(loaded_path);
131 }
132
133
134 void
135 NwayPaths::save(fs::path save_file_path)
136 {
137   fs::fstream save_file;
138   list<ExtendedConservedPath >::iterator path_i, paths_end;
139
140   save_file.open(save_file_path, ios::out);
141
142   save_file << "<Mussa type=flp seq_count=" << sequence_count();
143   save_file << " win=" << win_size;
144   // add a function para new_thres defaults to -1 to later deal with
145   // reanalysis with higher thres - if statement whether to record base thres
146   // or new thres (ie if -1, then base)
147   save_file << " thres=" << threshold << " >\n";
148
149   path_i = refined_pathz.begin();
150   paths_end = refined_pathz.end();
151   //path_i = pathz.begin();
152   //paths_end = pathz.end();
153   while (path_i != paths_end)
154   {
155     ExtendedConservedPath& a_path = *path_i;
156     //cout << a_path.size() << endl;
157     //first entry is the window length of the windows in the path
158     save_file << a_path.window_size << ":";
159     for(size_t i = 0; i != sequence_count(); ++i)
160     {
161       save_file << a_path[i];
162       if (i != sequence_count())
163         save_file << ",";
164     }
165     save_file << endl;
166     ++path_i;
167   }
168
169   save_file << "</Mussa>\n";
170   save_file.close();
171 }
172
173
174 size_t
175 NwayPaths::sequence_count()
176 {
177   if (refined_pathz.begin() == refined_pathz.end() )
178     return 0;
179   else
180     return refined_pathz.begin()->size();
181 }
182
183
184 void
185 NwayPaths::load(fs::path load_file_path)
186 {
187   fs::fstream load_file;
188   string file_data_line, header_data, data, path_node, path_width;
189   int space_split_i, equal_split_i, comma_split_i, colon_split_i;
190   vector<int> loaded_path;
191
192   load_file.open(load_file_path, ios::in);
193
194   if (!load_file)
195   {
196     throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
197   }
198   else
199   {
200     // get header data
201     // grab mussa tag - discard for now...maybe check in future...
202     getline(load_file,file_data_line);
203     space_split_i = file_data_line.find(" ");
204     file_data_line = file_data_line.substr(space_split_i+1);
205     // grab type tag - need in future to distinguish between flp and vlp paths
206     space_split_i = file_data_line.find(" ");
207     file_data_line = file_data_line.substr(space_split_i+1);
208     // get species/seq number
209     space_split_i = file_data_line.find(" ");
210     header_data = file_data_line.substr(0,space_split_i); 
211     equal_split_i = header_data.find("=");
212     data = file_data_line.substr(equal_split_i+1); 
213     unsigned int species_num = atoi (data.c_str());
214     file_data_line = file_data_line.substr(space_split_i+1);
215     // get window size
216     space_split_i = file_data_line.find(" ");
217     header_data = file_data_line.substr(0,space_split_i); 
218     equal_split_i = header_data.find("=");
219     data = file_data_line.substr(equal_split_i+1); 
220     win_size = atoi (data.c_str());
221     file_data_line = file_data_line.substr(space_split_i+1);
222     // get threshold size
223     space_split_i = file_data_line.find(" ");
224     header_data = file_data_line.substr(0,space_split_i); 
225     equal_split_i = header_data.find("=");
226     data = file_data_line.substr(equal_split_i+1); 
227     threshold = atoi (data.c_str());
228     file_data_line = file_data_line.substr(space_split_i+1);
229     
230     //cout << "seq_num=" << species_num << " win=" << win_size;
231     //cout << " thres=" << threshold << endl;
232     
233     // clear out the current data
234     refined_pathz.clear();
235     
236     int temp;
237     
238     getline(load_file,file_data_line);
239     while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
240     {
241       if (file_data_line != "")
242       {
243         loaded_path.clear();
244         colon_split_i = file_data_line.find(":");
245         // whats our window size?
246         path_width = file_data_line.substr(0,colon_split_i); 
247         file_data_line = file_data_line.substr(colon_split_i+1);
248         for(size_t i = 0; i < species_num; i++)
249         {
250           comma_split_i = file_data_line.find(",");
251           path_node = file_data_line.substr(0, comma_split_i); 
252           temp = atoi (path_node.c_str());
253           loaded_path.push_back(temp);
254           file_data_line = file_data_line.substr(comma_split_i+1);
255         }
256         assert (loaded_path.size() == species_num );
257         refined_pathz.push_back(ExtendedConservedPath(atoi(path_width.c_str()), 
258                                                       threshold, 
259                                                       loaded_path));
260       }
261       getline(load_file,file_data_line);
262     }
263     load_file.close();
264   }
265 }
266
267
268 void
269 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_t depth)
270 {
271   list<int> new_nodes, trans_check_nodes;
272   list<int>::iterator new_nodes_i, new_nodes_end;
273   bool trans_check_good;
274
275   new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
276   new_nodes_i = new_nodes.begin();
277   new_nodes_end = new_nodes.end();
278   while(new_nodes_i != new_nodes_end)
279   {
280     //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
281     // check transitivity with previous nodes in path
282     trans_check_good = true;
283     for(size_t i = 0; i < depth - 1; i++)
284     {
285       trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
286       if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
287                                             trans_check_nodes.end(),
288                                             *new_nodes_i) ) &&
289            (trans_check_nodes.end() == find(trans_check_nodes.begin(),
290                                            trans_check_nodes.end(),
291                                            *new_nodes_i * -1) ) )
292         trans_check_good = false;
293     }
294
295     if (trans_check_good)
296     {
297       // this makes sure path nodes are recorded with RC status relative to
298       // the base species
299       if ( path[depth-1] >= 0)
300         path.push_back(*new_nodes_i);
301       else
302         path.push_back(*new_nodes_i * -1);
303
304       if (depth < all_comparisons.size() - 1)
305         path_search(all_comparisons, path, depth + 1);
306       else
307         pathz.push_back(path);
308       path.pop_back();
309     } 
310     ++new_nodes_i;
311   }
312 }
313 /* use this if I ever get the friggin seqcomp match lists to sort...
314       if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
315                         *new_nodes_i))
316 */
317
318 void
319 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
320 {
321   ConservedPath path;
322   int win_i, window_num;
323   list<int> new_nodes;
324   list<int>::iterator new_nodes_i, new_nodes_end;
325
326   pathz.clear();
327   window_num = all_comparisons[0][1].size();
328   // loop thru all windows in first species
329   for (win_i = 0; win_i < window_num; win_i++)
330   {
331     path.clear();
332     path.push_back(win_i);
333     new_nodes = all_comparisons[0][1].match_locations(path[0]);
334     new_nodes_i = new_nodes.begin();
335     new_nodes_end = new_nodes.end();
336     //if (new_nodes_i != new_nodes_end)
337     //cout << "* species 0 node: " << win_i << endl;
338     path.push_back(0);
339     while(new_nodes_i != new_nodes_end)
340     {
341       //cout << "  * species 1 node: " << *new_nodes_i << endl;
342       path[1] = *new_nodes_i;
343       path_search(all_comparisons, path, 2);
344       ++new_nodes_i;
345     }
346   }
347 }
348
349
350 void
351 NwayPaths::save_old(fs::path save_file_path)
352 {
353   fs::fstream save_file;
354   list<ConservedPath >::iterator path_i, paths_end;
355   size_t i;
356
357   save_file.open(save_file_path, ios::app);
358
359   path_i = pathz.begin();
360   paths_end = pathz.end();
361   while(path_i != paths_end)
362   {
363     ConservedPath& a_path = *path_i;
364     //cout << a_path.size() << endl;
365     for(i = 0; i < path_i->size(); ++i)
366       save_file << i << "," << a_path[i] << " ";
367     save_file << endl;
368     ++path_i;
369   }
370   save_file.close();
371 }
372
373
374 /*
375 void
376 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
377 {
378   int win_i, sp_i;
379   vector<list<list<int> > > path_src_tree;
380   list<int> new_nodes;
381   list<int>::iterator node_i, node_end;
382   <list<list<int> > >::iterator branch_i, branch_end;  
383
384   pathz.clear();
385   path_src_tree.reserve(all_comparisons.size()- 1);
386
387
388   // loop thru all windows in first species
389   for (win_i = 0; win_i < window_num; win_i++)
390   {
391     // clear the path search tree
392     for(i = 0; i < all_comparisons.size(); i++)
393       path_src_tree[i].clear();
394
395     // top level kept empty even tho implicity has one entry of the first
396     // species at this window - why bother, adds a little speed
397
398     // get connection list for first species, creating a list of nodes
399     // of second species connected to the first species at this window
400     new_nodes = all_comparisons[0][1];
401     path_src_tree[1].push_back(new_nodes);
402
403     // loop thru rest of species for this window to see if any paths of matches
404     // go across all species
405     // if path search tree becomes empty, break out of loop, no reason to search further
406     sp_i = 1;
407     while ((sp_i < all_comparisons.size()) && (path tree not empty))
408     {
409       branch_i = path_src_tree[1].begin();
410       branch_end = path_src_tree[1].end();
411       while (branch_i != branch_end)
412       {
413         node_i = branch_i->begin();
414         node_end = branch_i->end();
415       }
416
417
418       // loop over all current nodes
419          // get connection list for each node
420          // loop over each previous node in list
421             // get those nodes connection list
422             // intersect previous node connections with current
423
424       ++sp_i;
425     }
426
427     // insert any of the paths found into the master list of paths
428
429     // add no paths if tmp_pathz is empty...
430   }
431 }
432
433 void NwayPaths::refine()
434 {
435 }
436 */
437
438
439 void NwayPaths::print(list<vector<int> >& dump_path)
440 {
441   list<vector<int> >::iterator pathz_i;
442   vector<int>::iterator path_i;
443
444   cout << "printing list of lists\n";
445   for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
446   {
447     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
448       cout << *path_i << " ";
449     cout << endl;
450   }
451 }