1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_nway.cc -----------
13 // ----------------------------------------
15 #include "alg/nway_paths.hh"
16 #include "alg/conserved_path.h"
17 #include "mussa_exceptions.hh"
25 NwayPaths::NwayPaths()
30 NwayPaths::setup(int w, int t)
33 soft_thres = threshold;
37 cout << "nway: thres = " << threshold
38 << ", soft threo = " << soft_thres << endl;
42 NwayPaths::set_soft_thres(int sft_thres)
44 soft_thres = sft_thres;
48 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
49 // doesn't deal with interleaved adjacency
51 NwayPaths::simple_refine()
53 // ext_path remembers the first window set in an extending path
54 ExtendedConservedPath ext_path, new_path;
55 list<ConservedPath>::iterator cur_path, next_path;
56 list<ConservedPath>::iterator pathz_i;
58 bool extending, end = false;
60 refined_pathz.clear();
62 cout << "path number is: " << pathz.size() << endl;
63 pathz_i = pathz.begin();
65 // only try to extend when pathz isn't empty.
66 if (pathz_i != pathz.end())
68 ext_path = ExtendedConservedPath( win_size, *pathz_i);
70 while(pathz_i != pathz.end())
72 // keep track of current path and advance to next path
75 if (pathz_i == pathz.end())
82 // if node for each seq is equal to the next node+1 then for all
83 // sequences then we are extending
84 extending = cur_path->nextTo(*next_path);
91 //cout << "Raaaawwwrrr! I am extending\n";
96 // add the extend window length as first element and add as refined
97 // now that we have the path to extend save it
99 new_path.extend(win_ext_len);
100 refined_pathz.push_back(new_path);
103 ext_path = ExtendedConservedPath( win_size, *next_path);
107 cout << "r_path number is: " << refined_pathz.size() << endl;
112 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
114 pathz.push_back(ConservedPath(threshold, loaded_path));
118 NwayPaths::add_path(ConservedPath loaded_path)
120 pathz.push_back(loaded_path);
125 NwayPaths::save(string save_file_path)
128 list<ExtendedConservedPath >::iterator path_i, paths_end;
130 save_file.open(save_file_path.c_str(), ios::out);
132 save_file << "<Mussa type=flp seq_count=" << sequence_count();
133 save_file << " win=" << win_size;
134 // add a function para new_thres defaults to -1 to later deal with
135 // reanalysis with higher thres - if statement whether to record base thres
136 // or new thres (ie if -1, then base)
137 save_file << " thres=" << threshold << " >\n";
139 path_i = refined_pathz.begin();
140 paths_end = refined_pathz.end();
141 //path_i = pathz.begin();
142 //paths_end = pathz.end();
143 while (path_i != paths_end)
145 ExtendedConservedPath& a_path = *path_i;
146 //cout << a_path.size() << endl;
147 //first entry is the window length of the windows in the path
148 save_file << a_path.window_size << ":";
149 for(size_t i = 0; i != sequence_count(); ++i)
151 save_file << a_path[i];
152 if (i != sequence_count())
159 save_file << "</Mussa>\n";
165 NwayPaths::sequence_count()
167 if (refined_pathz.begin() == refined_pathz.end() )
170 return refined_pathz.begin()->size();
175 NwayPaths::load(string load_file_path)
178 string file_data_line, header_data, data, path_node, path_width;
179 int space_split_i, equal_split_i, comma_split_i, colon_split_i;
180 vector<int> loaded_path;
182 load_file.open(load_file_path.c_str(), ios::in);
186 throw mussa_load_error("Sequence File: " + load_file_path + " not found");
191 // grab mussa tag - discard for now...maybe check in future...
192 getline(load_file,file_data_line);
193 space_split_i = file_data_line.find(" ");
194 file_data_line = file_data_line.substr(space_split_i+1);
195 // grab type tag - need in future to distinguish between flp and vlp paths
196 space_split_i = file_data_line.find(" ");
197 file_data_line = file_data_line.substr(space_split_i+1);
198 // get species/seq number
199 space_split_i = file_data_line.find(" ");
200 header_data = file_data_line.substr(0,space_split_i);
201 equal_split_i = header_data.find("=");
202 data = file_data_line.substr(equal_split_i+1);
203 unsigned int species_num = atoi (data.c_str());
204 file_data_line = file_data_line.substr(space_split_i+1);
206 space_split_i = file_data_line.find(" ");
207 header_data = file_data_line.substr(0,space_split_i);
208 equal_split_i = header_data.find("=");
209 data = file_data_line.substr(equal_split_i+1);
210 win_size = atoi (data.c_str());
211 file_data_line = file_data_line.substr(space_split_i+1);
213 space_split_i = file_data_line.find(" ");
214 header_data = file_data_line.substr(0,space_split_i);
215 equal_split_i = header_data.find("=");
216 data = file_data_line.substr(equal_split_i+1);
217 threshold = atoi (data.c_str());
218 file_data_line = file_data_line.substr(space_split_i+1);
220 cout << "seq_num=" << species_num << " win=" << win_size;
221 cout << " thres=" << threshold << endl;
223 // clear out the current data
224 refined_pathz.clear();
228 getline(load_file,file_data_line);
229 while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
231 if (file_data_line != "")
234 colon_split_i = file_data_line.find(":");
235 // whats our window size?
236 path_width = file_data_line.substr(0,colon_split_i);
237 file_data_line = file_data_line.substr(colon_split_i+1);
238 for(size_t i = 0; i < species_num; i++)
240 comma_split_i = file_data_line.find(",");
241 path_node = file_data_line.substr(0, comma_split_i);
242 temp = atoi (path_node.c_str());
243 loaded_path.push_back(temp);
244 file_data_line = file_data_line.substr(comma_split_i+1);
246 assert (loaded_path.size() == species_num );
247 refined_pathz.push_back(ExtendedConservedPath(atoi(path_width.c_str()),
251 getline(load_file,file_data_line);
259 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, int depth)
261 list<int> new_nodes, trans_check_nodes;
262 list<int>::iterator new_nodes_i, new_nodes_end;
264 bool trans_check_good;
266 new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
267 new_nodes_i = new_nodes.begin();
268 new_nodes_end = new_nodes.end();
269 while(new_nodes_i != new_nodes_end)
271 //cout << " * species " << depth << " node: " << *new_nodes_i << endl;
272 // check transitivity with previous nodes in path
273 trans_check_good = true;
274 for(i = 0; i < depth - 1; i++)
276 trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
277 if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
278 trans_check_nodes.end(),
280 (trans_check_nodes.end() == find(trans_check_nodes.begin(),
281 trans_check_nodes.end(),
282 *new_nodes_i * -1) ) )
283 trans_check_good = false;
286 if (trans_check_good)
288 // this makes sure path nodes are recorded with RC status relative to
290 if ( path[depth-1] >= 0)
291 path.push_back(*new_nodes_i);
293 path.push_back(*new_nodes_i * -1);
295 if (depth < all_comparisons.size() - 1)
296 path_search(all_comparisons, path, depth + 1);
298 pathz.push_back(path);
304 // cout << " ****I have the power...\n";
306 /* use this if I ever get the friggin seqcomp match lists to sort...
307 if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(),
312 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
315 int win_i, window_num;
317 list<int>::iterator new_nodes_i, new_nodes_end;
320 window_num = all_comparisons[0][1].size();
321 cout << window_num << endl;
322 // loop thru all windows in first species
323 for (win_i = 0; win_i < window_num; win_i++)
326 path.push_back(win_i);
327 new_nodes = all_comparisons[0][1].match_locations(path[0]);
328 new_nodes_i = new_nodes.begin();
329 new_nodes_end = new_nodes.end();
330 //if (new_nodes_i != new_nodes_end)
331 //cout << "* species 0 node: " << win_i << endl;
332 // cout << "foookin hell\n";
334 while(new_nodes_i != new_nodes_end)
336 //cout << " * species 1 node: " << *new_nodes_i << endl;
337 path[1] = *new_nodes_i;
338 path_search(all_comparisons, path, 2);
346 NwayPaths::save_old(string save_file_path)
349 list<ConservedPath >::iterator path_i, paths_end;
352 save_file.open(save_file_path.c_str(), ios::app);
354 path_i = pathz.begin();
355 paths_end = pathz.end();
356 while(path_i != paths_end)
358 ConservedPath& a_path = *path_i;
359 //cout << a_path.size() << endl;
360 for(i = 0; i < path_i->size(); ++i)
361 save_file << i << "," << a_path[i] << " ";
371 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
374 vector<list<list<int> > > path_src_tree;
376 list<int>::iterator node_i, node_end;
377 <list<list<int> > >::iterator branch_i, branch_end;
380 path_src_tree.reserve(all_comparisons.size()- 1);
383 // loop thru all windows in first species
384 for (win_i = 0; win_i < window_num; win_i++)
386 // clear the path search tree
387 for(i = 0; i < all_comparisons.size(); i++)
388 path_src_tree[i].clear();
390 // top level kept empty even tho implicity has one entry of the first
391 // species at this window - why bother, adds a little speed
393 // get connection list for first species, creating a list of nodes
394 // of second species connected to the first species at this window
395 new_nodes = all_comparisons[0][1];
396 path_src_tree[1].push_back(new_nodes);
398 // loop thru rest of species for this window to see if any paths of matches
399 // go across all species
400 // if path search tree becomes empty, break out of loop, no reason to search further
402 while ((sp_i < all_comparisons.size()) && (path tree not empty))
404 branch_i = path_src_tree[1].begin();
405 branch_end = path_src_tree[1].end();
406 while (branch_i != branch_end)
408 node_i = branch_i->begin();
409 node_end = branch_i->end();
413 // loop over all current nodes
414 // get connection list for each node
415 // loop over each previous node in list
416 // get those nodes connection list
417 // intersect previous node connections with current
422 // insert any of the paths found into the master list of paths
424 // add no paths if tmp_pathz is empty...
428 void NwayPaths::refine()
434 void NwayPaths::print(list<vector<int> >& dump_path)
436 list<vector<int> >::iterator pathz_i;
437 vector<int>::iterator path_i;
439 cout << "printing list of lists\n";
440 for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
442 for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
443 cout << *path_i << " ";