1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_nway.cc -----------
13 // ----------------------------------------
15 #include <boost/filesystem/fstream.hpp>
16 namespace fs = boost::filesystem;
18 #include "alg/mussa_callback.hpp"
19 #include "alg/nway_paths.hpp"
20 #include "alg/conserved_path.hpp"
21 #include "mussa_exceptions.hpp"
28 NwayPaths::NwayPaths()
32 NwayPaths::NwayPaths(const NwayPaths &o)
34 refined_pathz(o.refined_pathz),
35 threshold(o.threshold),
37 soft_thres(o.soft_thres),
38 ent_thres(o.ent_thres),
39 c_sequences(o.c_sequences)
43 void NwayPaths::clear()
47 refined_pathz.clear();
51 NwayPaths::setup(int w, int t)
54 soft_thres = threshold;
58 //cout << "nway: thres = " << threshold
59 // << ", soft threo = " << soft_thres << endl;
63 NwayPaths::set_soft_threshold(int sft_thres)
65 soft_thres = sft_thres;
68 int NwayPaths::get_soft_threshold() const
73 int NwayPaths::get_threshold() const
78 int NwayPaths::get_window() const
83 // dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
84 // doesn't deal with interleaved adjacency
86 NwayPaths::simple_refine()
88 // ext_path remembers the first window set in an extending path
89 ConservedPath ext_path, new_path;
90 list<ConservedPath>::iterator cur_path, next_path;
91 list<ConservedPath>::iterator pathz_i;
93 bool extending, end = false;
95 refined_pathz.clear();
97 //cout << "path number is: " << pathz.size() << endl;
98 pathz_i = pathz.begin();
101 // only try to extend when pathz isn't empty.
102 if (pathz_i != pathz.end())
107 while(pathz_i != pathz.end())
109 // keep track of current path and advance to next path
114 if (pathz_i == pathz.end()) {
119 // if node for each seq is equal to the next node+1 then for all
120 // sequences then we are extending
121 extending = cur_path->nextTo(*next_path);
130 // add the extend window length as first element and add as refined
131 // now that we have the path to extend save it
133 new_path.extend(win_ext_len);
134 refined_pathz.push_back(new_path);
138 ext_path = *next_path;
141 if ((path_count % 100) == 0)
142 emit progress("refine", path_count-1, pathz.size());
145 // this mysterious call tells the dialog box that we're actually done
146 emit progress("refine", pathz.size(), pathz.size());
147 //cout << "r_path number is: " << refined_pathz.size() << endl;
151 NwayPaths::add_path(int threshold, vector<int>& loaded_path)
153 pathz.push_back(ConservedPath(threshold, 0.0, loaded_path));
157 NwayPaths::add_path(ConservedPath loaded_path)
159 pathz.push_back(loaded_path);
164 NwayPaths::save(fs::path save_file_path)
166 fs::fstream save_file;
167 list<ConservedPath >::iterator path_i, paths_end;
169 save_file.open(save_file_path, ios::out);
171 save_file << "<Mussa type=flp seq_count=" << sequence_count();
172 save_file << " win=" << win_size;
173 // add a function para new_thres defaults to -1 to later deal with
174 // reanalysis with higher thres - if statement whether to record base thres
175 // or new thres (ie if -1, then base)
176 save_file << " thres=" << threshold << " soft_thres=" << soft_thres << " >\n";
178 path_i = refined_pathz.begin();
179 paths_end = refined_pathz.end();
180 //path_i = pathz.begin();
181 //paths_end = pathz.end();
182 while (path_i != paths_end)
184 ConservedPath& a_path = *path_i;
185 //cout << a_path.size() << endl;
186 //first entry is the window length of the windows in the path
187 save_file << a_path.window_size << ":";
188 for(size_type i = 0; i != sequence_count(); ++i)
190 save_file << a_path[i];
191 if (i != sequence_count())
198 save_file << "</Mussa>\n";
203 NwayPaths::size_type NwayPaths::sequence_count() const
205 if (refined_pathz.begin() == refined_pathz.end() )
208 return refined_pathz.begin()->size();
211 NwayPaths::size_type NwayPaths::size() const
217 NwayPaths::load(fs::path load_file_path)
219 fs::fstream load_file;
220 string file_data_line, header_data, data, path_node, path_width;
221 int space_split_i, equal_split_i, comma_split_i, colon_split_i;
222 vector<int> loaded_path;
224 load_file.open(load_file_path, ios::in);
228 throw mussa_load_error("Sequence File: " + load_file_path.string() + " not found");
233 // grab mussa tag - discard for now...maybe check in future...
234 getline(load_file,file_data_line);
235 space_split_i = file_data_line.find(" ");
236 file_data_line = file_data_line.substr(space_split_i+1);
237 // grab type tag - need in future to distinguish between flp and vlp paths
238 space_split_i = file_data_line.find(" ");
239 file_data_line = file_data_line.substr(space_split_i+1);
240 // get species/seq number
241 space_split_i = file_data_line.find(" ");
242 header_data = file_data_line.substr(0,space_split_i);
243 equal_split_i = header_data.find("=");
244 data = file_data_line.substr(equal_split_i+1);
245 unsigned int species_num = atoi (data.c_str());
246 file_data_line = file_data_line.substr(space_split_i+1);
248 space_split_i = file_data_line.find(" ");
249 header_data = file_data_line.substr(0,space_split_i);
250 equal_split_i = header_data.find("=");
251 data = file_data_line.substr(equal_split_i+1);
252 win_size = atoi (data.c_str());
253 file_data_line = file_data_line.substr(space_split_i+1);
254 // get threshold size
255 space_split_i = file_data_line.find(" ");
256 header_data = file_data_line.substr(0,space_split_i);
257 equal_split_i = header_data.find("=");
258 data = file_data_line.substr(equal_split_i+1);
259 threshold = atoi (data.c_str());
260 file_data_line = file_data_line.substr(space_split_i+1);
262 //std::cout << "file_data_line: " << file_data_line << "\n";
263 //std::cout << "find(\">\"): " << file_data_line.find(">") << "\n";
264 if (file_data_line.find(">") != 0)
266 space_split_i = file_data_line.find(" ");
267 header_data = file_data_line.substr(0,space_split_i);
268 equal_split_i = header_data.find("=");
269 data = file_data_line.substr(equal_split_i+1);
270 soft_thres = atoi (data.c_str());
271 file_data_line = file_data_line.substr(space_split_i+1);
275 soft_thres = threshold;
277 //std::cout << "nway_soft_thres: " << soft_thres << "\n";
278 //cout << "seq_num=" << species_num << " win=" << win_size;
279 //cout << " thres=" << threshold << endl;
281 // clear out the current data
282 refined_pathz.clear();
286 getline(load_file,file_data_line);
287 while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
289 if (file_data_line != "")
292 colon_split_i = file_data_line.find(":");
293 // whats our window size?
294 path_width = file_data_line.substr(0,colon_split_i);
295 file_data_line = file_data_line.substr(colon_split_i+1);
296 for(size_type i = 0; i < species_num; i++)
298 comma_split_i = file_data_line.find(",");
299 path_node = file_data_line.substr(0, comma_split_i);
300 temp = atoi (path_node.c_str());
301 loaded_path.push_back(temp);
302 file_data_line = file_data_line.substr(comma_split_i+1);
304 assert (loaded_path.size() == species_num );
305 refined_pathz.push_back(ConservedPath(atoi(path_width.c_str()),
309 getline(load_file,file_data_line);
317 NwayPaths::path_search(vector<vector<FLPs> > all_comparisons, ConservedPath path, size_type depth)
319 list<int> new_nodes, trans_check_nodes;
320 list<int>::iterator new_nodes_i, new_nodes_end;
321 bool trans_check_good;
323 new_nodes = all_comparisons[depth - 1][depth].match_locations(path[depth-1]);
324 new_nodes_i = new_nodes.begin();
325 new_nodes_end = new_nodes.end();
326 while(new_nodes_i != new_nodes_end)
328 //cout << " * species " << depth << " node: " << *new_nodes_i << endl;
329 // check transitivity with previous nodes in path
330 trans_check_good = true;
331 for(size_type i = 0; i < depth - 1; i++)
333 trans_check_nodes = all_comparisons[i][depth].match_locations(path[i]);
334 if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
335 trans_check_nodes.end(),
337 (trans_check_nodes.end() == find(trans_check_nodes.begin(),
338 trans_check_nodes.end(),
339 *new_nodes_i * -1) ) )
340 trans_check_good = false;
343 if (trans_check_good)
345 // this makes sure path nodes are recorded with RC status relative to
347 if ( path[depth-1] >= 0)
348 path.push_back(*new_nodes_i);
350 path.push_back(*new_nodes_i * -1);
352 if (depth < all_comparisons.size() - 1)
353 path_search(all_comparisons, path, depth + 1);
355 pathz.push_back(path);
361 /* use this if I ever get the friggin seqcomp match lists to sort...
362 if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(),
367 NwayPaths::find_paths_r(vector<vector<FLPs> > all_comparisons)
370 int win_i, window_num;
372 list<int>::iterator new_nodes_i, new_nodes_end;
375 window_num = all_comparisons[0][1].size();
376 // loop thru all windows in first species
377 for (win_i = 0; win_i < window_num; win_i++)
380 path.push_back(win_i);
381 new_nodes = all_comparisons[0][1].match_locations(path[0]);
382 new_nodes_i = new_nodes.begin();
383 new_nodes_end = new_nodes.end();
384 //if (new_nodes_i != new_nodes_end)
385 //cout << "* species 0 node: " << win_i << endl;
387 while(new_nodes_i != new_nodes_end)
389 //cout << " * species 1 node: " << *new_nodes_i << endl;
390 path[1] = *new_nodes_i;
391 path_search(all_comparisons, path, 2);
399 NwayPaths::save_old(fs::path save_file_path)
401 fs::fstream save_file;
402 list<ConservedPath >::iterator path_i, paths_end;
405 save_file.open(save_file_path, ios::app);
407 path_i = pathz.begin();
408 paths_end = pathz.end();
409 while(path_i != paths_end)
411 ConservedPath& a_path = *path_i;
412 //cout << a_path.size() << endl;
413 for(i = 0; i < path_i->size(); ++i)
414 save_file << i << "," << a_path[i] << " ";
424 NwayPaths::find_paths(vector<vector<FLPs> > all_comparisons)
427 vector<list<list<int> > > path_src_tree;
429 list<int>::iterator node_i, node_end;
430 <list<list<int> > >::iterator branch_i, branch_end;
433 path_src_tree.reserve(all_comparisons.size()- 1);
436 // loop thru all windows in first species
437 for (win_i = 0; win_i < window_num; win_i++)
439 // clear the path search tree
440 for(i = 0; i < all_comparisons.size(); i++)
441 path_src_tree[i].clear();
443 // top level kept empty even tho implicity has one entry of the first
444 // species at this window - why bother, adds a little speed
446 // get connection list for first species, creating a list of nodes
447 // of second species connected to the first species at this window
448 new_nodes = all_comparisons[0][1];
449 path_src_tree[1].push_back(new_nodes);
451 // loop thru rest of species for this window to see if any paths of matches
452 // go across all species
453 // if path search tree becomes empty, break out of loop, no reason to search further
455 while ((sp_i < all_comparisons.size()) && (path tree not empty))
457 branch_i = path_src_tree[1].begin();
458 branch_end = path_src_tree[1].end();
459 while (branch_i != branch_end)
461 node_i = branch_i->begin();
462 node_end = branch_i->end();
466 // loop over all current nodes
467 // get connection list for each node
468 // loop over each previous node in list
469 // get those nodes connection list
470 // intersect previous node connections with current
475 // insert any of the paths found into the master list of paths
477 // add no paths if tmp_pathz is empty...
481 void NwayPaths::refine()
487 void NwayPaths::print(list<vector<int> >& dump_path)
489 list<vector<int> >::iterator pathz_i;
490 vector<int>::iterator path_i;
492 cout << "printing list of lists\n";
493 for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
495 for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
496 cout << *path_i << " ";