1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_class.cc -----------
13 // ----------------------------------------
15 #include "mussa_exceptions.hh"
16 #include "mussa_class.hh"
29 // set all parameters to null state
34 ana_mode = TransitiveNway;
41 fasta_indices.clear();
43 sub_seq_starts.clear();
47 // these 5 simple methods manually set the parameters for doing an analysis
48 // used so that the gui can take input from user and setup the analysis
49 // note - still need a set_append(bool, bool) method...
51 Mussa::set_name(string a_name)
53 analysis_name = a_name;
56 string Mussa::get_name()
64 if (the_seqs.size() > 0)
65 return the_seqs.size();
66 else if (seq_files.size() > 0)
67 return seq_files.size();
73 Mussa::set_window(int a_window)
78 int Mussa::get_window() const
84 Mussa::set_threshold(int a_threshold)
86 threshold = a_threshold;
87 //soft_thres = a_threshold;
90 int Mussa::get_threshold() const
96 Mussa::set_soft_thres(int sft_thres)
98 soft_thres = sft_thres;
102 Mussa::set_analysis_mode(enum analysis_modes new_ana_mode)
104 ana_mode = new_ana_mode;
107 enum Mussa::analysis_modes Mussa::get_analysis_mode() const
112 string Mussa::get_analysis_mode_name() const
117 return string("Transitive");
120 return string("Radial");
123 return string("Entropy");
126 return string("[deprecated] Recursive");
129 throw runtime_error("invalid analysis mode type");
134 const NwayPaths& Mussa::paths() const
139 // takes a string and sets it as the next seq
141 Mussa::add_a_seq(string a_seq)
146 the_seqs.push_back(aSeq);
150 // sets info for just 1 seq at a time
152 Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end)
154 seq_files.push_back(seq_file);
155 fasta_indices.push_back(fa_i);
156 annot_files.push_back(annot_file);
157 sub_seq_starts.push_back(a_start);
158 sub_seq_ends.push_back(the_end);
161 const vector<Sequence>&
162 Mussa::sequences() const
168 Mussa::load_mupa_file(string para_file_path)
171 string file_data_line;
172 string param, value, annot_file;
173 int split_index, fasta_index;
174 int sub_seq_start, sub_seq_end;
175 bool seq_params, did_seq;
178 string::size_type new_index, dir_index;
183 para_file.open(para_file_path.c_str(), ios::in);
185 // if file was opened, read the parameter values
188 // need to find the path to the .mupa file
193 new_index = (para_file_path.substr(dir_index)).find("/");
194 if (new_index != string::npos)
195 dir_index += new_index + 1;
197 parsing_path = false;
200 file_path_base = para_file_path.substr(0,dir_index);
202 // setup loop by getting file's first line
203 getline(para_file,file_data_line);
204 split_index = file_data_line.find(" ");
205 param = file_data_line.substr(0,split_index);
206 value = file_data_line.substr(split_index+1);
208 while (!para_file.eof())
211 if (param == "ANA_NAME")
212 analysis_name = value;
213 else if (param == "APPEND_WIN")
215 else if (param == "APPEND_THRES")
217 else if (param == "SEQUENCE_NUM")
218 ; // ignore sequence_num now
219 else if (param == "WINDOW")
220 window = atoi(value.c_str());
221 else if (param == "THRESHOLD")
222 threshold = atoi(value.c_str());
223 else if (param == "SEQUENCE")
225 seq_files.push_back(file_path_base + value);
226 cout << "seq_file_name " << seq_files.back() << endl;
233 while ((!para_file.eof()) && seq_params)
235 getline(para_file,file_data_line);
236 split_index = file_data_line.find(" ");
237 param = file_data_line.substr(0,split_index);
238 value = file_data_line.substr(split_index+1);
240 if (param == "FASTA_INDEX")
241 fasta_index = atoi(value.c_str());
242 else if (param == "ANNOTATION")
243 annot_file = file_path_base + value;
244 else if (param == "SEQ_START")
245 sub_seq_start = atoi(value.c_str());
246 else if (param == "SEQ_END")
248 cout << "hey! " << atoi(value.c_str()) << endl;
249 sub_seq_end = atoi(value.c_str());
251 //ignore empty lines or that start with '#'
252 else if ((param == "") || (param == "#")) {}
253 else seq_params = false;
256 fasta_indices.push_back(fasta_index);
257 annot_files.push_back(annot_file);
258 sub_seq_starts.push_back(sub_seq_start);
259 sub_seq_ends.push_back(sub_seq_end);
262 //ignore empty lines or that start with '#'
263 else if ((param == "") || (param == "#")) {}
266 cout << "Illegal/misplaced mussa parameter in file\n";
267 cout << param << "\n";
272 getline(para_file,file_data_line);
273 split_index = file_data_line.find(" ");
274 param = file_data_line.substr(0,split_index);
275 value = file_data_line.substr(split_index+1);
282 soft_thres = threshold;
283 cout << "nway mupa: analysis_name = " << analysis_name
284 << " window = " << window
285 << " threshold = " << threshold << endl;
287 // no file was loaded, signal error
290 throw mussa_load_error("Config File: " + para_file_path + " not found");
296 Mussa::analyze(int w, int t, enum Mussa::analysis_modes the_ana_mode, double new_ent_thres)
298 time_t t1, t2, begin, end;
299 double seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
303 ana_mode = the_ana_mode;
304 ent_thres = new_ent_thres;
314 load_sequence_data();
316 if (the_seqs.size() < 2) {
317 throw mussa_analysis_error("you need to have at least 2 sequences to "
320 cout << "nway ana: seq_num = " << the_seqs.size() << endl;
323 seqloadtime = difftime(t2, t1);
328 seqcomptime = difftime(t2, t1);
332 the_paths.setup(window, threshold);
335 nwaytime = difftime(t2, t1);
340 savetime = difftime(t2, t1);
343 totaltime = difftime(end, begin);
346 cout << "seqload\tseqcomp\tnway\tsave\ttotal\n";
348 //cout << setuptime << "\t";
349 cout << seqloadtime << "\t";
350 cout << seqcomptime << "\t";
351 cout << nwaytime << "\t";
352 cout << savetime << "\t";
353 cout << totaltime << "\n";
358 Mussa::load_sequence_data()
360 list<string>::iterator seq_files_i, annot_files_i;
361 list<int>::iterator fasta_indices_i, seq_starts_i, seq_ends_i;
366 seq_files_i = seq_files.begin();
367 fasta_indices_i = fasta_indices.begin();
368 annot_files_i = annot_files.begin();
369 seq_starts_i = sub_seq_starts.begin();
370 seq_ends_i = sub_seq_ends.begin();
372 while ( (seq_files_i != seq_files.end()) && (err_msg == "") )
373 /* it should be guarenteed that each of the following exist
374 should I bother checking, and how to deal with if not true...
376 (fasta_indices_i != fasta_indices.end()) &&
377 (annot_files_i != annot_files.end()) &&
378 (seq_starts_i != sub_seq_starts.end()) &&
379 (seq_ends_i != sub_seq_ends.end()) )
382 aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i, *seq_ends_i);
383 if (annot_files_i->size() > 0)
384 aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i);
386 the_seqs.push_back(aSeq);
387 cout << aSeq.get_header() << endl;
388 //cout << aSeq.get_seq() << endl;
390 ++seq_files_i; // advance all the iterators
402 vector<int> seq_lens;
403 vector<FLPs> empty_FLP_vector;
405 string save_file_string;
407 empty_FLP_vector.clear();
408 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
410 all_comps.push_back(empty_FLP_vector);
411 for(vector<Sequence>::size_type i2 = 0; i2 < the_seqs.size(); i2++)
412 all_comps[i].push_back(dummy_comp);
414 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
415 seq_lens.push_back(the_seqs[i].size());
417 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
418 for(vector<Sequence>::size_type i2 = i+1; i2 < the_seqs.size(); i2++)
420 cout << "seqcomping: " << i << " v. " << i2 << endl;
421 all_comps[i][i2].setup(window, threshold);
422 all_comps[i][i2].seqcomp(the_seqs[i].get_seq(), the_seqs[i2].get_seq(), false);
423 all_comps[i][i2].seqcomp(the_seqs[i].get_seq(),the_seqs[i2].rev_comp(),true);
430 vector<string> some_Seqs;
432 the_paths.set_soft_thres(soft_thres);
434 if (ana_mode == TransitiveNway) {
435 the_paths.trans_path_search(all_comps);
437 else if (ana_mode == RadialNway) {
438 the_paths.radiate_path_search(all_comps);
440 else if (ana_mode == EntropyNway)
442 //unlike other methods, entropy needs to look at the sequence at this stage
444 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
446 some_Seqs.push_back(the_seqs[i].get_seq());
449 the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
450 the_paths.entropy_path_search(all_comps);
453 // old recursive transitive analysis function
454 else if (ana_mode == RecursiveNway)
455 the_paths.find_paths_r(all_comps);
457 the_paths.simple_refine();
463 string save_name, save_path, create_dir_cmd, flp_filepath;
465 ostringstream append_info;
466 int dir_create_status;
469 // not sure why, but gotta close file each time since can't pass file streams
471 save_name = analysis_name;
473 // gotta do bit with adding win & thres if to be appended
477 append_info << "_w" << window;
478 save_name += append_info.str();
484 append_info << "_t" << threshold;
485 save_name += append_info.str();
488 //#include <stdlib.h>
489 // ******* use appropriate for os ------- 1 of 4
490 // the additions for osX make it more sane where it saves the analysis
491 // will come up with a cleaner sol'n later...
492 create_dir_cmd = "mkdir " + save_name; //linux
493 //create_dir_cmd = "mkdir " + file_path_base + save_name; //osX
495 dir_create_status = system( (const char*) create_dir_cmd.c_str());
496 cout << "action: " << dir_create_status << endl;
498 // save sequence and annots to a special mussa file
500 // ******** use appropriate for OS ---------- 2 of 4
501 save_path = save_name + "/" + save_name + ".museq"; //linux
502 //save_path = file_path_base + save_name + "/" + save_name + ".museq"; //osX
504 save_file.open(save_path.c_str(), ios::out);
505 save_file << "<Mussa_Sequence>" << endl;
508 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
510 the_seqs[i].save(save_file);
513 //save_file.open(save_path.c_str(), ios::app);
514 save_file << "</Mussa_Sequence>" << endl;
517 // save nway paths to its mussa save file
519 // ******** use appropriate for OS -------- 3 of 4
520 save_path = save_name + "/" + save_name + ".muway"; //linux
521 //save_path = file_path_base + save_name + "/" + save_name + ".muway"; //os X
522 the_paths.save(save_path);
524 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
525 for(vector<Sequence>::size_type i2 = i+1; i2 < the_seqs.size(); i2++)
528 append_info << "_sp_" << i << "v" << i2;
529 // ******** use appropriate for OS --------- 4 of 4
531 save_path = save_name + "/" + save_name + append_info.str() + ".flp";
533 //save_path = file_path_base + save_name + "/" + save_name + append_info.str() + ".flp";
534 all_comps[i][i2].save(save_path);
539 Mussa::save_muway(string save_path)
541 the_paths.save(save_path);
545 Mussa::load(string ana_file)
548 string::size_type start_index, end_index;
549 string file_path_base, a_file_path, ana_path;
553 ostringstream append_info;
554 vector<FLPs> empty_FLP_vector;
557 cout << "ana_file name " << ana_file << endl;
560 end_index = ana_path.size()-1;
561 if (ana_path[end_index] == '/') {
564 start_index = ana_path.rfind('/', end_index);
565 if (start_index == string::npos) {
569 // skip the / we found
572 analysis_name = ana_path.substr(start_index, end_index-start_index+1);
573 cout << " ana_name " << analysis_name << endl;
574 file_path_base = ana_path.substr(0, start_index) + analysis_name
575 + "/" + analysis_name;
576 a_file_path = file_path_base + ".muway";
577 cout << " loading museq: " << a_file_path << endl;
578 the_paths.load(a_file_path);
580 int seq_num = the_paths.sequence_count();
582 a_file_path = file_path_base + ".museq";
584 // this is a bit of a hack due to C++ not acting like it should with files
585 for (i = 1; i <= seq_num; i++)
588 cout << "mussa_class: loading museq frag... " << a_file_path << endl;
589 tmp_seq.load_museq(a_file_path, i);
590 the_seqs.push_back(tmp_seq);
593 empty_FLP_vector.clear();
594 for(i = 0; i < seq_num; i++)
596 all_comps.push_back(empty_FLP_vector);
597 for(i2 = 0; i2 < seq_num; i2++)
598 all_comps[i].push_back(dummy_comp);
601 for(i = 0; i < seq_num; i++)
603 for(i2 = i+1; i2 < seq_num; i2++)
606 append_info << "_sp_" << i << "v" << i2;
607 cout << append_info.str() << endl;
608 a_file_path = file_path_base + append_info.str() + ".flp";
609 all_comps[i][i2].load(a_file_path);
610 cout << "real size = " << all_comps[i][i2].size() << endl;
621 save_file.open(analysis_name.c_str(), ios::out);
623 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
624 save_file << the_seqs[i].get_seq() << endl;
626 save_file << window << endl;
628 //note more complex eventually since analysis_name may need to have
629 //window size, threshold and other stuff to modify it...
630 the_paths.save_old(analysis_name);
635 Mussa::load_old(char * load_file_path, int s_num)
638 string file_data_line;
639 int i, space_split_i, comma_split_i;
640 vector<int> loaded_path;
641 string node_pair, node;
645 the_paths.setup(0, 0);
646 save_file.open(load_file_path, ios::in);
648 // currently loads old mussa format
651 for(i = 0; i < seq_num; i++)
653 getline(save_file, file_data_line);
654 a_seq.set_seq(file_data_line);
655 the_seqs.push_back(a_seq);
659 getline(save_file, file_data_line);
660 window = atoi(file_data_line.c_str());
663 while (!save_file.eof())
666 getline(save_file, file_data_line);
667 if (file_data_line != "")
668 for(i = 0; i < seq_num; i++)
670 space_split_i = file_data_line.find(" ");
671 node_pair = file_data_line.substr(0,space_split_i);
672 //cout << "np= " << node_pair;
673 comma_split_i = node_pair.find(",");
674 node = node_pair.substr(comma_split_i+1);
675 //cout << "n= " << node << " ";
676 loaded_path.push_back(atoi (node.c_str()));
677 file_data_line = file_data_line.substr(space_split_i+1);
680 // FIXME: do we have any information about what the threshold should be?
681 the_paths.add_path(0, loaded_path);
685 //the_paths.save("tmp.save");