1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_class.cc -----------
13 // ----------------------------------------
15 #include "mussa_class.hh"
27 // set all parameters to null state
39 fasta_indices.clear();
41 sub_seq_starts.clear();
45 // these 5 simple methods manually set the parameters for doing an analysis
46 // used so that the gui can take input from user and setup the analysis
47 // note - still need a set_append(bool, bool) method...
49 Mussa::set_name(string a_name)
55 Mussa::set_seq_num(int a_num)
61 Mussa::set_window(int a_window)
67 Mussa::set_threshold(int a_threshold)
69 threshold = a_threshold;
70 //soft_thres = a_threshold;
74 Mussa::set_soft_thres(int sft_thres)
76 soft_thres = sft_thres;
80 Mussa::set_ana_mode(char new_ana_mode)
82 ana_mode = new_ana_mode;
85 // takes a string and sets it as the next seq - no AGCTN checking atm
87 Mussa::add_a_seq(string a_seq)
92 the_Seqs.push_back(aSeq);
96 // sets info for just 1 seq at a time
98 Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end)
100 seq_files.push_back(seq_file);
101 fasta_indices.push_back(fa_i);
102 annot_files.push_back(annot_file);
103 sub_seq_starts.push_back(a_start);
104 sub_seq_ends.push_back(the_end);
108 Mussa::load_mupa_file(string para_file_path)
111 string file_data_line;
112 string param, value, annot_file;
113 int split_index, fasta_index;
114 int sub_seq_start, sub_seq_end;
115 bool seq_params, did_seq;
119 int new_index, dir_index;
125 para_file.open(para_file_path.c_str(), ios::in);
127 // if file was opened, read the parameter values
130 // need to find the path to the .mupa file
135 new_index = (para_file_path.substr(dir_index)).find("/");
136 //cout << "mu class: "<< new_index << endl;
138 dir_index += new_index + 1;
140 parsing_path = false;
143 file_path_base = para_file_path.substr(0,dir_index);
144 cout << "mu class: mupa base path = " << file_path_base << endl;
146 // setup loop by getting file's first line
147 getline(para_file,file_data_line);
148 split_index = file_data_line.find(" ");
149 param = file_data_line.substr(0,split_index);
150 value = file_data_line.substr(split_index+1);
152 while (!para_file.eof())
155 if (param == "ANA_NAME")
157 else if (param == "APPEND_WIN")
159 else if (param == "APPEND_THRES")
161 else if (param == "SEQUENCE_NUM")
162 seq_num = atoi(value.c_str());
163 else if (param == "WINDOW")
164 window = atoi(value.c_str());
165 else if (param == "THRESHOLD")
166 threshold = atoi(value.c_str());
167 else if (param == "SEQUENCE")
169 seq_files.push_back(file_path_base + value);
176 while ((!para_file.eof()) && seq_params)
178 getline(para_file,file_data_line);
179 split_index = file_data_line.find(" ");
180 param = file_data_line.substr(0,split_index);
181 value = file_data_line.substr(split_index+1);
183 if (param == "FASTA_INDEX")
184 fasta_index = atoi(value.c_str());
185 else if (param == "ANNOTATION")
186 annot_file = file_path_base + value;
187 else if (param == "SEQ_START")
188 sub_seq_start = atoi(value.c_str());
189 else if (param == "SEQ_END")
191 cout << "hey! " << atoi(value.c_str()) << endl;
192 sub_seq_end = atoi(value.c_str());
194 //ignore empty lines or that start with '#'
195 else if ((param == "") || (param == "#")) {}
196 else seq_params = false;
199 fasta_indices.push_back(fasta_index);
200 annot_files.push_back(annot_file);
201 sub_seq_starts.push_back(sub_seq_start);
202 sub_seq_ends.push_back(sub_seq_end);
205 //ignore empty lines or that start with '#'
206 else if ((param == "") || (param == "#")) {}
209 cout << "Illegal/misplaced mussa parameter in file\n";
210 cout << param << "\n";
215 getline(para_file,file_data_line);
216 split_index = file_data_line.find(" ");
217 param = file_data_line.substr(0,split_index);
218 value = file_data_line.substr(split_index+1);
225 soft_thres = threshold;
226 cout << "nway mupa: ana_name = " << ana_name << " seq_num = " << seq_num;
227 cout << " window = " << window << " threshold = " << threshold << endl;
232 // no file was loaded, signal error
235 err_msg = "Config File: " + para_file_path + " not found";
240 // if (!((param == "") || (param == "#")))
241 // cout << value << " = " << param << endl;
245 Mussa::analyze(int w, int t, char the_ana_mode, double new_ent_thres)
248 time_t t1, t2, begin, end;
249 double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
252 cout << "nway ana: seq_num = " << seq_num << endl;
256 // now loading parameters from file must be done separately
258 //err_msg = load_mupa_file(para_file_path);
260 //setuptime = difftime(t2, t1);
262 ana_mode = the_ana_mode;
263 ent_thres = new_ent_thres;
276 err_msg = get_Seqs();
278 seqloadtime = difftime(t2, t1);
287 seqcomptime = difftime(t2, t1);
292 cout << "nway ana: seq_num = " << seq_num << endl;
293 the_paths.setup(seq_num, window, threshold);
296 nwaytime = difftime(t2, t1);
303 savetime = difftime(t2, t1);
306 totaltime = difftime(end, begin);
309 cout << "seqload\tseqcomp\tnway\tsave\ttotal\n";
311 //cout << setuptime << "\t";
312 cout << seqloadtime << "\t";
313 cout << seqcomptime << "\t";
314 cout << nwaytime << "\t";
315 cout << savetime << "\t";
316 cout << totaltime << "\n";
329 list<string>::iterator seq_files_i, annot_files_i;
330 list<int>::iterator fasta_indices_i, seq_starts_i, seq_ends_i;
335 seq_files_i = seq_files.begin();
336 fasta_indices_i = fasta_indices.begin();
337 annot_files_i = annot_files.begin();
338 seq_starts_i = sub_seq_starts.begin();
339 seq_ends_i = sub_seq_ends.begin();
343 while ( (seq_files_i != seq_files.end()) && (err_msg == "") )
344 /* it should be guarenteed that each of the following exist
345 should I bother checking, and how to deal with if not true...
347 (fasta_indices_i != fasta_indices.end()) &&
348 (annot_files_i != annot_files.end()) &&
349 (seq_starts_i != sub_seq_starts.end()) &&
350 (seq_ends_i != sub_seq_ends.end()) )
353 err_msg = aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,
357 if (*annot_files_i != "")
358 err_msg = aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i);
362 the_Seqs.push_back(aSeq);
363 cout << aSeq.hdr() << endl;
364 //cout << aSeq.seq() << endl;
366 ++seq_files_i; // advance all the iterators
386 int i, i2; // loop vars over sequences to analyze
387 vector<int> seq_lens;
388 vector<FLPs> empty_FLP_vector;
390 string save_file_string;
392 empty_FLP_vector.clear();
393 for(i = 0; i < seq_num; i++)
395 all_comps.push_back(empty_FLP_vector);
396 for(i2 = 0; i2 < seq_num; i2++)
397 all_comps[i].push_back(dummy_comp);
399 for(i = 0; i < seq_num; i++)
400 seq_lens.push_back(the_Seqs[i].len());
402 for(i = 0; i < seq_num; i++)
403 for(i2 = i+1; i2 < seq_num; i2++)
405 cout << "seqcomping: " << i << " v. " << i2 << endl;
406 all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]);
407 all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false);
408 all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true);
417 vector<string> some_Seqs;
421 cout << "nway: ana mode = " << ana_mode << endl;
422 cout << "nway: seq_num = " << seq_num << endl;
424 the_paths.set_soft_thres(soft_thres);
427 the_paths.trans_path_search(all_comps);
429 else if (ana_mode == 'r')
430 the_paths.radiate_path_search(all_comps);
432 else if (ana_mode == 'e')
434 //unlike other methods, entropy needs to look at the sequence at this stage
436 for(i = 0; i < seq_num; i++)
437 some_Seqs.push_back(the_Seqs[i].seq());
439 the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
440 the_paths.entropy_path_search(all_comps);
443 // old recursive transitive analysis function
444 else if (ana_mode == 'o')
445 the_paths.find_paths_r(all_comps);
447 the_paths.simple_refine();
453 string save_name, save_path, create_dir_cmd, flp_filepath;
455 ostringstream append_info;
456 int i, i2, dir_create_status;
459 // not sure why, but gotta close file each time since can't pass file streams
461 save_name = ana_name;
463 // gotta do bit with adding win & thres if to be appended
467 append_info << "_w" << window;
468 save_name += append_info.str();
474 append_info << "_t" << threshold;
475 save_name += append_info.str();
478 //#include <stdlib.h>
479 // ******* use appropriate for os ------- 1 of 4
480 // the additions for osX make it more sane where it saves the analysis
481 // will come up with a cleaner sol'n later...
482 create_dir_cmd = "mkdir " + save_name; //linux
483 //create_dir_cmd = "mkdir " + file_path_base + save_name; //osX
485 dir_create_status = system( (const char*) create_dir_cmd.c_str());
486 cout << "action: " << dir_create_status << endl;
488 // save sequence and annots to a special mussa file
490 // ******** use appropriate for OS ---------- 2 of 4
491 save_path = save_name + "/" + save_name + ".museq"; //linux
492 //save_path = file_path_base + save_name + "/" + save_name + ".museq"; //osX
494 save_file.open(save_path.c_str(), ios::out);
495 save_file << "<Mussa_Sequence>" << endl;
498 for(i = 0; i < seq_num; i++)
499 the_Seqs[i].save(save_file);
501 //save_file.open(save_path.c_str(), ios::app);
502 save_file << "</Mussa_Sequence>" << endl;
505 // save nway paths to its mussa save file
507 // ******** use appropriate for OS -------- 3 of 4
508 save_path = save_name + "/" + save_name + ".muway"; //linux
509 //save_path = file_path_base + save_name + "/" + save_name + ".muway"; //os X
510 the_paths.save(save_path);
512 for(i = 0; i < seq_num; i++)
513 for(i2 = i+1; i2 < seq_num; i2++)
516 append_info << "_sp_" << i << "v" << i2;
517 // ******** use appropriate for OS --------- 4 of 4
519 save_path = save_name + "/" + save_name + append_info.str() + ".flp";
521 //save_path = file_path_base + save_name + "/" + save_name + append_info.str() + ".flp";
522 all_comps[i][i2].file_save(save_path);
527 Mussa::save_muway(string save_path)
529 the_paths.save(save_path);
533 Mussa::load(string ana_file)
535 int i, i2, new_index, dir_index;
536 string file_path_base, a_file_path, ana_path;
540 ostringstream append_info;
541 vector<FLPs> empty_FLP_vector;
550 new_index = (ana_path.substr(dir_index)).find("/");
551 //cout << "mu class: "<< new_index << endl;
553 dir_index += new_index + 1;
555 parsing_path = false;
558 ana_name = ana_path.substr(dir_index);
559 cout << "mu class: ana_name = " << ana_name << endl;
560 file_path_base = ana_path + "/" + ana_path.substr(dir_index);
561 a_file_path = file_path_base + ".muway";
562 err_msg = the_paths.load(a_file_path);
563 cout << "there is no safe distance\n";
567 seq_num = the_paths.seq_num();
571 a_file_path = file_path_base + ".museq";
573 // this is a bit of a hack due to C++ not acting like it should with files
574 for (i = 1; i <= seq_num; i++)
577 cout << "mussa_class: loading the fucking museq frag...\n";
578 tmp_seq.load_museq(a_file_path, i);
579 the_Seqs.push_back(tmp_seq);
582 empty_FLP_vector.clear();
583 for(i = 0; i < seq_num; i++)
585 all_comps.push_back(empty_FLP_vector);
586 for(i2 = 0; i2 < seq_num; i2++)
587 all_comps[i].push_back(dummy_comp);
590 for(i = 0; i < seq_num; i++)
591 for(i2 = i+1; i2 < seq_num; i2++)
594 append_info << "_sp_" << i << "v" << i2;
595 cout << append_info.str() << endl;
596 a_file_path = file_path_base + append_info.str() + ".flp";
597 all_comps[i][i2].file_load(a_file_path);
598 cout << "real size = " << all_comps[i][i2].all_matches.size() << endl;
623 save_file.open(ana_name.c_str(), ios::out);
625 for(i = 0; i < seq_num; i++)
626 save_file << the_Seqs[i].seq() << endl;
628 save_file << window << endl;
630 //note more complex eventually since ana_name may need to have
631 //window size, threshold and other stuff to modify it...
632 the_paths.save_old(ana_name);
637 Mussa::load_old(char * load_file_path, int s_num)
640 string file_data_line;
641 int i, space_split_i, comma_split_i;
642 vector<int> loaded_path;
643 string node_pair, node;
647 the_paths.setup(seq_num, 0, 0);
648 save_file.open(load_file_path, ios::in);
650 // currently loads old mussa format
653 for(i = 0; i < seq_num; i++)
655 getline(save_file, file_data_line);
656 a_seq.set_seq(file_data_line);
657 the_Seqs.push_back(a_seq);
661 getline(save_file, file_data_line);
662 window = atoi(file_data_line.c_str());
665 while (!save_file.eof())
668 getline(save_file, file_data_line);
669 if (file_data_line != "")
670 for(i = 0; i < seq_num; i++)
672 space_split_i = file_data_line.find(" ");
673 node_pair = file_data_line.substr(0,space_split_i);
674 //cout << "np= " << node_pair;
675 comma_split_i = node_pair.find(",");
676 node = node_pair.substr(comma_split_i+1);
677 //cout << "n= " << node << " ";
678 loaded_path.push_back(atoi (node.c_str()));
679 file_data_line = file_data_line.substr(space_split_i+1);
682 the_paths.add_path(loaded_path);
686 //the_paths.save("tmp.save");