1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_class.cc -----------
13 // ----------------------------------------
15 #include "mussa_class.hh"
22 // set all parameters to null state
34 fasta_indices.clear();
36 sub_seq_starts.clear();
40 // these 5 simple methods manually set the parameters for doing an analysis
41 // used so that the gui can take input from user and setup the analysis
42 // note - still need a set_append(bool, bool) method...
44 Mussa::set_name(string a_name)
50 Mussa::set_seq_num(int a_num)
56 Mussa::set_window(int a_window)
62 Mussa::set_threshold(int a_threshold)
64 threshold = a_threshold;
65 //soft_thres = a_threshold;
69 Mussa::set_soft_thres(int sft_thres)
71 soft_thres = sft_thres;
75 Mussa::set_ana_mode(char new_ana_mode)
77 ana_mode = new_ana_mode;
80 // takes a string and sets it as the next seq - no AGCTN checking atm
82 Mussa::add_a_seq(string a_seq)
87 the_Seqs.push_back(aSeq);
91 // sets info for just 1 seq at a time
93 Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end)
95 seq_files.push_back(seq_file);
96 fasta_indices.push_back(fa_i);
97 annot_files.push_back(annot_file);
98 sub_seq_starts.push_back(a_start);
99 sub_seq_ends.push_back(the_end);
103 Mussa::load_mupa_file(string para_file_path)
106 string file_data_line;
107 string param, value, annot_file;
108 int split_index, fasta_index;
109 int sub_seq_start, sub_seq_end;
110 bool seq_params, did_seq;
114 int new_index, dir_index;
120 para_file.open(para_file_path.c_str(), ios::in);
122 // if file was opened, read the parameter values
125 // need to find the path to the .mupa file
130 new_index = (para_file_path.substr(dir_index)).find("/");
131 //cout << "mu class: "<< new_index << endl;
133 dir_index += new_index + 1;
135 parsing_path = false;
138 file_path_base = para_file_path.substr(0,dir_index);
139 cout << "mu class: mupa base path = " << file_path_base << endl;
141 // setup loop by getting file's first line
142 getline(para_file,file_data_line);
143 split_index = file_data_line.find(" ");
144 param = file_data_line.substr(0,split_index);
145 value = file_data_line.substr(split_index+1);
147 while (!para_file.eof())
150 if (param == "ANA_NAME")
152 else if (param == "APPEND_WIN")
154 else if (param == "APPEND_THRES")
156 else if (param == "SEQUENCE_NUM")
157 seq_num = atoi(value.c_str());
158 else if (param == "WINDOW")
159 window = atoi(value.c_str());
160 else if (param == "THRESHOLD")
161 threshold = atoi(value.c_str());
162 else if (param == "SEQUENCE")
164 seq_files.push_back(file_path_base + value);
171 while ((!para_file.eof()) && seq_params)
173 getline(para_file,file_data_line);
174 split_index = file_data_line.find(" ");
175 param = file_data_line.substr(0,split_index);
176 value = file_data_line.substr(split_index+1);
178 if (param == "FASTA_INDEX")
179 fasta_index = atoi(value.c_str());
180 else if (param == "ANNOTATION")
181 annot_file = file_path_base + value;
182 else if (param == "SEQ_START")
183 sub_seq_start = atoi(value.c_str());
184 else if (param == "SEQ_END")
186 cout << "hey! " << atoi(value.c_str()) << endl;
187 sub_seq_end = atoi(value.c_str());
189 //ignore empty lines or that start with '#'
190 else if ((param == "") || (param == "#")) {}
191 else seq_params = false;
194 fasta_indices.push_back(fasta_index);
195 annot_files.push_back(annot_file);
196 sub_seq_starts.push_back(sub_seq_start);
197 sub_seq_ends.push_back(sub_seq_end);
200 //ignore empty lines or that start with '#'
201 else if ((param == "") || (param == "#")) {}
204 cout << "Illegal/misplaced mussa parameter in file\n";
205 cout << param << "\n";
210 getline(para_file,file_data_line);
211 split_index = file_data_line.find(" ");
212 param = file_data_line.substr(0,split_index);
213 value = file_data_line.substr(split_index+1);
220 soft_thres = threshold;
221 cout << "nway mupa: ana_name = " << ana_name << " seq_num = " << seq_num;
222 cout << " window = " << window << " threshold = " << threshold << endl;
227 // no file was loaded, signal error
230 err_msg = "Config File: " + para_file_path + " not found";
235 // if (!((param == "") || (param == "#")))
236 // cout << value << " = " << param << endl;
240 Mussa::analyze(int w, int t, char the_ana_mode, double new_ent_thres)
243 time_t t1, t2, begin, end;
244 double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
247 cout << "nway ana: seq_num = " << seq_num << endl;
251 // now loading parameters from file must be done separately
253 //err_msg = load_mupa_file(para_file_path);
255 //setuptime = difftime(t2, t1);
257 ana_mode = the_ana_mode;
258 ent_thres = new_ent_thres;
271 err_msg = get_Seqs();
273 seqloadtime = difftime(t2, t1);
282 seqcomptime = difftime(t2, t1);
287 cout << "nway ana: seq_num = " << seq_num << endl;
288 the_paths.setup(seq_num, window, threshold);
291 nwaytime = difftime(t2, t1);
298 savetime = difftime(t2, t1);
301 totaltime = difftime(end, begin);
304 cout << "seqload\tseqcomp\tnway\tsave\ttotal\n";
306 //cout << setuptime << "\t";
307 cout << seqloadtime << "\t";
308 cout << seqcomptime << "\t";
309 cout << nwaytime << "\t";
310 cout << savetime << "\t";
311 cout << totaltime << "\n";
324 list<string>::iterator seq_files_i, annot_files_i;
325 list<int>::iterator fasta_indices_i, seq_starts_i, seq_ends_i;
330 seq_files_i = seq_files.begin();
331 fasta_indices_i = fasta_indices.begin();
332 annot_files_i = annot_files.begin();
333 seq_starts_i = sub_seq_starts.begin();
334 seq_ends_i = sub_seq_ends.begin();
338 while ( (seq_files_i != seq_files.end()) && (err_msg == "") )
339 /* it should be guarenteed that each of the following exist
340 should I bother checking, and how to deal with if not true...
342 (fasta_indices_i != fasta_indices.end()) &&
343 (annot_files_i != annot_files.end()) &&
344 (seq_starts_i != sub_seq_starts.end()) &&
345 (seq_ends_i != sub_seq_ends.end()) )
348 err_msg = aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,
352 if (*annot_files_i != "")
353 err_msg = aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i);
357 the_Seqs.push_back(aSeq);
358 cout << aSeq.hdr() << endl;
359 //cout << aSeq.seq() << endl;
361 ++seq_files_i; // advance all the iterators
381 int i, i2; // loop vars over sequences to analyze
382 vector<int> seq_lens;
383 vector<FLPs> empty_FLP_vector;
385 string save_file_string;
387 empty_FLP_vector.clear();
388 for(i = 0; i < seq_num; i++)
390 all_comps.push_back(empty_FLP_vector);
391 for(i2 = 0; i2 < seq_num; i2++)
392 all_comps[i].push_back(dummy_comp);
394 for(i = 0; i < seq_num; i++)
395 seq_lens.push_back(the_Seqs[i].len());
397 for(i = 0; i < seq_num; i++)
398 for(i2 = i+1; i2 < seq_num; i2++)
400 cout << "seqcomping: " << i << " v. " << i2 << endl;
401 all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]);
402 all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false);
403 all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true);
412 vector<string> some_Seqs;
416 cout << "nway: ana mode = " << ana_mode << endl;
417 cout << "nway: seq_num = " << seq_num << endl;
419 the_paths.set_soft_thres(soft_thres);
422 the_paths.trans_path_search(all_comps);
424 else if (ana_mode == 'r')
425 the_paths.radiate_path_search(all_comps);
427 else if (ana_mode == 'e')
429 //unlike other methods, entropy needs to look at the sequence at this stage
431 for(i = 0; i < seq_num; i++)
432 some_Seqs.push_back(the_Seqs[i].seq());
434 the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
435 the_paths.entropy_path_search(all_comps);
438 // old recursive transitive analysis function
439 else if (ana_mode == 'o')
440 the_paths.find_paths_r(all_comps);
442 the_paths.simple_refine();
448 string save_name, save_path, create_dir_cmd, flp_filepath;
450 ostringstream append_info;
451 int i, i2, dir_create_status;
454 // not sure why, but gotta close file each time since can't pass file streams
456 save_path_base = ana_name;
458 // gotta do bit with adding win & thres if to be appended
462 append_info << "_w" << window;
463 save_name += append_info.str();
469 append_info << "_t" << threshold;
470 save_name += append_info.str();
473 //#include <stdlib.h>
474 // ******* use appropriate for os ------- 1 of 4
475 // the additions for osX make it more sane where it saves the analysis
476 // will come up with a cleaner sol'n later...
477 create_dir_cmd = "mkdir " + save_name; //linux
478 //create_dir_cmd = "mkdir " + file_path_base + save_name; //osX
480 dir_create_status = system( (const char*) create_dir_cmd.c_str());
481 cout << "action: " << dir_create_status << endl;
483 // save sequence and annots to a special mussa file
485 // ******** use appropriate for OS ---------- 2 of 4
486 save_path = save_name + "/" + save_name + ".museq"; //linux
487 //save_path = file_path_base + save_name + "/" + save_name + ".museq"; //osX
489 save_file.open(save_path.c_str(), ios::out);
490 save_file << "<Mussa_Sequence>" << endl;
493 for(i = 0; i < seq_num; i++)
494 the_Seqs[i].save(save_file);
496 //save_file.open(save_path.c_str(), ios::app);
497 save_file << "</Mussa_Sequence>" << endl;
500 // save nway paths to its mussa save file
502 // ******** use appropriate for OS -------- 3 of 4
503 save_path = save_name + "/" + save_name + ".muway"; //linux
504 //save_path = file_path_base + save_name + "/" + save_name + ".muway"; //os X
505 the_paths.save(save_path);
507 for(i = 0; i < seq_num; i++)
508 for(i2 = i+1; i2 < seq_num; i2++)
511 append_info << "_sp_" << i << "v" << i2;
512 // ******** use appropriate for OS --------- 4 of 4
514 save_path = save_name + "/" + save_name + append_info.str() + ".flp";
516 //save_path = file_path_base + save_name + "/" + save_name + append_info.str() + ".flp";
517 all_comps[i][i2].file_save(save_path);
522 Mussa::save_muway(string save_path)
524 the_paths.save(save_path);
528 Mussa::load(string ana_file)
530 int i, i2, new_index, dir_index;
531 string file_path_base, a_file_path, ana_path;
535 ostringstream append_info;
536 vector<FLPs> empty_FLP_vector;
545 new_index = (ana_path.substr(dir_index)).find("/");
546 //cout << "mu class: "<< new_index << endl;
548 dir_index += new_index + 1;
550 parsing_path = false;
553 ana_name = ana_path.substr(dir_index);
554 cout << "mu class: ana_name = " << ana_name << endl;
555 file_path_base = ana_path + "/" + ana_path.substr(dir_index);
556 a_file_path = file_path_base + ".muway";
557 err_msg = the_paths.load(a_file_path);
558 cout << "there is no safe distance\n";
562 seq_num = the_paths.seq_num();
566 a_file_path = file_path_base + ".museq";
568 // this is a bit of a hack due to C++ not acting like it should with files
569 for (i = 1; i <= seq_num; i++)
572 cout << "mussa_class: loading the fucking museq frag...\n";
573 tmp_seq.load_museq(a_file_path, i);
574 the_Seqs.push_back(tmp_seq);
577 empty_FLP_vector.clear();
578 for(i = 0; i < seq_num; i++)
580 all_comps.push_back(empty_FLP_vector);
581 for(i2 = 0; i2 < seq_num; i2++)
582 all_comps[i].push_back(dummy_comp);
585 for(i = 0; i < seq_num; i++)
586 for(i2 = i+1; i2 < seq_num; i2++)
589 append_info << "_sp_" << i << "v" << i2;
590 cout << append_info.str() << endl;
591 a_file_path = file_path_base + append_info.str() + ".flp";
592 all_comps[i][i2].file_load(a_file_path);
593 cout << "real size = " << all_comps[i][i2].all_matches.size() << endl;
618 save_file.open(ana_name.c_str(), ios::out);
620 for(i = 0; i < seq_num; i++)
621 save_file << the_Seqs[i].seq() << endl;
623 save_file << window << endl;
625 //note more complex eventually since ana_name may need to have
626 //window size, threshold and other stuff to modify it...
627 the_paths.save_old(ana_name);
632 Mussa::load_old(char * load_file_path, int s_num)
635 string file_data_line;
636 int i, space_split_i, comma_split_i;
637 vector<int> loaded_path;
638 string node_pair, node;
642 the_paths.setup(seq_num, 0, 0);
643 save_file.open(load_file_path, ios::in);
645 // currently loads old mussa format
648 for(i = 0; i < seq_num; i++)
650 getline(save_file, file_data_line);
651 a_seq.set_seq(file_data_line);
652 the_Seqs.push_back(a_seq);
656 getline(save_file, file_data_line);
657 window = atoi(file_data_line.c_str());
660 while (!save_file.eof())
663 getline(save_file, file_data_line);
664 if (file_data_line != "")
665 for(i = 0; i < seq_num; i++)
667 space_split_i = file_data_line.find(" ");
668 node_pair = file_data_line.substr(0,space_split_i);
669 //cout << "np= " << node_pair;
670 comma_split_i = node_pair.find(",");
671 node = node_pair.substr(comma_split_i+1);
672 //cout << "n= " << node << " ";
673 loaded_path.push_back(atoi (node.c_str()));
674 file_data_line = file_data_line.substr(space_split_i+1);
677 the_paths.add_path(loaded_path);
681 //the_paths.save("tmp.save");