1 // ----------------------------------------
2 // ---------- mussa_class.cc -----------
3 // ----------------------------------------
5 #include "mussa_class.hh"
12 // set all parameters to null state
24 fasta_indices.clear();
26 sub_seq_starts.clear();
30 // these 5 simple methods manually set the parameters for doing an analysis
31 // used so that the gui can take input from user and setup the analysis
32 // note - still need a set_append(bool, bool) method...
34 Mussa::set_name(string a_name)
40 Mussa::set_seq_num(int a_num)
46 Mussa::set_window(int a_window)
52 Mussa::set_threshold(int a_threshold)
54 threshold = a_threshold;
55 //soft_thres = a_threshold;
59 Mussa::set_soft_thres(int sft_thres)
61 soft_thres = sft_thres;
65 Mussa::set_ana_mode(char new_ana_mode)
67 ana_mode = new_ana_mode;
70 // takes a string and sets it as the next seq - no AGCTN checking atm
72 Mussa::add_a_seq(string a_seq)
77 the_Seqs.push_back(aSeq);
81 // sets info for just 1 seq at a time
83 Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end)
85 seq_files.push_back(seq_file);
86 fasta_indices.push_back(fa_i);
87 annot_files.push_back(annot_file);
88 sub_seq_starts.push_back(a_start);
89 sub_seq_ends.push_back(the_end);
93 Mussa::load_mupa_file(string para_file_path)
96 string file_data_line, file_path_base;
97 string param, value, annot_file;
98 int split_index, fasta_index;
99 int sub_seq_start, sub_seq_end;
100 bool seq_params, did_seq;
104 int new_index, dir_index;
110 para_file.open(para_file_path.c_str(), ios::in);
112 // if file was opened, read the parameter values
115 // need to find the path to the .mupa file
120 new_index = (para_file_path.substr(dir_index)).find("/");
121 //cout << "mu class: "<< new_index << endl;
123 dir_index += new_index + 1;
125 parsing_path = false;
128 file_path_base = para_file_path.substr(0,dir_index);
129 cout << "mu class: mupa base path = " << file_path_base << endl;
131 // setup loop by getting file's first line
132 getline(para_file,file_data_line);
133 split_index = file_data_line.find(" ");
134 param = file_data_line.substr(0,split_index);
135 value = file_data_line.substr(split_index+1);
137 while (!para_file.eof())
140 if (param == "ANA_NAME")
142 else if (param == "APPEND_WIN")
144 else if (param == "APPEND_THRES")
146 else if (param == "SEQUENCE_NUM")
147 seq_num = atoi(value.c_str());
148 else if (param == "WINDOW")
149 window = atoi(value.c_str());
150 else if (param == "THRESHOLD")
151 threshold = atoi(value.c_str());
152 else if (param == "SEQUENCE")
154 seq_files.push_back(file_path_base + value);
161 while ((!para_file.eof()) && seq_params)
163 getline(para_file,file_data_line);
164 split_index = file_data_line.find(" ");
165 param = file_data_line.substr(0,split_index);
166 value = file_data_line.substr(split_index+1);
168 if (param == "FASTA_INDEX")
169 fasta_index = atoi(value.c_str());
170 else if (param == "ANNOTATION")
171 annot_file = file_path_base + value;
172 else if (param == "SEQ_START")
173 sub_seq_start = atoi(value.c_str());
174 else if (param == "SEQ_END")
176 cout << "hey! " << atoi(value.c_str()) << endl;
177 sub_seq_end = atoi(value.c_str());
179 //ignore empty lines or that start with '#'
180 else if ((param == "") || (param == "#")) {}
181 else seq_params = false;
184 fasta_indices.push_back(fasta_index);
185 annot_files.push_back(annot_file);
186 sub_seq_starts.push_back(sub_seq_start);
187 sub_seq_ends.push_back(sub_seq_end);
190 //ignore empty lines or that start with '#'
191 else if ((param == "") || (param == "#")) {}
194 cout << "Illegal/misplaced mussa parameter in file\n";
195 cout << param << "\n";
200 getline(para_file,file_data_line);
201 split_index = file_data_line.find(" ");
202 param = file_data_line.substr(0,split_index);
203 value = file_data_line.substr(split_index+1);
210 soft_thres = threshold;
211 cout << "nway mupa: ana_name = " << ana_name << " seq_num = " << seq_num;
212 cout << " window = " << window << " threshold = " << threshold << endl;
217 // no file was loaded, signal error
220 err_msg = "Config File: " + para_file_path + " not found";
225 // if (!((param == "") || (param == "#")))
226 // cout << value << " = " << param << endl;
230 Mussa::analyze(int w, int t, char the_ana_mode, double new_ent_thres)
233 time_t t1, t2, begin, end;
234 double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
237 cout << "nway ana: seq_num = " << seq_num << endl;
241 // now loading parameters from file must be done separately
243 //err_msg = load_mupa_file(para_file_path);
245 //setuptime = difftime(t2, t1);
247 ana_mode = the_ana_mode;
248 ent_thres = new_ent_thres;
261 err_msg = get_Seqs();
263 seqloadtime = difftime(t2, t1);
272 seqcomptime = difftime(t2, t1);
277 cout << "nway ana: seq_num = " << seq_num << endl;
278 the_paths.setup(seq_num, window, threshold);
281 nwaytime = difftime(t2, t1);
288 savetime = difftime(t2, t1);
291 totaltime = difftime(end, begin);
294 cout << "seqload\tseqcomp\tnway\tsave\ttotal\n";
296 //cout << setuptime << "\t";
297 cout << seqloadtime << "\t";
298 cout << seqcomptime << "\t";
299 cout << nwaytime << "\t";
300 cout << savetime << "\t";
301 cout << totaltime << "\n";
314 list<string>::iterator seq_files_i, annot_files_i;
315 list<int>::iterator fasta_indices_i, seq_starts_i, seq_ends_i;
320 seq_files_i = seq_files.begin();
321 fasta_indices_i = fasta_indices.begin();
322 annot_files_i = annot_files.begin();
323 seq_starts_i = sub_seq_starts.begin();
324 seq_ends_i = sub_seq_ends.begin();
328 while ( (seq_files_i != seq_files.end()) && (err_msg == "") )
329 /* it should be guarenteed that each of the following exist
330 should I bother checking, and how to deal with if not true...
332 (fasta_indices_i != fasta_indices.end()) &&
333 (annot_files_i != annot_files.end()) &&
334 (seq_starts_i != sub_seq_starts.end()) &&
335 (seq_ends_i != sub_seq_ends.end()) )
338 err_msg = aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,
342 if (*annot_files_i != "")
343 err_msg = aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i);
347 the_Seqs.push_back(aSeq);
348 cout << aSeq.hdr() << endl;
349 //cout << aSeq.seq() << endl;
351 ++seq_files_i; // advance all the iterators
371 int i, i2; // loop vars over sequences to analyze
372 vector<int> seq_lens;
373 vector<FLPs> empty_FLP_vector;
375 string save_file_string;
377 empty_FLP_vector.clear();
378 for(i = 0; i < seq_num; i++)
380 all_comps.push_back(empty_FLP_vector);
381 for(i2 = 0; i2 < seq_num; i2++)
382 all_comps[i].push_back(dummy_comp);
384 for(i = 0; i < seq_num; i++)
385 seq_lens.push_back(the_Seqs[i].len());
387 for(i = 0; i < seq_num; i++)
388 for(i2 = i+1; i2 < seq_num; i2++)
390 cout << "seqcomping: " << i << " v. " << i2 << endl;
391 all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]);
392 all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false);
393 all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true);
402 vector<string> some_Seqs;
406 cout << "nway: ana mode = " << ana_mode << endl;
407 cout << "nway: seq_num = " << seq_num << endl;
409 the_paths.set_soft_thres(soft_thres);
412 the_paths.trans_path_search(all_comps);
414 else if (ana_mode == 'r')
415 the_paths.radiate_path_search(all_comps);
417 else if (ana_mode == 'e')
419 //unlike other methods, entropy needs to look at the sequence at this stage
421 for(i = 0; i < seq_num; i++)
422 some_Seqs.push_back(the_Seqs[i].seq());
424 the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
425 the_paths.entropy_path_search(all_comps);
428 // old recursive transitive analysis function
429 else if (ana_mode == 'o')
430 the_paths.find_paths_r(all_comps);
432 the_paths.simple_refine();
438 string save_path_base, save_path, create_dir_cmd, flp_filepath;
440 ostringstream append_info;
441 int i, i2, dir_create_status;
444 // not sure why, but gotta close file each time since can't pass file streams
446 save_path_base = ana_name;
448 // gotta do bit with adding win & thres if to be appended
452 append_info << "_w" << window;
453 save_path_base += append_info.str();
459 append_info << "_t" << threshold;
460 save_path_base += append_info.str();
463 //#include <stdlib.h>
464 create_dir_cmd = "mkdir " + save_path_base;
465 dir_create_status = system( (const char*) create_dir_cmd.c_str());
466 cout << "action: " << dir_create_status << endl;
468 // save sequence and annots to a special mussa file
469 save_path = save_path_base + "/" + save_path_base + ".museq";
470 save_file.open(save_path.c_str(), ios::out);
471 save_file << "<Mussa_Sequence>" << endl;
474 for(i = 0; i < seq_num; i++)
475 the_Seqs[i].save(save_file);
477 //save_file.open(save_path.c_str(), ios::app);
478 save_file << "</Mussa_Sequence>" << endl;
481 // save nway paths to its mussa save file
482 save_path = save_path_base + "/" + save_path_base + ".muway";
483 the_paths.save(save_path);
485 for(i = 0; i < seq_num; i++)
486 for(i2 = i+1; i2 < seq_num; i2++)
489 append_info << "_sp_" << i << "v" << i2;
490 save_path = save_path_base + "/" + save_path_base + append_info.str() + ".flp";
491 all_comps[i][i2].file_save(save_path);
496 Mussa::save_muway(string save_path)
498 the_paths.save(save_path);
502 Mussa::load(string ana_file)
504 int i, i2, new_index, dir_index;
505 string file_path_base, a_file_path, ana_path;
509 ostringstream append_info;
510 vector<FLPs> empty_FLP_vector;
519 new_index = (ana_path.substr(dir_index)).find("/");
520 //cout << "mu class: "<< new_index << endl;
522 dir_index += new_index + 1;
524 parsing_path = false;
527 ana_name = ana_path.substr(dir_index);
528 cout << "mu class: ana_name = " << ana_name << endl;
529 file_path_base = ana_path + "/" + ana_path.substr(dir_index);
530 a_file_path = file_path_base + ".muway";
531 err_msg = the_paths.load(a_file_path);
532 cout << "there is no safe distance\n";
536 seq_num = the_paths.seq_num();
540 a_file_path = file_path_base + ".museq";
542 // this is a bit of a hack due to C++ not acting like it should with files
543 for (i = 1; i <= seq_num; i++)
546 cout << "mussa_class: loading the fucking museq frag...\n";
547 tmp_seq.load_museq(a_file_path, i);
548 the_Seqs.push_back(tmp_seq);
551 empty_FLP_vector.clear();
552 for(i = 0; i < seq_num; i++)
554 all_comps.push_back(empty_FLP_vector);
555 for(i2 = 0; i2 < seq_num; i2++)
556 all_comps[i].push_back(dummy_comp);
559 for(i = 0; i < seq_num; i++)
560 for(i2 = i+1; i2 < seq_num; i2++)
563 append_info << "_sp_" << i << "v" << i2;
564 cout << append_info.str() << endl;
565 a_file_path = file_path_base + append_info.str() + ".flp";
566 all_comps[i][i2].file_load(a_file_path);
567 cout << "real size = " << all_comps[i][i2].all_matches.size() << endl;
592 save_file.open(ana_name.c_str(), ios::out);
594 for(i = 0; i < seq_num; i++)
595 save_file << the_Seqs[i].seq() << endl;
597 save_file << window << endl;
599 //note more complex eventually since ana_name may need to have
600 //window size, threshold and other stuff to modify it...
601 the_paths.save_old(ana_name);
606 Mussa::load_old(char * load_file_path, int s_num)
609 string file_data_line;
610 int i, space_split_i, comma_split_i;
611 vector<int> loaded_path;
612 string node_pair, node;
616 the_paths.setup(seq_num, 0, 0);
617 save_file.open(load_file_path, ios::in);
619 // currently loads old mussa format
622 for(i = 0; i < seq_num; i++)
624 getline(save_file, file_data_line);
625 a_seq.set_seq(file_data_line);
626 the_Seqs.push_back(a_seq);
630 getline(save_file, file_data_line);
631 window = atoi(file_data_line.c_str());
634 while (!save_file.eof())
637 getline(save_file, file_data_line);
638 if (file_data_line != "")
639 for(i = 0; i < seq_num; i++)
641 space_split_i = file_data_line.find(" ");
642 node_pair = file_data_line.substr(0,space_split_i);
643 //cout << "np= " << node_pair;
644 comma_split_i = node_pair.find(",");
645 node = node_pair.substr(comma_split_i+1);
646 //cout << "n= " << node << " ";
647 loaded_path.push_back(atoi (node.c_str()));
648 file_data_line = file_data_line.substr(space_split_i+1);
651 the_paths.add_path(loaded_path);
655 //the_paths.save("tmp.save");