+// This file is part of the Mussa source distribution.
+// http://mussa.caltech.edu/
+// Contact author: Tristan De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA. It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
// ----------------------------------------
// ---------- mussa_class.cc -----------
// ----------------------------------------
#include "mussa_class.hh"
+#include <sstream>
-// doesn't do neg ints...
-string
-int_to_str(int an_int)
+Mussa::Mussa()
{
- string converted_int;
- int remainder;
+}
- converted_int = "";
+// set all parameters to null state
+void
+Mussa::clear()
+{
+ ana_name = "";
+ seq_num = 0;
+ window = 0;
+ threshold = 0;
+ soft_thres = 0;
+ win_append = false;
+ thres_append = false;
+ seq_files.clear();
+ fasta_indices.clear();
+ annot_files.clear();
+ sub_seq_starts.clear();
+ sub_seq_ends.clear();
+}
- if (an_int == 0)
- converted_int = "0";
+// these 5 simple methods manually set the parameters for doing an analysis
+// used so that the gui can take input from user and setup the analysis
+// note - still need a set_append(bool, bool) method...
+void
+Mussa::set_name(string a_name)
+{
+ ana_name = a_name;
+}
- while (an_int != 0)
- {
- remainder = an_int % 10;
-
- if (remainder == 0)
- converted_int = "0" + converted_int;
- else if (remainder == 1)
- converted_int = "1" + converted_int;
- else if (remainder == 2)
- converted_int = "2" + converted_int;
- else if (remainder == 3)
- converted_int = "3" + converted_int;
- else if (remainder == 4)
- converted_int = "4" + converted_int;
- else if (remainder == 5)
- converted_int = "5" + converted_int;
- else if (remainder == 6)
- converted_int = "6" + converted_int;
- else if (remainder == 7)
- converted_int = "7" + converted_int;
- else if (remainder == 8)
- converted_int = "8" + converted_int;
- else if (remainder == 9)
- converted_int = "9" + converted_int;
-
- an_int = an_int / 10;
- }
+void
+Mussa::set_seq_num(int a_num)
+{
+ seq_num = a_num;
+}
- return converted_int;
+void
+Mussa::set_window(int a_window)
+{
+ window = a_window;
}
+void
+Mussa::set_threshold(int a_threshold)
+{
+ threshold = a_threshold;
+ //soft_thres = a_threshold;
+}
-Mussa::Mussa()
+void
+Mussa::set_soft_thres(int sft_thres)
+{
+ soft_thres = sft_thres;
+}
+
+void
+Mussa::set_ana_mode(char new_ana_mode)
{
+ ana_mode = new_ana_mode;
}
+// takes a string and sets it as the next seq - no AGCTN checking atm
+void
+Mussa::add_a_seq(string a_seq)
+{
+ Sequence aSeq;
+ aSeq.set_seq(a_seq);
+ the_Seqs.push_back(aSeq);
+}
+
+
+// sets info for just 1 seq at a time
void
-Mussa::load_parameters(string para_file_path)
+Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end)
+{
+ seq_files.push_back(seq_file);
+ fasta_indices.push_back(fa_i);
+ annot_files.push_back(annot_file);
+ sub_seq_starts.push_back(a_start);
+ sub_seq_ends.push_back(the_end);
+}
+
+string
+Mussa::load_mupa_file(string para_file_path)
{
ifstream para_file;
string file_data_line;
int split_index, fasta_index;
int sub_seq_start, sub_seq_end;
bool seq_params, did_seq;
+ string err_msg;
int bogo;
+ bool parsing_path;
+ int new_index, dir_index;
- win_append = false;
- thres_append = false;
- seq_files.clear();
- fasta_indices.clear();
- annot_files.clear();
- sub_seq_starts.clear();
- sub_seq_ends.clear();
+ // initialize values
+ clear();
para_file.open(para_file_path.c_str(), ios::in);
- getline(para_file,file_data_line);
- split_index = file_data_line.find(" ");
- param = file_data_line.substr(0,split_index);
- value = file_data_line.substr(split_index+1);
-
-
- while (!para_file.eof())
+ // if file was opened, read the parameter values
+ if (para_file)
{
- did_seq = false;
- if (param == "ANA_NAME")
- ana_name = value;
- else if (param == "APPEND_WIN")
- win_append = true;
- else if (param == "APPEND_THRES")
- thres_append = true;
- else if (param == "SEQUENCE_NUM")
- seq_num = atoi(value.c_str());
- else if (param == "WINDOW")
- window = atoi(value.c_str());
- else if (param == "THRESHOLD")
- threshold = atoi(value.c_str());
- else if (param == "SEQUENCE")
+ // need to find the path to the .mupa file
+ parsing_path = true;
+ dir_index = 0;
+ while (parsing_path)
+ {
+ new_index = (para_file_path.substr(dir_index)).find("/");
+ //cout << "mu class: "<< new_index << endl;
+ if (new_index > 0)
+ dir_index += new_index + 1;
+ else
+ parsing_path = false;
+ }
+
+ file_path_base = para_file_path.substr(0,dir_index);
+ cout << "mu class: mupa base path = " << file_path_base << endl;
+
+ // setup loop by getting file's first line
+ getline(para_file,file_data_line);
+ split_index = file_data_line.find(" ");
+ param = file_data_line.substr(0,split_index);
+ value = file_data_line.substr(split_index+1);
+
+ while (!para_file.eof())
{
- seq_files.push_back(value);
- fasta_index = 1;
- annot_file = "";
- sub_seq_start = 0;
- sub_seq_end = 0;
- seq_params = true;
-
- while ((!para_file.eof()) && seq_params)
+ did_seq = false;
+ if (param == "ANA_NAME")
+ ana_name = value;
+ else if (param == "APPEND_WIN")
+ win_append = true;
+ else if (param == "APPEND_THRES")
+ thres_append = true;
+ else if (param == "SEQUENCE_NUM")
+ seq_num = atoi(value.c_str());
+ else if (param == "WINDOW")
+ window = atoi(value.c_str());
+ else if (param == "THRESHOLD")
+ threshold = atoi(value.c_str());
+ else if (param == "SEQUENCE")
+ {
+ seq_files.push_back(file_path_base + value);
+ fasta_index = 1;
+ annot_file = "";
+ sub_seq_start = 0;
+ sub_seq_end = 0;
+ seq_params = true;
+
+ while ((!para_file.eof()) && seq_params)
+ {
+ getline(para_file,file_data_line);
+ split_index = file_data_line.find(" ");
+ param = file_data_line.substr(0,split_index);
+ value = file_data_line.substr(split_index+1);
+
+ if (param == "FASTA_INDEX")
+ fasta_index = atoi(value.c_str());
+ else if (param == "ANNOTATION")
+ annot_file = file_path_base + value;
+ else if (param == "SEQ_START")
+ sub_seq_start = atoi(value.c_str());
+ else if (param == "SEQ_END")
+ {
+ cout << "hey! " << atoi(value.c_str()) << endl;
+ sub_seq_end = atoi(value.c_str());
+ }
+ //ignore empty lines or that start with '#'
+ else if ((param == "") || (param == "#")) {}
+ else seq_params = false;
+ }
+
+ fasta_indices.push_back(fasta_index);
+ annot_files.push_back(annot_file);
+ sub_seq_starts.push_back(sub_seq_start);
+ sub_seq_ends.push_back(sub_seq_end);
+ did_seq = true;
+ }
+ //ignore empty lines or that start with '#'
+ else if ((param == "") || (param == "#")) {}
+ else
+ {
+ cout << "Illegal/misplaced mussa parameter in file\n";
+ cout << param << "\n";
+ }
+
+ if (!did_seq)
{
getline(para_file,file_data_line);
split_index = file_data_line.find(" ");
param = file_data_line.substr(0,split_index);
value = file_data_line.substr(split_index+1);
-
- if (param == "FASTA_INDEX")
- fasta_index = atoi(value.c_str());
- else if (param == "ANNOTATION")
- annot_file = value;
- else if (param == "SEQ_START")
- sub_seq_start = atoi(value.c_str());
- else if (param == "SEQ_END")
- {
- cout << "hey! " << atoi(value.c_str()) << endl;
- sub_seq_end = atoi(value.c_str());
- }
- //ignore empty lines or that start with '#'
- else if ((param == "") || (param == "#")) {}
- else seq_params = false;
+ did_seq = false;
}
+ }
+ para_file.close();
- fasta_indices.push_back(fasta_index);
- annot_files.push_back(annot_file);
- sub_seq_starts.push_back(sub_seq_start);
- sub_seq_ends.push_back(sub_seq_end);
- did_seq = true;
-
- }
- //ignore empty lines or that start with '#'
- else if ((param == "") || (param == "#")) {}
- else
- {
- cout << "Illegal/misplaced mussa parameter in file\n";
- cout << param << "\n";
- }
+ soft_thres = threshold;
+ cout << "nway mupa: ana_name = " << ana_name << " seq_num = " << seq_num;
+ cout << " window = " << window << " threshold = " << threshold << endl;
- if (!did_seq)
- {
- getline(para_file,file_data_line);
- split_index = file_data_line.find(" ");
- param = file_data_line.substr(0,split_index);
- value = file_data_line.substr(split_index+1);
- did_seq = false;
- }
+ return "";
}
- para_file.close();
-
- cout << "ana_name = " << ana_name << win_append << win_append << "\n";
- cout << "window = " << window << " threshold = " << threshold << "\n";
+ // no file was loaded, signal error
+ else
+ {
+ err_msg = "Config File: " + para_file_path + " not found";
+ return err_msg;
+ }
}
// if (!((param == "") || (param == "#")))
// cout << value << " = " << param << endl;
-void
-Mussa::analyze(string para_file_path, int w, int t)
+
+
+string
+Mussa::analyze(int w, int t, char the_ana_mode, double new_ent_thres)
{
+ string err_msg;
time_t t1, t2, begin, end;
double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
- int seq_num;
-
- if (w > 0)
- window = w;
- if (t > 0)
- threshold = t;
- begin = time(NULL);
- t1 = time(NULL);
- load_parameters(para_file_path);
- t2 = time(NULL);
- setuptime = difftime(t2, t1);
+ cout << "nway ana: seq_num = " << seq_num << endl;
+ begin = time(NULL);
- cout << "fee\n";
- t1 = time(NULL);
- get_Seqs();
- t2 = time(NULL);
- seqloadtime = difftime(t2, t1);
+ // now loading parameters from file must be done separately
+ //t1 = time(NULL);
+ //err_msg = load_mupa_file(para_file_path);
+ //t2 = time(NULL);
+ //setuptime = difftime(t2, t1);
+ ana_mode = the_ana_mode;
+ ent_thres = new_ent_thres;
+ if (w > 0)
+ window = w;
+ if (t > 0)
+ {
+ threshold = t;
+ soft_thres = t;
+ }
- cout << "fie\n";
- t1 = time(NULL);
- seqcomp();
- t2 = time(NULL);
- seqcomptime = difftime(t2, t1);
+ //if (err_msg == "")
+ //{
+ cout << "fee\n";
+ t1 = time(NULL);
+ err_msg = get_Seqs();
+ t2 = time(NULL);
+ seqloadtime = difftime(t2, t1);
- cout << "foe\n";
- t1 = time(NULL);
- nway();
- t2 = time(NULL);
- nwaytime = difftime(t2, t1);
+ if (err_msg == "")
+ {
+ cout << "fie\n";
+ t1 = time(NULL);
+ seqcomp();
+ t2 = time(NULL);
+ seqcomptime = difftime(t2, t1);
- cout << "fum\n";
- t1 = time(NULL);
- save();
- t2 = time(NULL);
- savetime = difftime(t2, t1);
+ cout << "foe\n";
+ t1 = time(NULL);
+ cout << "nway ana: seq_num = " << seq_num << endl;
+ the_paths.setup(seq_num, window, threshold);
+ nway();
+ t2 = time(NULL);
+ nwaytime = difftime(t2, t1);
- end = time(NULL);
- totaltime = difftime(end, begin);
- cout << "setup\tseqload\tseqcomp\tnway\tsave\ttotal\n";
- cout << setuptime << "\t";
- cout << seqloadtime << "\t";
- cout << seqcomptime << "\t";
- cout << nwaytime << "\t";
- cout << savetime << "\t";
- cout << totaltime << "\n";
+ cout << "fum\n";
+ t1 = time(NULL);
+ save();
+ t2 = time(NULL);
+ savetime = difftime(t2, t1);
+
+ end = time(NULL);
+ totaltime = difftime(end, begin);
+
+
+ cout << "seqload\tseqcomp\tnway\tsave\ttotal\n";
+ //setup\t
+ //cout << setuptime << "\t";
+ cout << seqloadtime << "\t";
+ cout << seqcomptime << "\t";
+ cout << nwaytime << "\t";
+ cout << savetime << "\t";
+ cout << totaltime << "\n";
+ }
+ else
+ return err_msg;
+ //}
+ //else
+ //return err_msg;
}
-void
+string
Mussa::get_Seqs()
{
list<string>::iterator seq_files_i, annot_files_i;
list<int>::iterator fasta_indices_i, seq_starts_i, seq_ends_i;
Sequence aSeq;
+ string err_msg;
+
seq_files_i = seq_files.begin();
fasta_indices_i = fasta_indices.begin();
seq_starts_i = sub_seq_starts.begin();
seq_ends_i = sub_seq_ends.begin();
+ err_msg = "";
- while (seq_files_i != seq_files.end())
+ while ( (seq_files_i != seq_files.end()) && (err_msg == "") )
/* it should be guarenteed that each of the following exist
+ should I bother checking, and how to deal with if not true...
&&
(fasta_indices_i != fasta_indices.end()) &&
(annot_files_i != annot_files.end()) &&
(seq_ends_i != sub_seq_ends.end()) )
*/
{
- aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,*seq_ends_i);
- if (*annot_files_i != "")
- aSeq.load_annot(*annot_files_i);
- the_Seqs.push_back(aSeq);
- cout << aSeq.hdr() << endl;
- //cout << aSeq.seq() << endl;
- aSeq.clear();
- ++seq_files_i;
- ++fasta_indices_i;
- ++annot_files_i;
- ++seq_starts_i;
- ++seq_ends_i;
+ err_msg = aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,
+ *seq_ends_i);
+ if (err_msg == "")
+ {
+ if (*annot_files_i != "")
+ err_msg = aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i);
+
+ if (err_msg == "")
+ {
+ the_Seqs.push_back(aSeq);
+ cout << aSeq.hdr() << endl;
+ //cout << aSeq.seq() << endl;
+ aSeq.clear();
+ ++seq_files_i; // advance all the iterators
+ ++fasta_indices_i;
+ ++annot_files_i;
+ ++seq_starts_i;
+ ++seq_ends_i;
+ }
+ else
+ break;
+ }
+ else
+ break;
}
+
+ return err_msg;
}
-/*
- aSeq.seq();
- cout << "\n";
- aSeq.hdr();
- cout << "\n";
-*/
void
Mussa::seqcomp()
for(i2 = 0; i2 < seq_num; i2++)
all_comps[i].push_back(dummy_comp);
}
-
for(i = 0; i < seq_num; i++)
seq_lens.push_back(the_Seqs[i].len());
for(i = 0; i < seq_num; i++)
for(i2 = i+1; i2 < seq_num; i2++)
{
+ cout << "seqcomping: " << i << " v. " << i2 << endl;
all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]);
all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false);
all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true);
-
- all_comps[i][i2].file_save(save_file_string);
}
}
void
Mussa::nway()
{
- the_paths.setup(seq_num, window, threshold);
- the_paths.find_paths_r(all_comps);
+ vector<string> some_Seqs;
+ int i;
+
+
+ cout << "nway: ana mode = " << ana_mode << endl;
+ cout << "nway: seq_num = " << seq_num << endl;
+
+ the_paths.set_soft_thres(soft_thres);
+
+ if (ana_mode == 't')
+ the_paths.trans_path_search(all_comps);
+
+ else if (ana_mode == 'r')
+ the_paths.radiate_path_search(all_comps);
+
+ else if (ana_mode == 'e')
+ {
+ //unlike other methods, entropy needs to look at the sequence at this stage
+ some_Seqs.clear();
+ for(i = 0; i < seq_num; i++)
+ some_Seqs.push_back(the_Seqs[i].seq());
+
+ the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
+ the_paths.entropy_path_search(all_comps);
+ }
+
+ // old recursive transitive analysis function
+ else if (ana_mode == 'o')
+ the_paths.find_paths_r(all_comps);
+
the_paths.simple_refine();
}
void
Mussa::save()
{
- string save_path_base, save_path;
+ string save_name, save_path, create_dir_cmd, flp_filepath;
fstream save_file;
- int i;
+ ostringstream append_info;
+ int i, i2, dir_create_status;
- // gotta do bit with adding win & thres if to be appended - need itos
// not sure why, but gotta close file each time since can't pass file streams
save_path_base = ana_name;
+ // gotta do bit with adding win & thres if to be appended
if (win_append)
- save_path_base += "_w" + int_to_str(window);
+ {
+ append_info.str("");
+ append_info << "_w" << window;
+ save_name += append_info.str();
+ }
if (thres_append)
- save_path_base += "_t" + int_to_str(threshold);
+ {
+ append_info.str("");
+ append_info << "_t" << threshold;
+ save_name += append_info.str();
+ }
+
+//#include <stdlib.h>
+ // ******* use appropriate for os ------- 1 of 4
+ // the additions for osX make it more sane where it saves the analysis
+ // will come up with a cleaner sol'n later...
+ create_dir_cmd = "mkdir " + save_name; //linux
+ //create_dir_cmd = "mkdir " + file_path_base + save_name; //osX
+
+ dir_create_status = system( (const char*) create_dir_cmd.c_str());
+ cout << "action: " << dir_create_status << endl;
// save sequence and annots to a special mussa file
- save_path = save_path_base + ".museq";
+
+ // ******** use appropriate for OS ---------- 2 of 4
+ save_path = save_name + "/" + save_name + ".museq"; //linux
+ //save_path = file_path_base + save_name + "/" + save_name + ".museq"; //osX
+
save_file.open(save_path.c_str(), ios::out);
save_file << "<Mussa_Sequence>" << endl;
//save_file.close();
save_file.close();
// save nway paths to its mussa save file
- save_path = save_path_base + ".muway";
+
+ // ******** use appropriate for OS -------- 3 of 4
+ save_path = save_name + "/" + save_name + ".muway"; //linux
+ //save_path = file_path_base + save_name + "/" + save_name + ".muway"; //os X
the_paths.save(save_path);
+
+ for(i = 0; i < seq_num; i++)
+ for(i2 = i+1; i2 < seq_num; i2++)
+ {
+ append_info.str("");
+ append_info << "_sp_" << i << "v" << i2;
+ // ******** use appropriate for OS --------- 4 of 4
+ //linux
+ save_path = save_name + "/" + save_name + append_info.str() + ".flp";
+ //osX
+ //save_path = file_path_base + save_name + "/" + save_name + append_info.str() + ".flp";
+ all_comps[i][i2].file_save(save_path);
+ }
}
void
+Mussa::save_muway(string save_path)
+{
+ the_paths.save(save_path);
+}
+
+string
Mussa::load(string ana_file)
{
- int i;
- string load_file_path;
+ int i, i2, new_index, dir_index;
+ string file_path_base, a_file_path, ana_path;
+ bool parsing_path;
Sequence tmp_seq;
+ string err_msg;
+ ostringstream append_info;
+ vector<FLPs> empty_FLP_vector;
+ FLPs dummy_comp;
+
+ ana_path = ana_file;
+ parsing_path = true;
+ dir_index = 0;
+ while (parsing_path)
+ {
+ new_index = (ana_path.substr(dir_index)).find("/");
+ //cout << "mu class: "<< new_index << endl;
+ if (new_index > 0)
+ dir_index += new_index + 1;
+ else
+ parsing_path = false;
+ }
- ana_name = ana_file;
- load_file_path = ana_name + ".muway";
- seq_num = the_paths.load(load_file_path);
+ ana_name = ana_path.substr(dir_index);
+ cout << "mu class: ana_name = " << ana_name << endl;
+ file_path_base = ana_path + "/" + ana_path.substr(dir_index);
+ a_file_path = file_path_base + ".muway";
+ err_msg = the_paths.load(a_file_path);
+ cout << "there is no safe distance\n";
- load_file_path = ana_name + ".museq";
- for (i = 1; i <= seq_num; i++)
+ if (err_msg == "")
{
- tmp_seq.clear();
- tmp_seq.load_museq(load_file_path, i);
- the_Seqs.push_back(tmp_seq);
+ seq_num = the_paths.seq_num();
+
+ cout << "No BAM\n";
+
+ a_file_path = file_path_base + ".museq";
+
+ // this is a bit of a hack due to C++ not acting like it should with files
+ for (i = 1; i <= seq_num; i++)
+ {
+ tmp_seq.clear();
+ cout << "mussa_class: loading the fucking museq frag...\n";
+ tmp_seq.load_museq(a_file_path, i);
+ the_Seqs.push_back(tmp_seq);
+ }
+
+ empty_FLP_vector.clear();
+ for(i = 0; i < seq_num; i++)
+ {
+ all_comps.push_back(empty_FLP_vector);
+ for(i2 = 0; i2 < seq_num; i2++)
+ all_comps[i].push_back(dummy_comp);
+ }
+
+ for(i = 0; i < seq_num; i++)
+ for(i2 = i+1; i2 < seq_num; i2++)
+ {
+ append_info.str("");
+ append_info << "_sp_" << i << "v" << i2;
+ cout << append_info.str() << endl;
+ a_file_path = file_path_base + append_info.str() + ".flp";
+ all_comps[i][i2].file_load(a_file_path);
+ cout << "real size = " << all_comps[i][i2].all_matches.size() << endl;
+ }
+
+
+ return "";
}
+ else
+ return err_msg;
}
/*