X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=alg%2Fmussa.cpp;h=9a5935e19947cbd68e28e7c7a3f88b267816030f;hb=6d25d4d945af696134bdf788b111f38b197b1a15;hp=58558ff6e14b498c3679c7bcc4dfea41fc5223dc;hpb=960e80998b060fd3c0f0d0cf6dce4e6fc219f16f;p=mussa.git diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 58558ff..9a5935e 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -21,9 +21,11 @@ namespace fs = boost::filesystem; #include #include "mussa_exceptions.hpp" -#include "alg/flp.hpp" -#include "alg/mussa.hpp" -#include "alg/motif_parser.hpp" + +#include "flp.hpp" +#include "io.hpp" +#include "mussa.hpp" +#include "motif_parser.hpp" using namespace std; @@ -32,8 +34,8 @@ Mussa::Mussa() : color_mapper(new AnnotationColors) { clear(); - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); } Mussa::Mussa(const Mussa& m) @@ -49,8 +51,14 @@ Mussa::Mussa(const Mussa& m) analysis_path(m.analysis_path), dirty(m.dirty) { - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); +} + +MussaRef Mussa::init() +{ + boost::shared_ptr m(new Mussa()); + return m; } boost::filesystem::path Mussa::get_analysis_path() const @@ -322,7 +330,7 @@ void Mussa::append_sequence(boost::shared_ptr a_seq) } -const vector >& +const vector& Mussa::sequences() const { return the_seqs; @@ -344,10 +352,35 @@ void Mussa::load_sequence(fs::path seq_file, fs::path annot_file, set_dirty(true); } +void Mussa::load_mupa_file(std::string para_file_path) { + load_mupa_file(boost::filesystem::path(para_file_path)); +} + void Mussa::load_mupa_file(fs::path para_file_path) { - fs::ifstream para_file; + if (not fs::exists(para_file_path)) + { + throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); + } else if (fs::is_directory(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is a directory."); + } else if (fs::is_empty(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is empty"); + } else { + // what directory is the mupa file in? + fs::path file_path_base( para_file_path.branch_path()) ; + + fs::ifstream para_file; + para_file.open(para_file_path, ios::in); + + load_mupa_stream(para_file, file_path_base); + para_file.close(); + } +} + +void +Mussa::load_mupa_stream(std::istream& para_file, fs::path& file_path_base) +{ string file_data_line; string param, value; fs::path annot_file; @@ -361,101 +394,84 @@ Mussa::load_mupa_file(fs::path para_file_path) // initialize values clear(); - // if file was opened, read the parameter values - if (not fs::exists(para_file_path)) + // setup loop by getting file's first line + getline(para_file, file_data_line); + split_index = file_data_line.find(" "); + param = file_data_line.substr(0,split_index); + value = file_data_line.substr(split_index+1); + + while (para_file) { - throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); - } else if (fs::is_directory(para_file_path)) { - throw mussa_load_error("Config File: " + para_file_path.string() + " is a directory."); - } else if (fs::is_empty(para_file_path)) { - throw mussa_load_error("Config File: " + para_file_path.string() + " is empty"); - } else { - para_file.open(para_file_path, ios::in); - - // what directory is the mupa file in? - fs::path file_path_base = para_file_path.branch_path(); - - // setup loop by getting file's first line - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - while (para_file) + did_seq = false; + if (param == "ANA_NAME") + analysis_name = value; + else if (param == "APPEND_WIN") + win_append = true; + else if (param == "APPEND_THRES") + thres_append = true; + else if (param == "SEQUENCE_NUM") + ; // ignore sequence_num now + else if (param == "WINDOW") + window = atoi(value.c_str()); + else if (param == "THRESHOLD") + threshold = atoi(value.c_str()); + else if (param == "SEQUENCE") { - did_seq = false; - if (param == "ANA_NAME") - analysis_name = value; - else if (param == "APPEND_WIN") - win_append = true; - else if (param == "APPEND_THRES") - thres_append = true; - else if (param == "SEQUENCE_NUM") - ; // ignore sequence_num now - else if (param == "WINDOW") - window = atoi(value.c_str()); - else if (param == "THRESHOLD") - threshold = atoi(value.c_str()); - else if (param == "SEQUENCE") + fs::path seq_file = file_path_base / value; + //cout << "seq_file_name " << seq_files.back() << endl; + fasta_index = 1; + annot_file = ""; + sub_seq_start = 0; + sub_seq_end = 0; + seq_params = true; + + while (para_file && seq_params) { - fs::path seq_file = file_path_base / value; - //cout << "seq_file_name " << seq_files.back() << endl; - fasta_index = 1; - annot_file = ""; - sub_seq_start = 0; - sub_seq_end = 0; - seq_params = true; - - while (para_file && seq_params) - { - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - if (param == "FASTA_INDEX") - fasta_index = atoi(value.c_str()); - else if (param == "ANNOTATION") - annot_file = file_path_base / value; - else if (param == "SEQ_START") - sub_seq_start = atoi(value.c_str()); - else if (param == "SEQ_END") - { - sub_seq_end = atoi(value.c_str()); - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else seq_params = false; - } - load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, - sub_seq_end); - did_seq = true; - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else - { - clog << "Illegal/misplaced mussa parameter in file\n"; - clog << param << "\n"; - } - - if (!did_seq) - { - getline(para_file,file_data_line); + multiplatform_getline(para_file,file_data_line); split_index = file_data_line.find(" "); param = file_data_line.substr(0,split_index); value = file_data_line.substr(split_index+1); - did_seq = false; + + if (param == "FASTA_INDEX") + fasta_index = atoi(value.c_str()); + else if (param == "ANNOTATION") + annot_file = file_path_base / value; + else if (param == "SEQ_START") + sub_seq_start = atoi(value.c_str()); + else if (param == "SEQ_END") + { + sub_seq_end = atoi(value.c_str()); + } + //ignore empty lines or that start with '#' + else if ((param == "") || (param == "#")) { + // pass + } else { + seq_params = false; + } } + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); + did_seq = true; + } + //ignore empty lines or that start with '#' + else if ((param == "") || (param == "#")) {} + else + { + clog << "Illegal/misplaced mussa parameter in file\n"; + clog << param << "\n"; } - para_file.close(); - - soft_thres = threshold; - //cout << "nway mupa: analysis_name = " << analysis_name - // << " window = " << window - // << " threshold = " << threshold << endl; + if (!did_seq) + { + multiplatform_getline(para_file,file_data_line); + split_index = file_data_line.find(" "); + param = file_data_line.substr(0,split_index); + value = file_data_line.substr(split_index+1); + did_seq = false; + } } + + soft_thres = threshold; // no file was loaded, signal error set_dirty(true); } @@ -633,10 +649,14 @@ Mussa::load(fs::path ana_file) vector empty_FLP_vector; FLPs dummy_comp; + + //-------------------------------------------------------- + // Load Muway + //-------------------------------------------------------- analysis_path = ana_file; analysis_name = ana_path.leaf(); - file_path_base = ana_path.branch_path() / analysis_name; - a_file_path = file_path_base / (analysis_name + ".muway"); + fs::path muway(analysis_name+".muway", fs::native); + a_file_path = analysis_path / muway; the_paths.load(a_file_path); // perhaps this could be more elegent, but at least this'll let // us know what our threshold and window sizes were when we load a muway @@ -644,22 +664,63 @@ Mussa::load(fs::path ana_file) threshold = the_paths.get_threshold(); soft_thres = threshold; - int seq_num = the_paths.sequence_count(); - a_file_path = file_path_base / (analysis_name + ".museq"); + //-------------------------------------------------------- + // Load Sequence + //-------------------------------------------------------- + //int seq_num = the_paths.sequence_count(); + + fs::path museq(analysis_name + ".museq", fs::native); + a_file_path = analysis_path / museq; // this is a bit of a hack due to C++ not acting like it should with files - for (i = 1; i <= seq_num; i++) + /*for (i = 1; i <= seq_num; i++) { boost::shared_ptr tmp_seq(new Sequence); tmp_seq->load_museq(a_file_path, i); the_seqs.push_back(tmp_seq); + }*/ + + i = 1; + //int seq_num = 0; + boost::filesystem::fstream load_museq_fs; + load_museq_fs.open(a_file_path, std::ios::in); + boost::shared_ptr tmp_seq; + while (1) + { + tmp_seq = Sequence::load_museq(load_museq_fs); + + if (tmp_seq) + { + the_seqs.push_back(tmp_seq); + } + else + { + break; + } + + + //safe guard in case of an infinate loop. + //FIXME: If mussa can handle a comparison of 10000 sequences + // in the future, then this code should be fixed. + if (i == 10000) + { + throw mussa_load_error(" Run away sequence load!"); + } + i++; } + load_museq_fs.close(); - fs::path motif_file = file_path_base / (analysis_name + ".mtl"); + //-------------------------------------------------------- + // Load Motifs + //-------------------------------------------------------- + fs::path mtl(analysis_name + ".mtl", fs::native); + fs::path motif_file = analysis_path / mtl; if (fs::exists(motif_file)) { load_motifs(motif_file); } + + vector::size_type seq_num = the_seqs.size(); empty_FLP_vector.clear(); for(i = 0; i < seq_num; i++) { @@ -668,6 +729,7 @@ Mussa::load(fs::path ana_file) all_comps[i].push_back(dummy_comp); } + for(i = 0; i < seq_num; i++) { for(i2 = i+1; i2 < seq_num; i2++) @@ -675,15 +737,13 @@ Mussa::load(fs::path ana_file) append_info.str(""); append_info << analysis_name << "_sp_" << i << "v" << i2 << ".flp"; //clog << append_info.str() << endl; - a_file_path = file_path_base / append_info.str(); - //clog << "path " << a_file_path.string() << endl; + fs::path flp(append_info.str(), fs::native); + a_file_path = analysis_path / flp; all_comps[i][i2].load(a_file_path); - //clog << "real size = " << all_comps[i][i2].size() << endl; } } } - void Mussa::save_old() { @@ -788,7 +848,7 @@ void Mussa::load_motifs(fs::path filename) void Mussa::load_motifs(std::istream &in) { std::string data; - const char *alphabet = Alphabet::nucleic_cstr; + const char *alphabet = Alphabet::dna_cstr; motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper); // slurp our data into a string @@ -829,7 +889,7 @@ void Mussa::update_sequences_motifs() { // once we've loaded all the motifs from the file, // lets attach them to the sequences - for(vector >::iterator seq_i = the_seqs.begin(); + for(vector::iterator seq_i = the_seqs.begin(); seq_i != the_seqs.end(); ++seq_i) {