X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=alg%2Fmussa.cpp;h=cda06ffe3ab9001d78bfa85323b7ee3b12d98435;hb=944d43be5db80cddba58cecb12bedcbe451bb3f7;hp=18539b83ed44860f8439c5378c39352a2381197c;hpb=c760a5d3839b241f2f94437731050622b197bf03;p=mussa.git diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 18539b8..cda06ff 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -12,22 +12,22 @@ // ---------- mussa_class.cc ----------- // ---------------------------------------- +#include #include #include namespace fs = boost::filesystem; -#include -#include -#include -#include -namespace spirit = boost::spirit; +#include #include #include #include "mussa_exceptions.hpp" -#include "alg/mussa.hpp" -#include "alg/flp.hpp" + +#include "flp.hpp" +#include "io.hpp" +#include "mussa.hpp" +#include "motif_parser.hpp" using namespace std; @@ -36,8 +36,8 @@ Mussa::Mussa() : color_mapper(new AnnotationColors) { clear(); - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); } Mussa::Mussa(const Mussa& m) @@ -53,8 +53,14 @@ Mussa::Mussa(const Mussa& m) analysis_path(m.analysis_path), dirty(m.dirty) { - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); +} + +MussaRef Mussa::init() +{ + boost::shared_ptr m(new Mussa()); + return m; } boost::filesystem::path Mussa::get_analysis_path() const @@ -83,7 +89,35 @@ Mussa::clear() the_seqs.clear(); the_paths.clear(); analysis_path = fs::path(); - dirty = false; + set_dirty(false); +} + +void Mussa::set_append_window(bool v) +{ + win_append = v; +} + +bool Mussa::get_append_window() +{ + return win_append; +} + +void Mussa::set_append_threshold(bool v) +{ + thres_append = v; +} + +bool Mussa::get_append_threshold() +{ + return thres_append; +} + +void Mussa::set_dirty(bool new_state) +{ + if (dirty != new_state) { + dirty = new_state; + emit isModified(dirty); + } } bool Mussa::is_dirty() const @@ -104,7 +138,7 @@ void Mussa::set_name(string a_name) { analysis_name = a_name; - dirty = true; + set_dirty(true); } string Mussa::get_name() const @@ -137,7 +171,7 @@ void Mussa::set_window(int a_window) { window = a_window; - dirty = true; + set_dirty(true); } int Mussa::get_window() const @@ -149,7 +183,7 @@ void Mussa::set_threshold(int a_threshold) { threshold = a_threshold; - dirty = true; + set_dirty(true); if (a_threshold > soft_thres) { soft_thres = a_threshold; } @@ -177,11 +211,12 @@ int Mussa::get_soft_threshold() const return soft_thres; } + void Mussa::set_analysis_mode(enum analysis_modes new_ana_mode) { ana_mode = new_ana_mode; - dirty = true; + set_dirty(true); } enum Mussa::analysis_modes Mussa::get_analysis_mode() const @@ -308,17 +343,17 @@ void Mussa::append_sequence(const Sequence& a_seq) { boost::shared_ptr seq_copy(new Sequence(a_seq)); the_seqs.push_back(seq_copy); - dirty = true; + set_dirty(true); } void Mussa::append_sequence(boost::shared_ptr a_seq) { the_seqs.push_back(a_seq); - dirty = true; + set_dirty(true); } -const vector >& +const vector& Mussa::sequences() const { return the_seqs; @@ -337,27 +372,16 @@ void Mussa::load_sequence(fs::path seq_file, fs::path annot_file, aseq->set_species(*name); } the_seqs.push_back(aseq); - dirty = true; + set_dirty(true); +} + +void Mussa::load_mupa_file(std::string para_file_path) { + load_mupa_file(boost::filesystem::path(para_file_path)); } void Mussa::load_mupa_file(fs::path para_file_path) { - fs::ifstream para_file; - string file_data_line; - string param, value; - fs::path annot_file; - int split_index, fasta_index; - int sub_seq_start, sub_seq_end; - bool seq_params, did_seq; - string err_msg; - bool parsing_path; - string::size_type new_index, dir_index; - - // initialize values - clear(); - - // if file was opened, read the parameter values if (not fs::exists(para_file_path)) { throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); @@ -366,94 +390,146 @@ Mussa::load_mupa_file(fs::path para_file_path) } else if (fs::is_empty(para_file_path)) { throw mussa_load_error("Config File: " + para_file_path.string() + " is empty"); } else { + // what directory is the mupa file in? + fs::path file_path_base( para_file_path.branch_path()) ; + + fs::ifstream para_file; para_file.open(para_file_path, ios::in); + + load_mupa_stream(para_file, file_path_base); + para_file.close(); + } +} - // what directory is the mupa file in? - fs::path file_path_base = para_file_path.branch_path(); +void +Mussa::load_mupa_stream(std::istream& para_file, fs::path& file_path_base) +{ + std::string line; + std::vector< std::string > tokens; + int line_count = 0; + + enum parsing_state_enum { START, INSEQUENCE }; + parsing_state_enum parsing_state = START; + + // sequence file parameters + fs::path seq_file; + fs::path annot_file; + int split_index, fasta_index; + int sub_seq_start, sub_seq_end; + bool seq_params, did_seq; + std::string err_msg; + // initialize values + clear(); + + + while (para_file.good()) + { + ++line_count; // setup loop by getting file's first line - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - while (para_file) - { - did_seq = false; - if (param == "ANA_NAME") - analysis_name = value; - else if (param == "APPEND_WIN") - win_append = true; - else if (param == "APPEND_THRES") - thres_append = true; - else if (param == "SEQUENCE_NUM") - ; // ignore sequence_num now - else if (param == "WINDOW") - window = atoi(value.c_str()); - else if (param == "THRESHOLD") - threshold = atoi(value.c_str()); - else if (param == "SEQUENCE") - { - fs::path seq_file = file_path_base / value; - //cout << "seq_file_name " << seq_files.back() << endl; - fasta_index = 1; - annot_file = ""; - sub_seq_start = 0; - sub_seq_end = 0; - seq_params = true; - - while (para_file && seq_params) - { - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - if (param == "FASTA_INDEX") - fasta_index = atoi(value.c_str()); - else if (param == "ANNOTATION") - annot_file = file_path_base / value; - else if (param == "SEQ_START") - sub_seq_start = atoi(value.c_str()); - else if (param == "SEQ_END") - { - sub_seq_end = atoi(value.c_str()); - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else seq_params = false; - } + multiplatform_getline(para_file, line); + // strip leading/trailing whitespace + boost::trim(line); + // ignore commented out or blank lines + if ( line.size() == 0 or line[0] == '#' ) { + continue; + } + + // split the line on white spance + boost::split(tokens, line, boost::is_space()); + // do we have a name/value pair? + if (tokens.size() != 2) { + std::stringstream errmsg; + errmsg << "Error parsing MUPA file line: " + << line_count << std::endl + << line; + throw mussa_load_error(errmsg.str()); + } + + boost::to_upper(tokens[0]); + // Parameters only useful after a sequence block + if (parsing_state == INSEQUENCE) { + // in the following if blocks, if we do + // successfully match a token we should continue + // on to the next token + // but if we don't match a token we want to + // fall through to the top level parsing + + if (tokens[0] == "FASTA_INDEX") { + fasta_index = atoi(tokens[1].c_str()); + continue; + } else if (tokens[0] == "ANNOTATION") { + annot_file = file_path_base / tokens[1]; + continue; + } else if (tokens[0] == "SEQ_START") { + sub_seq_start = atoi(tokens[1].c_str()); + continue; + } else if (tokens[0] == "SEQ_END") { + sub_seq_end = atoi(tokens[1].c_str()); + continue; + } else { + // any other token means we're done with this + // sequence so we should load it + // (and let the "unknown" token fall through into the + // top level token parser) load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, sub_seq_end); - did_seq = true; + parsing_state = START; } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else - { - clog << "Illegal/misplaced mussa parameter in file\n"; - clog << param << "\n"; - } - - if (!did_seq) - { - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - did_seq = false; + } + // if we didn't consume a token from the previous if block + // try + // top level token parsing + if (tokens[0] == "ANA_NAME") { + analysis_name = tokens[1]; + } else if (tokens[0] == "APPEND_WIN") { + win_append = true; + } else if (tokens[0] == "APPEND_THRES") { + thres_append = true; + } else if (tokens[0] == "SEQUENCE_NUM") { + ; // ignore sequence_num now + } else if (tokens[0] == "WINDOW") { + window = atoi(tokens[1].c_str()); + } else if (tokens[0] == "THRESHOLD") { + threshold = atoi(tokens[1].c_str()); + } else if (tokens[0] == "SEQUENCE") { + if (parsing_state == INSEQUENCE) { + cout << "seq_file_name call2" << seq_file << endl; + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); + parsing_state = START; } + // reset sequence parameters + seq_file = file_path_base / tokens[1]; + fasta_index = 1; + annot_file = ""; + sub_seq_start = 0; + sub_seq_end = 0; + seq_params = true; + parsing_state = INSEQUENCE; + } else { + clog << "Illegal/misplaced mussa parameter in file\n"; + clog << tokens[0] << "\n"; + std::stringstream errmsg; + errmsg << "Invalid mussa paaramater '" + << tokens[0] + << "' on line: " + << line_count << std::endl + << line; + throw mussa_load_error(errmsg.str()); + throw mussa_load_error("Error parsing MUPA file"); } + } - para_file.close(); - - soft_thres = threshold; - //cout << "nway mupa: analysis_name = " << analysis_name - // << " window = " << window - // << " threshold = " << threshold << endl; + // if we hit the end of the file and there's a sequence + // pending, go ahead and load it + if (parsing_state == INSEQUENCE) { + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); } - // no file was loaded, signal error - dirty = true; + + soft_thres = threshold; + set_dirty(true); } @@ -464,8 +540,7 @@ Mussa::analyze() throw mussa_analysis_error("you need to have at least 2 sequences to " "do an analysis."); } - //cout << "nway ana: seq_num = " << the_seqs.size() << endl; - + seqcomp(); the_paths.setup(window, threshold); nway(); @@ -541,7 +616,6 @@ Mussa::nway() void Mussa::save(fs::path save_path) { - fs::path flp_filepath; fs::fstream save_file; ostringstream append_info; int dir_create_status; @@ -572,8 +646,12 @@ Mussa::save(fs::path save_path) if (not fs::exists(save_path)) { fs::create_directory(save_path); } + + std::string basename = save_path.leaf(); + fs::path museq(basename + ".museq", fs::native); + // save sequence and annots to a special mussa file - save_file.open(save_path / (save_path.leaf()+".museq"), ios::out); + save_file.open(save_path / museq, ios::out); save_file << "" << endl; for(vector::size_type i = 0; i < the_seqs.size(); i++) @@ -584,18 +662,27 @@ Mussa::save(fs::path save_path) save_file << "" << endl; save_file.close(); + // if we have any motifs, save them. + if (motif_sequences.size()) { + fs::path mtl(basename + ".mtl", fs::native); + save_motifs(save_path / mtl); + } + // save nway paths to its mussa save file - the_paths.save(save_path / (save_path.leaf()+ ".muway")); + fs::path muway(basename + ".muway", fs::native); + the_paths.save(save_path / muway); for(vector::size_type i = 0; i < the_seqs.size(); i++) { for(vector::size_type i2 = i+1; i2 < the_seqs.size(); i2++) { append_info.str(""); append_info << "_sp_" << i << "v" << i2; - all_comps[i][i2].save(save_path/(save_path.leaf()+append_info.str()+".flp")); + fs::path flp(basename+append_info.str()+".flp", fs::native); + all_comps[i][i2].save(save_path / flp); } } - dirty = false; + + set_dirty(false); analysis_path = save_path; } @@ -618,33 +705,78 @@ Mussa::load(fs::path ana_file) vector empty_FLP_vector; FLPs dummy_comp; + + //-------------------------------------------------------- + // Load Muway + //-------------------------------------------------------- analysis_path = ana_file; - //clog << "ana_file name " << ana_file.string() << endl; analysis_name = ana_path.leaf(); - //clog << " ana_name " << analysis_name << endl; - file_path_base = ana_path.branch_path() / analysis_name; - a_file_path = file_path_base / (analysis_name + ".muway"); - //clog << " loading museq: " << a_file_path.string() << endl; + fs::path muway(analysis_name+".muway", fs::native); + a_file_path = analysis_path / muway; the_paths.load(a_file_path); // perhaps this could be more elegent, but at least this'll let // us know what our threshold and window sizes were when we load a muway window = the_paths.get_window(); threshold = the_paths.get_threshold(); - soft_thres = threshold; + soft_thres = the_paths.get_soft_threshold(); - int seq_num = the_paths.sequence_count(); - a_file_path = file_path_base / (analysis_name + ".museq"); + //-------------------------------------------------------- + // Load Sequence + //-------------------------------------------------------- + //int seq_num = the_paths.sequence_count(); + + fs::path museq(analysis_name + ".museq", fs::native); + a_file_path = analysis_path / museq; // this is a bit of a hack due to C++ not acting like it should with files - for (i = 1; i <= seq_num; i++) + /*for (i = 1; i <= seq_num; i++) { boost::shared_ptr tmp_seq(new Sequence); - //clog << "mussa_class: loading museq frag... " << a_file_path.string() << endl; tmp_seq->load_museq(a_file_path, i); the_seqs.push_back(tmp_seq); + }*/ + + i = 1; + //int seq_num = 0; + boost::filesystem::fstream load_museq_fs; + load_museq_fs.open(a_file_path, std::ios::in); + boost::shared_ptr tmp_seq; + while (1) + { + tmp_seq = Sequence::load_museq(load_museq_fs); + + if (tmp_seq) + { + the_seqs.push_back(tmp_seq); + } + else + { + break; + } + + + //safe guard in case of an infinate loop. + //FIXME: If mussa can handle a comparison of 10000 sequences + // in the future, then this code should be fixed. + if (i == 10000) + { + throw mussa_load_error(" Run away sequence load!"); + } + i++; + } + load_museq_fs.close(); + + //-------------------------------------------------------- + // Load Motifs + //-------------------------------------------------------- + fs::path mtl(analysis_name + ".mtl", fs::native); + fs::path motif_file = analysis_path / mtl; + if (fs::exists(motif_file)) { + load_motifs(motif_file); } + vector::size_type seq_num = the_seqs.size(); empty_FLP_vector.clear(); for(i = 0; i < seq_num; i++) { @@ -653,6 +785,7 @@ Mussa::load(fs::path ana_file) all_comps[i].push_back(dummy_comp); } + for(i = 0; i < seq_num; i++) { for(i2 = i+1; i2 < seq_num; i2++) @@ -660,15 +793,13 @@ Mussa::load(fs::path ana_file) append_info.str(""); append_info << analysis_name << "_sp_" << i << "v" << i2 << ".flp"; //clog << append_info.str() << endl; - a_file_path = file_path_base / append_info.str(); - //clog << "path " << a_file_path.string() << endl; + fs::path flp(append_info.str(), fs::native); + a_file_path = analysis_path / flp; all_comps[i][i2].load(a_file_path); - //clog << "real size = " << all_comps[i][i2].size() << endl; } } } - void Mussa::save_old() { @@ -745,6 +876,7 @@ void Mussa::add_motif(const Sequence& motif, const Color& color) { motif_sequences.insert(motif); color_mapper->appendInstanceColor("motif", motif.get_sequence(), color); + set_dirty(true); } void Mussa::set_motifs(const vector& motifs, @@ -762,73 +894,18 @@ void Mussa::set_motifs(const vector& motifs, update_sequences_motifs(); } -// Helper functor to append created motifs to our Mussa analysis -struct push_back_motif { - Mussa::motif_set& motifs; - boost::shared_ptr color_mapper; - std::string& seq_string; - std::string& name; - float& red; - float& green; - float& blue; - float& alpha; - int& parsed; - - push_back_motif(Mussa::motif_set& motifs_, - boost::shared_ptr color_mapper_, - std::string& seq_, - std::string& name_, - float &red_, float &green_, float &blue_, float &alpha_, - int &parsed_) - : motifs(motifs_), - color_mapper(color_mapper_), - seq_string(seq_), - name(name_), - red(red_), - green(green_), - blue(blue_), - alpha(alpha_), - parsed(parsed_) - { - } - - void operator()(std::string::const_iterator, - std::string::const_iterator) const - { - Sequence seq(seq_string, Sequence::nucleic_alphabet); - // shouldn't we have a better field than "fasta header" and speices? - seq.set_fasta_header(name); - // we need to clear the name in case the next motif doesn't have one. - name.clear(); - // be nice if glsequence was a subclass of sequence so we could - // just attach colors directly to the motif. - Color c(red, green, blue, alpha); - color_mapper->appendInstanceColor("motif", seq.c_str(), c); - alpha = 1.0; - motifs.insert(seq); - ++parsed; - }; -}; - void Mussa::load_motifs(fs::path filename) { fs::ifstream f; f.open(filename, ifstream::in); load_motifs(f); } - -// I mostly split the ifstream out so I can use a stringstream to test it. + void Mussa::load_motifs(std::istream &in) { std::string data; - const char *alphabet = Alphabet::nucleic_alphabet.c_str(); - string seq; - string name; - float red = 1.0; - float green = 0.0; - float blue = 0.0; - float alpha = 1.0; - int parsed = 1; + const char *alphabet = Alphabet::dna_cstr; + motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper); // slurp our data into a string std::streamsize bytes_read = 1; @@ -838,40 +915,7 @@ void Mussa::load_motifs(std::istream &in) bytes_read = in.readsome(buf, bufsiz); data.append(buf, buf+bytes_read); } - // parse our string - bool ok = spirit::parse(data.begin(), data.end(), - *( - ( - ( - (+spirit::chset<>(alphabet))[spirit::assign_a(seq)] >> - +spirit::space_p - ) >> - !( - ( - // names can either be letter followed by non-space characters - (spirit::alpha_p >> *spirit::graph_p)[spirit::assign_a(name)] - | - // or a quoted string - ( - spirit::ch_p('"') >> - (+(~spirit::ch_p('"')))[spirit::assign_a(name)] >> - spirit::ch_p('"') - ) - ) >> +spirit::space_p - ) >> - spirit::real_p[spirit::assign_a(red)] >> +spirit::space_p >> - spirit::real_p[spirit::assign_a(green)] >> +spirit::space_p >> - spirit::real_p[spirit::assign_a(blue)] >> +spirit::space_p >> - !(spirit::real_p[spirit::assign_a(alpha)] >> +spirit::space_p) - )[push_back_motif(motif_sequences, color_mapper, seq, name, red, green, blue, alpha, parsed)] - )).full; - if (not ok) { - stringstream msg; - msg << "Error parsing motif #" << parsed; - // erase our potentially broken motif list - motif_sequences.clear(); - throw motif_load_error(msg.str()); - } + parsed_motifs.parse(data); update_sequences_motifs(); } @@ -901,7 +945,7 @@ void Mussa::update_sequences_motifs() { // once we've loaded all the motifs from the file, // lets attach them to the sequences - for(vector >::iterator seq_i = the_seqs.begin(); + for(vector::iterator seq_i = the_seqs.begin(); seq_i != the_seqs.end(); ++seq_i) {