X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=alg%2Fmussa.cpp;h=4b96d1b4d115a6f12915d1b3fd5a1d082d43212e;hb=f88eea68b95773eb5683dcca6cf3fa59b9f00036;hp=5acd58e6bdfeaf1a9081ee3d93c5351be79054ba;hpb=4f161bfd1e0a558abc59e6429bf805b3c372380f;p=mussa.git diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 5acd58e..4b96d1b 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -12,6 +12,7 @@ // ---------- mussa_class.cc ----------- // ---------------------------------------- +#include #include #include namespace fs = boost::filesystem; @@ -20,8 +21,9 @@ namespace fs = boost::filesystem; #include #include "mussa_exceptions.hpp" -#include "alg/mussa.hpp" #include "alg/flp.hpp" +#include "alg/mussa.hpp" +#include "alg/motif_parser.hpp" using namespace std; @@ -30,8 +32,8 @@ Mussa::Mussa() : color_mapper(new AnnotationColors) { clear(); - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); } Mussa::Mussa(const Mussa& m) @@ -43,10 +45,28 @@ Mussa::Mussa(const Mussa& m) win_append(m.win_append), thres_append(m.thres_append), motif_sequences(m.motif_sequences), - color_mapper(m.color_mapper) + color_mapper(m.color_mapper), + analysis_path(m.analysis_path), + dirty(m.dirty) { - connect(&the_paths, SIGNAL(progress(const std::string&, int, int)), - this, SIGNAL(progress(const std::string&, int, int))); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); +} + +MussaRef Mussa::init() +{ + boost::shared_ptr m(new Mussa()); + return m; +} + +boost::filesystem::path Mussa::get_analysis_path() const +{ + return analysis_path; +} + +void Mussa::set_analysis_path(boost::filesystem::path pathname) +{ + analysis_path = pathname; } // set all parameters to null state @@ -62,9 +82,31 @@ Mussa::clear() thres_append = false; motif_sequences.clear(); if(color_mapper) color_mapper->clear(); + the_seqs.clear(); the_paths.clear(); + analysis_path = fs::path(); + set_dirty(false); +} + +void Mussa::set_dirty(bool new_state) +{ + if (dirty != new_state) { + dirty = new_state; + emit isModified(dirty); + } } +bool Mussa::is_dirty() const +{ + return dirty; +} + +bool Mussa::empty() const +{ + return the_seqs.empty(); +} + + // these 5 simple methods manually set the parameters for doing an analysis // used so that the gui can take input from user and setup the analysis // note - still need a set_append(bool, bool) method... @@ -72,13 +114,26 @@ void Mussa::set_name(string a_name) { analysis_name = a_name; + set_dirty(true); } -string Mussa::get_name() +string Mussa::get_name() const { return analysis_name; } +string Mussa::get_title() const +{ + fs::path analysis_path = get_analysis_path(); + if (not analysis_path.empty()) { + return analysis_path.native_file_string(); + } else if (get_name().size() > 0) { + return get_name(); + } else { + return std::string("Unnamed"); + } +} + int Mussa::size() const { @@ -92,6 +147,7 @@ void Mussa::set_window(int a_window) { window = a_window; + set_dirty(true); } int Mussa::get_window() const @@ -103,8 +159,10 @@ void Mussa::set_threshold(int a_threshold) { threshold = a_threshold; - if (a_threshold > soft_thres) + set_dirty(true); + if (a_threshold > soft_thres) { soft_thres = a_threshold; + } } int Mussa::get_threshold() const @@ -133,6 +191,7 @@ void Mussa::set_analysis_mode(enum analysis_modes new_ana_mode) { ana_mode = new_ana_mode; + set_dirty(true); } enum Mussa::analysis_modes Mussa::get_analysis_mode() const @@ -259,15 +318,17 @@ void Mussa::append_sequence(const Sequence& a_seq) { boost::shared_ptr seq_copy(new Sequence(a_seq)); the_seqs.push_back(seq_copy); + set_dirty(true); } void Mussa::append_sequence(boost::shared_ptr a_seq) { the_seqs.push_back(a_seq); + set_dirty(true); } -const vector >& +const vector& Mussa::sequences() const { return the_seqs; @@ -286,6 +347,7 @@ void Mussa::load_sequence(fs::path seq_file, fs::path annot_file, aseq->set_species(*name); } the_seqs.push_back(aseq); + set_dirty(true); } void @@ -306,8 +368,14 @@ Mussa::load_mupa_file(fs::path para_file_path) clear(); // if file was opened, read the parameter values - if (fs::exists(para_file_path)) + if (not fs::exists(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); + } else if (fs::is_directory(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is a directory."); + } else if (fs::is_empty(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is empty"); + } else { para_file.open(para_file_path, ios::in); // what directory is the mupa file in? @@ -319,7 +387,7 @@ Mussa::load_mupa_file(fs::path para_file_path) param = file_data_line.substr(0,split_index); value = file_data_line.substr(split_index+1); - while (!para_file.eof()) + while (para_file) { did_seq = false; if (param == "ANA_NAME") @@ -344,7 +412,7 @@ Mussa::load_mupa_file(fs::path para_file_path) sub_seq_end = 0; seq_params = true; - while ((!para_file.eof()) && seq_params) + while (para_file && seq_params) { getline(para_file,file_data_line); split_index = file_data_line.find(" "); @@ -395,57 +463,21 @@ Mussa::load_mupa_file(fs::path para_file_path) // << " threshold = " << threshold << endl; } // no file was loaded, signal error - else - { - throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); - } + set_dirty(true); } void Mussa::analyze() { - time_t t1, t2, begin, end; - double seqloadtime, seqcomptime, nwaytime, savetime, totaltime; - - begin = time(NULL); - - t1 = time(NULL); - if (the_seqs.size() < 2) { throw mussa_analysis_error("you need to have at least 2 sequences to " "do an analysis."); } - //cout << "nway ana: seq_num = " << the_seqs.size() << endl; - - t2 = time(NULL); - seqloadtime = difftime(t2, t1); - - t1 = time(NULL); + seqcomp(); - t2 = time(NULL); - seqcomptime = difftime(t2, t1); - - t1 = time(NULL); the_paths.setup(window, threshold); nway(); - t2 = time(NULL); - nwaytime = difftime(t2, t1); - - t1 = time(NULL); - save(); - t2 = time(NULL); - savetime = difftime(t2, t1); - - end = time(NULL); - totaltime = difftime(end, begin); - - //cout << "seqload\tseqcomp\tnway\tsave\ttotal\n"; - //cout << seqloadtime << "\t"; - //cout << seqcomptime << "\t"; - //cout << nwaytime << "\t"; - //cout << savetime << "\t"; - //cout << totaltime << "\n"; } void @@ -496,7 +528,7 @@ Mussa::nway() } else if (ana_mode == EntropyNway) { - vector some_Seqs; + vector some_Seqs; //unlike other methods, entropy needs to look at the sequence at this stage some_Seqs.clear(); for(vector::size_type i = 0; i < the_seqs.size(); i++) @@ -516,61 +548,76 @@ Mussa::nway() } void -Mussa::save() +Mussa::save(fs::path save_path) { - string save_name; - fs::path flp_filepath; fs::fstream save_file; ostringstream append_info; int dir_create_status; - if (not analysis_name.empty()) { - // not sure why, but gotta close file each time since can't pass - // file streams - save_name = analysis_name; + if (save_path.empty()) { + if (not analysis_path.empty()) { + save_path = analysis_path; + } else if (not analysis_name.empty()) { + std::string save_name = analysis_name; + // gotta do bit with adding win & thres if to be appended + if (win_append) { + append_info.str(""); + append_info << "_w" << window; + save_name += append_info.str(); + } - // gotta do bit with adding win & thres if to be appended - if (win_append) - { - append_info.str(""); - append_info << "_w" << window; - save_name += append_info.str(); + if (thres_append) { + append_info.str(""); + append_info << "_t" << threshold; + save_name += append_info.str(); + } + save_path = save_name; + } else { + throw mussa_save_error("Need filename or analysis name to save"); } + } - if (thres_append) - { - append_info.str(""); - append_info << "_t" << threshold; - save_name += append_info.str(); - } - fs::path save_path( save_name); + if (not fs::exists(save_path)) { + fs::create_directory(save_path); + } + + std::string basename = save_path.leaf(); + fs::path museq(basename + ".museq", fs::native); + + // save sequence and annots to a special mussa file + save_file.open(save_path / museq, ios::out); + save_file << "" << endl; - if (not fs::exists(save_path)) { - fs::create_directory(save_path); - } - // save sequence and annots to a special mussa file - save_file.open(save_path / (save_name+".museq"), ios::out); - save_file << "" << endl; + for(vector::size_type i = 0; i < the_seqs.size(); i++) + { + the_seqs[i]->save(save_file); + } - for(vector::size_type i = 0; i < the_seqs.size(); i++) - { - the_seqs[i]->save(save_file); - } + save_file << "" << endl; + save_file.close(); - save_file << "" << endl; - save_file.close(); + // if we have any motifs, save them. + if (motif_sequences.size()) { + fs::path mtl(basename + ".mtl", fs::native); + save_motifs(save_path / mtl); + } - // save nway paths to its mussa save file - the_paths.save(save_path / (save_name + ".muway")); + // save nway paths to its mussa save file + fs::path muway(basename + ".muway", fs::native); + the_paths.save(save_path / muway); - for(vector::size_type i = 0; i < the_seqs.size(); i++) - for(vector::size_type i2 = i+1; i2 < the_seqs.size(); i2++) - { - append_info.str(""); - append_info << "_sp_" << i << "v" << i2; - all_comps[i][i2].save(save_path/(save_name+append_info.str()+".flp")); - } + for(vector::size_type i = 0; i < the_seqs.size(); i++) { + for(vector::size_type i2 = i+1; i2 < the_seqs.size(); i2++) + { + append_info.str(""); + append_info << "_sp_" << i << "v" << i2; + fs::path flp(basename+append_info.str()+".flp", fs::native); + all_comps[i][i2].save(save_path / flp); + } } + + set_dirty(false); + analysis_path = save_path; } void @@ -592,12 +639,10 @@ Mussa::load(fs::path ana_file) vector empty_FLP_vector; FLPs dummy_comp; - //cout << "ana_file name " << ana_file.string() << endl; + analysis_path = ana_file; analysis_name = ana_path.leaf(); - //cout << " ana_name " << analysis_name << endl; - file_path_base = ana_path.branch_path() / analysis_name; - a_file_path = file_path_base / (analysis_name + ".muway"); - //cout << " loading museq: " << a_file_path.string() << endl; + fs::path muway(analysis_name+".muway", fs::native); + a_file_path = analysis_path / muway; the_paths.load(a_file_path); // perhaps this could be more elegent, but at least this'll let // us know what our threshold and window sizes were when we load a muway @@ -607,17 +652,22 @@ Mussa::load(fs::path ana_file) int seq_num = the_paths.sequence_count(); - a_file_path = file_path_base / (analysis_name + ".museq"); + fs::path museq(analysis_name + ".museq", fs::native); + a_file_path = analysis_path / museq; // this is a bit of a hack due to C++ not acting like it should with files for (i = 1; i <= seq_num; i++) { boost::shared_ptr tmp_seq(new Sequence); - //cout << "mussa_class: loading museq frag... " << a_file_path.string() << endl; tmp_seq->load_museq(a_file_path, i); the_seqs.push_back(tmp_seq); } + fs::path mtl(analysis_name + ".mtl", fs::native); + fs::path motif_file = analysis_path / mtl; + if (fs::exists(motif_file)) { + load_motifs(motif_file); + } empty_FLP_vector.clear(); for(i = 0; i < seq_num; i++) { @@ -632,16 +682,14 @@ Mussa::load(fs::path ana_file) { append_info.str(""); append_info << analysis_name << "_sp_" << i << "v" << i2 << ".flp"; - //cout << append_info.str() << endl; - a_file_path = file_path_base / append_info.str(); - //cout << "path " << a_file_path.string() << endl; + //clog << append_info.str() << endl; + fs::path flp(append_info.str(), fs::native); + a_file_path = analysis_path / flp; all_comps[i][i2].load(a_file_path); - //cout << "real size = " << all_comps[i][i2].size() << endl; } } } - void Mussa::save_old() { @@ -714,19 +762,21 @@ Mussa::load_old(char * load_file_path, int s_num) //the_paths.save("tmp.save"); } -void Mussa::add_motif(const string& motif, const Color& color) +void Mussa::add_motif(const Sequence& motif, const Color& color) { motif_sequences.insert(motif); - color_mapper->appendInstanceColor("motif", motif, color); + color_mapper->appendInstanceColor("motif", motif.get_sequence(), color); + set_dirty(true); } -void Mussa::add_motifs(const vector& motifs, +void Mussa::set_motifs(const vector& motifs, const vector& colors) { if (motifs.size() != colors.size()) { throw mussa_error("motif and color vectors must be the same size"); } + motif_sequences.clear(); for(size_t i = 0; i != motifs.size(); ++i) { add_motif(motifs[i], colors[i]); @@ -734,62 +784,51 @@ void Mussa::add_motifs(const vector& motifs, update_sequences_motifs(); } -// I mostly split the ifstream out so I can use a stringstream to test it. +void Mussa::load_motifs(fs::path filename) +{ + fs::ifstream f; + f.open(filename, ifstream::in); + load_motifs(f); +} + void Mussa::load_motifs(std::istream &in) { - string seq; - float red; - float green; - float blue; - - while(in.good()) - { - in >> seq >> red >> green >> blue; - // if we couldn't read this line 'cause we're like at the end of the file - // try to exit the loop - if (!in.good()) - break; - try { - seq = Sequence::motif_normalize(seq); - } catch(motif_normalize_error e) { - clog << "unable to parse " << seq << " skipping" << endl; - clog << e.what() << endl; - continue; - } - if (red < 0.0 or red > 1.0) { - clog << "invalid red value " << red << ". must be in range [0..1]" - << endl; - continue; - } - if (green < 0.0 or green > 1.0) { - clog << "invalid green value " << green << ". must be in range [0..1]" - << endl; - continue; - } - if (blue < 0.0 or blue > 1.0) { - clog << "invalid blue value " << blue << ". must be in range [0..1]" - << endl; - continue; - } - if (motif_sequences.find(seq) == motif_sequences.end()) { - // sequence wasn't found - motif_sequences.insert(seq); - Color c(red, green, blue); - color_mapper->appendInstanceColor("motif", seq, c); - } else { - clog << "sequence " << seq << " was already defined skipping" - << endl; - continue; - } + std::string data; + const char *alphabet = Alphabet::dna_cstr; + motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper); + + // slurp our data into a string + std::streamsize bytes_read = 1; + while (in.good() and bytes_read) { + const std::streamsize bufsiz=512; + char buf[bufsiz]; + bytes_read = in.readsome(buf, bufsiz); + data.append(buf, buf+bytes_read); } + parsed_motifs.parse(data); update_sequences_motifs(); } -void Mussa::load_motifs(fs::path filename) +void Mussa::save_motifs(fs::path filename) { - fs::ifstream f; - f.open(filename, ifstream::in); - load_motifs(f); + fs::ofstream out_stream; + out_stream.open(filename, ofstream::out); + save_motifs(out_stream); +} + +void Mussa::save_motifs(std::ostream& out) +{ + for(motif_set::iterator motif_i = motif_sequences.begin(); + motif_i != motif_sequences.end(); + ++motif_i) + { + out << motif_i->get_sequence() << " "; + if (motif_i->get_name().size() > 0) { + out << "\"" << motif_i->get_name() << "\" "; + } + out << color_mapper->lookup("motif", motif_i->get_sequence()); + out << std::endl; + } } void Mussa::update_sequences_motifs() @@ -803,7 +842,7 @@ void Mussa::update_sequences_motifs() // clear out old motifs (*seq_i)->clear_motifs(); // for all the motifs in our set, attach them to the current sequence - for(set::iterator motif_i = motif_sequences.begin(); + for(set::iterator motif_i = motif_sequences.begin(); motif_i != motif_sequences.end(); ++motif_i) { @@ -812,7 +851,7 @@ void Mussa::update_sequences_motifs() } } -const set& Mussa::motifs() const +const set& Mussa::motifs() const { return motif_sequences; }