X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=alg%2Fmussa.cpp;h=9a5935e19947cbd68e28e7c7a3f88b267816030f;hb=6d25d4d945af696134bdf788b111f38b197b1a15;hp=47794ef7888acd8c0f00324149bfff93eb25ae10;hpb=198f7c18fd0b5fa84528e9b28a84269c3a856cc4;p=mussa.git diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 47794ef..9a5935e 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -12,19 +12,63 @@ // ---------- mussa_class.cc ----------- // ---------------------------------------- -#include +#include +#include +#include +namespace fs = boost::filesystem; + #include #include #include "mussa_exceptions.hpp" -#include "alg/mussa.hpp" -#include "alg/flp.hpp" + +#include "flp.hpp" +#include "io.hpp" +#include "mussa.hpp" +#include "motif_parser.hpp" using namespace std; + Mussa::Mussa() + : color_mapper(new AnnotationColors) { clear(); + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); +} + +Mussa::Mussa(const Mussa& m) + : analysis_name(m.analysis_name), + window(m.window), + threshold(m.threshold), + soft_thres(m.soft_thres), + ana_mode(m.ana_mode), + win_append(m.win_append), + thres_append(m.thres_append), + motif_sequences(m.motif_sequences), + color_mapper(m.color_mapper), + analysis_path(m.analysis_path), + dirty(m.dirty) +{ + connect(&the_paths, SIGNAL(progress(const QString&, int, int)), + this, SIGNAL(progress(const QString&, int, int))); +} + +MussaRef Mussa::init() +{ + boost::shared_ptr m(new Mussa()); + return m; +} + +boost::filesystem::path Mussa::get_analysis_path() const +{ + return analysis_path; +} + +void Mussa::set_analysis_path(boost::filesystem::path pathname) +{ + analysis_path = pathname; } // set all parameters to null state @@ -38,13 +82,33 @@ Mussa::clear() soft_thres = 0; win_append = false; thres_append = false; - seq_files.clear(); - fasta_indices.clear(); - annot_files.clear(); - sub_seq_starts.clear(); - sub_seq_ends.clear(); + motif_sequences.clear(); + if(color_mapper) color_mapper->clear(); + the_seqs.clear(); + the_paths.clear(); + analysis_path = fs::path(); + set_dirty(false); } +void Mussa::set_dirty(bool new_state) +{ + if (dirty != new_state) { + dirty = new_state; + emit isModified(dirty); + } +} + +bool Mussa::is_dirty() const +{ + return dirty; +} + +bool Mussa::empty() const +{ + return the_seqs.empty(); +} + + // these 5 simple methods manually set the parameters for doing an analysis // used so that the gui can take input from user and setup the analysis // note - still need a set_append(bool, bool) method... @@ -52,20 +116,31 @@ void Mussa::set_name(string a_name) { analysis_name = a_name; + set_dirty(true); } -string Mussa::get_name() +string Mussa::get_name() const { return analysis_name; } +string Mussa::get_title() const +{ + fs::path analysis_path = get_analysis_path(); + if (not analysis_path.empty()) { + return analysis_path.native_file_string(); + } else if (get_name().size() > 0) { + return get_name(); + } else { + return std::string("Unnamed"); + } +} + int Mussa::size() const { if (the_seqs.size() > 0) return the_seqs.size(); - else if (seq_files.size() > 0) - return seq_files.size(); else return 0; } @@ -74,6 +149,7 @@ void Mussa::set_window(int a_window) { window = a_window; + set_dirty(true); } int Mussa::get_window() const @@ -85,7 +161,10 @@ void Mussa::set_threshold(int a_threshold) { threshold = a_threshold; - //soft_thres = a_threshold; + set_dirty(true); + if (a_threshold > soft_thres) { + soft_thres = a_threshold; + } } int Mussa::get_threshold() const @@ -94,15 +173,27 @@ int Mussa::get_threshold() const } void -Mussa::set_soft_thres(int sft_thres) +Mussa::set_soft_threshold(int new_threshold) +{ + if (new_threshold < threshold) { + soft_thres = threshold; + } else if (new_threshold > window) { + soft_thres = window; + } else { + soft_thres = new_threshold; + } +} + +int Mussa::get_soft_threshold() const { - soft_thres = sft_thres; + return soft_thres; } void Mussa::set_analysis_mode(enum analysis_modes new_ana_mode) { ana_mode = new_ana_mode; + set_dirty(true); } enum Mussa::analysis_modes Mussa::get_analysis_mode() const @@ -137,40 +228,162 @@ const NwayPaths& Mussa::paths() const return the_paths; } -// takes a string and sets it as the next seq -void -Mussa::add_a_seq(string a_seq) +//template +//void Mussa::createLocalAlignment(IteratorT begin, IteratorT end) +void Mussa::createLocalAlignment(std::list::iterator begin, + std::list::iterator end, + std::list& result, + std::list >& reversed) { - Sequence aSeq; + const vector_sequence_type& raw_seq = the_seqs; + ConservedPath::path_type aligned_path; + size_t i2, i3; + int x_start, x_end; + int window_length, win_i; + int rc_1 = 0; + int rc_2 = 0; + vector rc_list; + bool full_match; + vector matched; + int align_counter; + + align_counter = 0; + for(std::list::iterator pathz_i=begin; pathz_i != end; ++pathz_i) + { + ConservedPath& a_path = *pathz_i; + window_length = a_path.window_size; + // determine which parts of the path are RC relative to first species + rc_list = a_path.reverseComplimented(); + + // loop over each bp in the conserved region for all sequences + for(win_i = 0; win_i < window_length; win_i++) + { + aligned_path.clear(); + // determine which exact base pairs match between the sequences + full_match = true; + for(i2 = 0; i2 < a_path.size()-1; i2++) + { + // assume not rc as most likely, adjust below + rc_1 = 0; + rc_2 = 0; + // no matter the case, any RC node needs adjustments + if (a_path[i2] < 0) + rc_1 = window_length-1; + if (a_path[i2+1] < 0) + rc_2 = window_length-1; + + x_start = (abs(a_path[i2]-rc_1+win_i)); + x_end = (abs(a_path[i2+1]-rc_2+win_i)); + + boost::shared_ptr cur(raw_seq[i2]) ; + boost::shared_ptr next(raw_seq[i2+1]); + // RC case handling + // ugh, and xor...only want rc coloring if just one of the nodes is rc + // if both nodes are rc, then they are 'normal' relative to each other + if((rc_list[i2] || rc_list[i2+1] )&&!(rc_list[i2] && rc_list[i2+1])) + { //the hideous rc matching logic - not complex, but annoying + if(!(( ((*cur)[x_start]=='A')&&((*next)[x_end]=='T')) || + (((*cur)[x_start]=='T')&&((*next)[x_end]=='A')) || + (((*cur)[x_start]=='G')&&((*next)[x_end]=='C')) || + (((*cur)[x_start]=='C')&&((*next)[x_end]=='G'))) ) + { + full_match = false; + } else { + aligned_path.push_back(x_start); + } + } + else + { // forward match + if (!( ((*cur)[x_start] == (*next)[x_end]) && + ((*cur)[x_start] != 'N') && ((*next)[x_end] != 'N') ) ) { + full_match = false; + } else { + aligned_path.push_back(x_start); + } + } + } + // grab the last part of our path, assuming we matched + if (full_match) + aligned_path.push_back(x_end); - aSeq.set_seq(a_seq); - the_seqs.push_back(aSeq); + if (aligned_path.size() == a_path.size()) { + result.push_back(aligned_path); + reversed.push_back(rc_list); + } + } + align_counter++; + } } -// sets info for just 1 seq at a time -void -Mussa::set_seq_info(string seq_file, string annot_file, int fa_i, int a_start, int the_end) +void Mussa::append_sequence(const Sequence& a_seq) { - seq_files.push_back(seq_file); - fasta_indices.push_back(fa_i); - annot_files.push_back(annot_file); - sub_seq_starts.push_back(a_start); - sub_seq_ends.push_back(the_end); + boost::shared_ptr seq_copy(new Sequence(a_seq)); + the_seqs.push_back(seq_copy); + set_dirty(true); } -const vector& +void Mussa::append_sequence(boost::shared_ptr a_seq) +{ + the_seqs.push_back(a_seq); + set_dirty(true); +} + + +const vector& Mussa::sequences() const { return the_seqs; } +void Mussa::load_sequence(fs::path seq_file, fs::path annot_file, + int fasta_index, int sub_seq_start, int sub_seq_end, + std::string *name) +{ + boost::shared_ptr aseq(new Sequence); + aseq->load_fasta(seq_file, fasta_index, sub_seq_start, sub_seq_end); + if ( not annot_file.empty() ) { + aseq->load_annot(annot_file, sub_seq_start, sub_seq_end); + } + if (name != 0 and name->size() > 0 ) { + aseq->set_species(*name); + } + the_seqs.push_back(aseq); + set_dirty(true); +} + +void Mussa::load_mupa_file(std::string para_file_path) { + load_mupa_file(boost::filesystem::path(para_file_path)); +} + void -Mussa::load_mupa_file(string para_file_path) +Mussa::load_mupa_file(fs::path para_file_path) +{ + if (not fs::exists(para_file_path)) + { + throw mussa_load_error("Config File: " + para_file_path.string() + " not found"); + } else if (fs::is_directory(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is a directory."); + } else if (fs::is_empty(para_file_path)) { + throw mussa_load_error("Config File: " + para_file_path.string() + " is empty"); + } else { + // what directory is the mupa file in? + fs::path file_path_base( para_file_path.branch_path()) ; + + fs::ifstream para_file; + para_file.open(para_file_path, ios::in); + + load_mupa_stream(para_file, file_path_base); + para_file.close(); + } +} + +void +Mussa::load_mupa_stream(std::istream& para_file, fs::path& file_path_base) { - ifstream para_file; string file_data_line; - string param, value, annot_file; + string param, value; + fs::path annot_file; int split_index, fasta_index; int sub_seq_start, sub_seq_end; bool seq_params, did_seq; @@ -181,219 +394,102 @@ Mussa::load_mupa_file(string para_file_path) // initialize values clear(); - para_file.open(para_file_path.c_str(), ios::in); - - // if file was opened, read the parameter values - if (para_file) + // setup loop by getting file's first line + getline(para_file, file_data_line); + split_index = file_data_line.find(" "); + param = file_data_line.substr(0,split_index); + value = file_data_line.substr(split_index+1); + + while (para_file) { - // need to find the path to the .mupa file - parsing_path = true; - dir_index = 0; - while (parsing_path) + did_seq = false; + if (param == "ANA_NAME") + analysis_name = value; + else if (param == "APPEND_WIN") + win_append = true; + else if (param == "APPEND_THRES") + thres_append = true; + else if (param == "SEQUENCE_NUM") + ; // ignore sequence_num now + else if (param == "WINDOW") + window = atoi(value.c_str()); + else if (param == "THRESHOLD") + threshold = atoi(value.c_str()); + else if (param == "SEQUENCE") { - new_index = (para_file_path.substr(dir_index)).find("/"); - if (new_index != string::npos) - dir_index += new_index + 1; - else - parsing_path = false; - } - - file_path_base = para_file_path.substr(0,dir_index); - - // setup loop by getting file's first line - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - while (!para_file.eof()) - { - did_seq = false; - if (param == "ANA_NAME") - analysis_name = value; - else if (param == "APPEND_WIN") - win_append = true; - else if (param == "APPEND_THRES") - thres_append = true; - else if (param == "SEQUENCE_NUM") - ; // ignore sequence_num now - else if (param == "WINDOW") - window = atoi(value.c_str()); - else if (param == "THRESHOLD") - threshold = atoi(value.c_str()); - else if (param == "SEQUENCE") - { - seq_files.push_back(file_path_base + value); - //cout << "seq_file_name " << seq_files.back() << endl; - fasta_index = 1; - annot_file = ""; - sub_seq_start = 0; - sub_seq_end = 0; - seq_params = true; - - while ((!para_file.eof()) && seq_params) - { - getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - if (param == "FASTA_INDEX") - fasta_index = atoi(value.c_str()); - else if (param == "ANNOTATION") - annot_file = file_path_base + value; - else if (param == "SEQ_START") - sub_seq_start = atoi(value.c_str()); - else if (param == "SEQ_END") - { - sub_seq_end = atoi(value.c_str()); - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else seq_params = false; - } - - fasta_indices.push_back(fasta_index); - annot_files.push_back(annot_file); - sub_seq_starts.push_back(sub_seq_start); - sub_seq_ends.push_back(sub_seq_end); - did_seq = true; - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) {} - else - { - clog << "Illegal/misplaced mussa parameter in file\n"; - clog << param << "\n"; - } - - if (!did_seq) + fs::path seq_file = file_path_base / value; + //cout << "seq_file_name " << seq_files.back() << endl; + fasta_index = 1; + annot_file = ""; + sub_seq_start = 0; + sub_seq_end = 0; + seq_params = true; + + while (para_file && seq_params) { - getline(para_file,file_data_line); + multiplatform_getline(para_file,file_data_line); split_index = file_data_line.find(" "); param = file_data_line.substr(0,split_index); value = file_data_line.substr(split_index+1); - did_seq = false; + + if (param == "FASTA_INDEX") + fasta_index = atoi(value.c_str()); + else if (param == "ANNOTATION") + annot_file = file_path_base / value; + else if (param == "SEQ_START") + sub_seq_start = atoi(value.c_str()); + else if (param == "SEQ_END") + { + sub_seq_end = atoi(value.c_str()); + } + //ignore empty lines or that start with '#' + else if ((param == "") || (param == "#")) { + // pass + } else { + seq_params = false; + } } + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); + did_seq = true; + } + //ignore empty lines or that start with '#' + else if ((param == "") || (param == "#")) {} + else + { + clog << "Illegal/misplaced mussa parameter in file\n"; + clog << param << "\n"; } - para_file.close(); - - soft_thres = threshold; - //cout << "nway mupa: analysis_name = " << analysis_name - // << " window = " << window - // << " threshold = " << threshold << endl; + if (!did_seq) + { + multiplatform_getline(para_file,file_data_line); + split_index = file_data_line.find(" "); + param = file_data_line.substr(0,split_index); + value = file_data_line.substr(split_index+1); + did_seq = false; + } } + + soft_thres = threshold; // no file was loaded, signal error - else - { - throw mussa_load_error("Config File: " + para_file_path + " not found"); - } + set_dirty(true); } void -Mussa::analyze(int w, int t, enum Mussa::analysis_modes the_ana_mode, double new_ent_thres) +Mussa::analyze() { - time_t t1, t2, begin, end; - double seqloadtime, seqcomptime, nwaytime, savetime, totaltime; - - begin = time(NULL); - - ana_mode = the_ana_mode; - ent_thres = new_ent_thres; - if (w > 0) - window = w; - if (t > 0) - { - threshold = t; - soft_thres = t; - } - - t1 = time(NULL); - load_sequence_data(); - if (the_seqs.size() < 2) { throw mussa_analysis_error("you need to have at least 2 sequences to " "do an analysis."); } - //cout << "nway ana: seq_num = " << the_seqs.size() << endl; - - t2 = time(NULL); - seqloadtime = difftime(t2, t1); - - t1 = time(NULL); + seqcomp(); - t2 = time(NULL); - seqcomptime = difftime(t2, t1); - - - t1 = time(NULL); the_paths.setup(window, threshold); nway(); - t2 = time(NULL); - nwaytime = difftime(t2, t1); - - t1 = time(NULL); - save(); - t2 = time(NULL); - savetime = difftime(t2, t1); - - end = time(NULL); - totaltime = difftime(end, begin); - - - //cout << "seqload\tseqcomp\tnway\tsave\ttotal\n"; - //cout << seqloadtime << "\t"; - //cout << seqcomptime << "\t"; - //cout << nwaytime << "\t"; - //cout << savetime << "\t"; - //cout << totaltime << "\n"; } - -void -Mussa::load_sequence_data() -{ - list::iterator seq_files_i, annot_files_i; - list::iterator fasta_indices_i, seq_starts_i, seq_ends_i; - Sequence aSeq; - string err_msg; - - - seq_files_i = seq_files.begin(); - fasta_indices_i = fasta_indices.begin(); - annot_files_i = annot_files.begin(); - seq_starts_i = sub_seq_starts.begin(); - seq_ends_i = sub_seq_ends.begin(); - - while ( (seq_files_i != seq_files.end()) && (err_msg == "") ) - /* it should be guarenteed that each of the following exist - should I bother checking, and how to deal with if not true... - && - (fasta_indices_i != fasta_indices.end()) && - (annot_files_i != annot_files.end()) && - (seq_starts_i != sub_seq_starts.end()) && - (seq_ends_i != sub_seq_ends.end()) ) - */ - { - aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i, *seq_ends_i); - if (annot_files_i->size() > 0) - aSeq.load_annot(*annot_files_i, *seq_starts_i, *seq_ends_i); - - the_seqs.push_back(aSeq); - //cout << aSeq.get_header() << endl; - //cout << aSeq.get_seq() << endl; - aSeq.clear(); - ++seq_files_i; // advance all the iterators - ++fasta_indices_i; - ++annot_files_i; - ++seq_starts_i; - ++seq_ends_i; - } -} - - void Mussa::seqcomp() { @@ -409,25 +505,30 @@ Mussa::seqcomp() for(vector::size_type i2 = 0; i2 < the_seqs.size(); i2++) all_comps[i].push_back(dummy_comp); } - for(vector::size_type i = 0; i < the_seqs.size(); i++) - seq_lens.push_back(the_seqs[i].size()); + for(vector::size_type i = 0; i < the_seqs.size(); i++) { + seq_lens.push_back(the_seqs[i]->size()); + } + int seqcomps_done = 0; + int seqcomps_todo = (the_seqs.size() * (the_seqs.size()-1)) / 2; + emit progress("seqcomp", seqcomps_done, seqcomps_todo); for(vector::size_type i = 0; i < the_seqs.size(); i++) for(vector::size_type i2 = i+1; i2 < the_seqs.size(); i2++) { //cout << "seqcomping: " << i << " v. " << i2 << endl; all_comps[i][i2].setup(window, threshold); - all_comps[i][i2].seqcomp(the_seqs[i].get_seq(), the_seqs[i2].get_seq(), false); - all_comps[i][i2].seqcomp(the_seqs[i].get_seq(),the_seqs[i2].rev_comp(),true); + all_comps[i][i2].seqcomp(*the_seqs[i], *the_seqs[i2], false); + all_comps[i][i2].seqcomp(*the_seqs[i], the_seqs[i2]->rev_comp(),true); + ++seqcomps_done; + emit progress("seqcomp", seqcomps_done, seqcomps_todo); } } void Mussa::nway() { - vector some_Seqs; - the_paths.set_soft_thres(soft_thres); + the_paths.set_soft_threshold(soft_thres); if (ana_mode == TransitiveNway) { the_paths.trans_path_search(all_comps); @@ -437,11 +538,12 @@ Mussa::nway() } else if (ana_mode == EntropyNway) { + vector some_Seqs; //unlike other methods, entropy needs to look at the sequence at this stage some_Seqs.clear(); for(vector::size_type i = 0; i < the_seqs.size(); i++) { - some_Seqs.push_back(the_seqs[i].get_seq()); + some_Seqs.push_back(*the_seqs[i]); } the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup @@ -456,138 +558,169 @@ Mussa::nway() } void -Mussa::save() +Mussa::save(fs::path save_path) { - string save_name, save_path, create_dir_cmd, flp_filepath; - fstream save_file; + fs::fstream save_file; ostringstream append_info; int dir_create_status; + if (save_path.empty()) { + if (not analysis_path.empty()) { + save_path = analysis_path; + } else if (not analysis_name.empty()) { + std::string save_name = analysis_name; + // gotta do bit with adding win & thres if to be appended + if (win_append) { + append_info.str(""); + append_info << "_w" << window; + save_name += append_info.str(); + } - // not sure why, but gotta close file each time since can't pass file streams - - save_name = analysis_name; - - // gotta do bit with adding win & thres if to be appended - if (win_append) - { - append_info.str(""); - append_info << "_w" << window; - save_name += append_info.str(); + if (thres_append) { + append_info.str(""); + append_info << "_t" << threshold; + save_name += append_info.str(); + } + save_path = save_name; + } else { + throw mussa_save_error("Need filename or analysis name to save"); + } } - if (thres_append) - { - append_info.str(""); - append_info << "_t" << threshold; - save_name += append_info.str(); + if (not fs::exists(save_path)) { + fs::create_directory(save_path); } - -//#include - // ******* use appropriate for os ------- 1 of 4 - // the additions for osX make it more sane where it saves the analysis - // will come up with a cleaner sol'n later... - create_dir_cmd = "mkdir " + save_name; //linux - //create_dir_cmd = "mkdir " + file_path_base + save_name; //osX - - dir_create_status = system( (const char*) create_dir_cmd.c_str()); - //cout << "action: " << dir_create_status << endl; - + + std::string basename = save_path.leaf(); + fs::path museq(basename + ".museq", fs::native); + // save sequence and annots to a special mussa file - - // ******** use appropriate for OS ---------- 2 of 4 - save_path = save_name + "/" + save_name + ".museq"; //linux - //save_path = file_path_base + save_name + "/" + save_name + ".museq"; //osX - - save_file.open(save_path.c_str(), ios::out); + save_file.open(save_path / museq, ios::out); save_file << "" << endl; - //save_file.close(); for(vector::size_type i = 0; i < the_seqs.size(); i++) { - the_seqs[i].save(save_file); + the_seqs[i]->save(save_file); } - //save_file.open(save_path.c_str(), ios::app); save_file << "" << endl; save_file.close(); - // save nway paths to its mussa save file + // if we have any motifs, save them. + if (motif_sequences.size()) { + fs::path mtl(basename + ".mtl", fs::native); + save_motifs(save_path / mtl); + } - // ******** use appropriate for OS -------- 3 of 4 - save_path = save_name + "/" + save_name + ".muway"; //linux - //save_path = file_path_base + save_name + "/" + save_name + ".muway"; //os X - the_paths.save(save_path); + // save nway paths to its mussa save file + fs::path muway(basename + ".muway", fs::native); + the_paths.save(save_path / muway); - for(vector::size_type i = 0; i < the_seqs.size(); i++) + for(vector::size_type i = 0; i < the_seqs.size(); i++) { for(vector::size_type i2 = i+1; i2 < the_seqs.size(); i2++) { append_info.str(""); append_info << "_sp_" << i << "v" << i2; - // ******** use appropriate for OS --------- 4 of 4 - //linux - save_path = save_name + "/" + save_name + append_info.str() + ".flp"; - //osX - //save_path = file_path_base + save_name + "/" + save_name + append_info.str() + ".flp"; - all_comps[i][i2].save(save_path); + fs::path flp(basename+append_info.str()+".flp", fs::native); + all_comps[i][i2].save(save_path / flp); } + } + + set_dirty(false); + analysis_path = save_path; } void -Mussa::save_muway(string save_path) +Mussa::save_muway(fs::path save_path) { the_paths.save(save_path); } void -Mussa::load(string ana_file) +Mussa::load(fs::path ana_file) { int i, i2; - string::size_type start_index, end_index; - string file_path_base, a_file_path, ana_path; + fs::path file_path_base; + fs::path a_file_path; + fs::path ana_path(ana_file); bool parsing_path; - Sequence tmp_seq; string err_msg; ostringstream append_info; vector empty_FLP_vector; FLPs dummy_comp; - //cout << "ana_file name " << ana_file << endl; - ana_path = ana_file; - parsing_path = true; - end_index = ana_path.size()-1; - if (ana_path[end_index] == '/') { - --end_index; - } - start_index = ana_path.rfind('/', end_index); - if (start_index == string::npos) { - // no / to be found - start_index = 0; - } else { - // skip the / we found - ++start_index; - } - analysis_name = ana_path.substr(start_index, end_index-start_index+1); - //cout << " ana_name " << analysis_name << endl; - file_path_base = ana_path.substr(0, start_index) + analysis_name - + "/" + analysis_name; - a_file_path = file_path_base + ".muway"; - //cout << " loading museq: " << a_file_path << endl; + + //-------------------------------------------------------- + // Load Muway + //-------------------------------------------------------- + analysis_path = ana_file; + analysis_name = ana_path.leaf(); + fs::path muway(analysis_name+".muway", fs::native); + a_file_path = analysis_path / muway; the_paths.load(a_file_path); + // perhaps this could be more elegent, but at least this'll let + // us know what our threshold and window sizes were when we load a muway + window = the_paths.get_window(); + threshold = the_paths.get_threshold(); + soft_thres = threshold; + - int seq_num = the_paths.sequence_count(); + //-------------------------------------------------------- + // Load Sequence + //-------------------------------------------------------- + //int seq_num = the_paths.sequence_count(); - a_file_path = file_path_base + ".museq"; + fs::path museq(analysis_name + ".museq", fs::native); + a_file_path = analysis_path / museq; // this is a bit of a hack due to C++ not acting like it should with files - for (i = 1; i <= seq_num; i++) + /*for (i = 1; i <= seq_num; i++) { - tmp_seq.clear(); - //cout << "mussa_class: loading museq frag... " << a_file_path << endl; - tmp_seq.load_museq(a_file_path, i); + boost::shared_ptr tmp_seq(new Sequence); + tmp_seq->load_museq(a_file_path, i); the_seqs.push_back(tmp_seq); + }*/ + + i = 1; + //int seq_num = 0; + boost::filesystem::fstream load_museq_fs; + load_museq_fs.open(a_file_path, std::ios::in); + boost::shared_ptr tmp_seq; + while (1) + { + tmp_seq = Sequence::load_museq(load_museq_fs); + + if (tmp_seq) + { + the_seqs.push_back(tmp_seq); + } + else + { + break; + } + + + //safe guard in case of an infinate loop. + //FIXME: If mussa can handle a comparison of 10000 sequences + // in the future, then this code should be fixed. + if (i == 10000) + { + throw mussa_load_error(" Run away sequence load!"); + } + i++; + } + load_museq_fs.close(); + + //-------------------------------------------------------- + // Load Motifs + //-------------------------------------------------------- + fs::path mtl(analysis_name + ".mtl", fs::native); + fs::path motif_file = analysis_path / mtl; + if (fs::exists(motif_file)) { + load_motifs(motif_file); } + vector::size_type seq_num = the_seqs.size(); empty_FLP_vector.clear(); for(i = 0; i < seq_num; i++) { @@ -596,30 +729,30 @@ Mussa::load(string ana_file) all_comps[i].push_back(dummy_comp); } + for(i = 0; i < seq_num; i++) { for(i2 = i+1; i2 < seq_num; i2++) { append_info.str(""); - append_info << "_sp_" << i << "v" << i2; - //cout << append_info.str() << endl; - a_file_path = file_path_base + append_info.str() + ".flp"; + append_info << analysis_name << "_sp_" << i << "v" << i2 << ".flp"; + //clog << append_info.str() << endl; + fs::path flp(append_info.str(), fs::native); + a_file_path = analysis_path / flp; all_comps[i][i2].load(a_file_path); - //cout << "real size = " << all_comps[i][i2].size() << endl; } } } - void Mussa::save_old() { - fstream save_file; + fs::fstream save_file; - save_file.open(analysis_name.c_str(), ios::out); + save_file.open(analysis_name, ios::out); for(vector::size_type i = 0; i < the_seqs.size(); i++) - save_file << the_seqs[i].get_seq() << endl; + save_file << *(the_seqs[i]) << endl; save_file << window << endl; save_file.close(); @@ -649,7 +782,7 @@ Mussa::load_old(char * load_file_path, int s_num) for(i = 0; i < seq_num; i++) { getline(save_file, file_data_line); - a_seq.set_seq(file_data_line); + boost::shared_ptr a_seq(new Sequence(file_data_line)); the_seqs.push_back(a_seq); } @@ -682,3 +815,102 @@ Mussa::load_old(char * load_file_path, int s_num) //the_paths.save("tmp.save"); } + +void Mussa::add_motif(const Sequence& motif, const Color& color) +{ + motif_sequences.insert(motif); + color_mapper->appendInstanceColor("motif", motif.get_sequence(), color); + set_dirty(true); +} + +void Mussa::set_motifs(const vector& motifs, + const vector& colors) +{ + if (motifs.size() != colors.size()) { + throw mussa_error("motif and color vectors must be the same size"); + } + + motif_sequences.clear(); + for(size_t i = 0; i != motifs.size(); ++i) + { + add_motif(motifs[i], colors[i]); + } + update_sequences_motifs(); +} + +void Mussa::load_motifs(fs::path filename) +{ + fs::ifstream f; + f.open(filename, ifstream::in); + load_motifs(f); +} + +void Mussa::load_motifs(std::istream &in) +{ + std::string data; + const char *alphabet = Alphabet::dna_cstr; + motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper); + + // slurp our data into a string + std::streamsize bytes_read = 1; + while (in.good() and bytes_read) { + const std::streamsize bufsiz=512; + char buf[bufsiz]; + bytes_read = in.readsome(buf, bufsiz); + data.append(buf, buf+bytes_read); + } + parsed_motifs.parse(data); + update_sequences_motifs(); +} + +void Mussa::save_motifs(fs::path filename) +{ + fs::ofstream out_stream; + out_stream.open(filename, ofstream::out); + save_motifs(out_stream); +} + +void Mussa::save_motifs(std::ostream& out) +{ + for(motif_set::iterator motif_i = motif_sequences.begin(); + motif_i != motif_sequences.end(); + ++motif_i) + { + out << motif_i->get_sequence() << " "; + if (motif_i->get_name().size() > 0) { + out << "\"" << motif_i->get_name() << "\" "; + } + out << color_mapper->lookup("motif", motif_i->get_sequence()); + out << std::endl; + } +} + +void Mussa::update_sequences_motifs() +{ + // once we've loaded all the motifs from the file, + // lets attach them to the sequences + for(vector::iterator seq_i = the_seqs.begin(); + seq_i != the_seqs.end(); + ++seq_i) + { + // clear out old motifs + (*seq_i)->clear_motifs(); + // for all the motifs in our set, attach them to the current sequence + for(set::iterator motif_i = motif_sequences.begin(); + motif_i != motif_sequences.end(); + ++motif_i) + { + (*seq_i)->add_motif(*motif_i); + } + } +} + +const set& Mussa::motifs() const +{ + return motif_sequences; +} + +boost::shared_ptr Mussa::colorMapper() +{ + return color_mapper; +}