From 02adcab9918657891638e68580b8b88e6d935875 Mon Sep 17 00:00:00 2001 From: Brandon King Date: Wed, 18 Apr 2007 00:01:36 +0000 Subject: [PATCH] Load N sequences in Mussa::load Fix for ticket:240. Mussa would not load the sequences because the .muway file would declare that 0 sequences existed. Since the Mussa::load code needed to know the number of sequences to load the sequences, it failed. The new version of the load sequence code reads the sequences in until there are no longer any sequences to read. At that point, the number of sequences is the_seqs.size() and therefore is no longer dependent on the muway file. --- alg/mussa.cpp | 48 ++++++++++++++++++- alg/sequence.cpp | 117 ++++++++++++++++++++++++++++++++++++++++++----- alg/sequence.hpp | 3 +- 3 files changed, 154 insertions(+), 14 deletions(-) diff --git a/alg/mussa.cpp b/alg/mussa.cpp index f75290c..82b1d4a 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -639,6 +639,10 @@ Mussa::load(fs::path ana_file) vector empty_FLP_vector; FLPs dummy_comp; + + //-------------------------------------------------------- + // Load Muway + //-------------------------------------------------------- analysis_path = ana_file; analysis_name = ana_path.leaf(); fs::path muway(analysis_name+".muway", fs::native); @@ -650,24 +654,63 @@ Mussa::load(fs::path ana_file) threshold = the_paths.get_threshold(); soft_thres = threshold; - int seq_num = the_paths.sequence_count(); + + //-------------------------------------------------------- + // Load Sequence + //-------------------------------------------------------- + //int seq_num = the_paths.sequence_count(); fs::path museq(analysis_name + ".museq", fs::native); a_file_path = analysis_path / museq; // this is a bit of a hack due to C++ not acting like it should with files - for (i = 1; i <= seq_num; i++) + /*for (i = 1; i <= seq_num; i++) { boost::shared_ptr tmp_seq(new Sequence); tmp_seq->load_museq(a_file_path, i); the_seqs.push_back(tmp_seq); + }*/ + + i = 1; + //int seq_num = 
0; + boost::filesystem::fstream load_museq_fs; + load_museq_fs.open(a_file_path, std::ios::in); + boost::shared_ptr tmp_seq; + while (1) + { + tmp_seq = Sequence::load_museq(load_museq_fs); + + if (tmp_seq) + { + the_seqs.push_back(tmp_seq); + } + else + { + break; + } + + + //safe guard in case of an infinate loop. + //FIXME: If mussa can handle a comparison of 10000 sequences + // in the future, then this code should be fixed. + if (i == 10000) + { + throw mussa_load_error(" Run away sequence load!"); + } + i++; } + load_museq_fs.close(); + //-------------------------------------------------------- + // Load Motifs + //-------------------------------------------------------- fs::path mtl(analysis_name + ".mtl", fs::native); fs::path motif_file = analysis_path / mtl; if (fs::exists(motif_file)) { load_motifs(motif_file); } + + vector::size_type seq_num = the_seqs.size(); empty_FLP_vector.clear(); for(i = 0; i < seq_num; i++) { @@ -676,6 +719,7 @@ Mussa::load(fs::path ana_file) all_comps[i].push_back(dummy_comp); } + for(i = 0; i < seq_num; i++) { for(i2 = i+1; i2 < seq_num; i2++) diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 05ec0a9..2e845c6 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -710,10 +710,97 @@ Sequence::save(fs::fstream &save_file) //save_file.close(); } -void -Sequence::load_museq(fs::path load_file_path, int seq_num) +//void +//Sequence::load_museq(fs::path load_file_path, int seq_num) +//{ +// fs::fstream load_file; +// std::string file_data_line; +// int seq_counter; +// //annot an_annot; +// int annot_begin; +// int annot_end; +// std::string annot_name; +// std::string annot_type; +// +// std::string::size_type space_split_i; +// std::string annot_value; +// +// annotation_list.reset(new SeqSpanRefList); +// +// load_file.open(load_file_path, std::ios::in); +// +// seq_counter = 0; +// // search for the seq_num-th sequence +// while ( (!load_file.eof()) && (seq_counter < seq_num) ) +// { +// getline(load_file,file_data_line); 
+// if (file_data_line == "") +// seq_counter++; +// } +// getline(load_file, file_data_line); +// // looks like the sequence is written as a single line +// set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand); +// getline(load_file, file_data_line); +// getline(load_file, file_data_line); +// if (file_data_line == "") +// { +// getline(load_file, file_data_line); +// species = file_data_line; +// while ( (!load_file.eof()) && (file_data_line != "") ) +// { +// getline(load_file,file_data_line); +// if ((file_data_line != "") && (file_data_line != "")) +// { +// // need to get 4 values...almost same code 4 times... +// // get annot start index +// space_split_i = file_data_line.find(" "); +// annot_value = file_data_line.substr(0,space_split_i); +// annot_begin = atoi (annot_value.c_str()); +// file_data_line = file_data_line.substr(space_split_i+1); +// // get annot end index +// space_split_i = file_data_line.find(" "); +// annot_value = file_data_line.substr(0,space_split_i); +// annot_end = atoi (annot_value.c_str()); +// +// if (space_split_i == std::string::npos) // no entry for type or name +// { +// std::cout << "seq, annots - no type or name\n"; +// annot_name = ""; +// annot_type = ""; +// } +// else // else get annot type +// { +// file_data_line = file_data_line.substr(space_split_i+1); +// space_split_i = file_data_line.find(" "); +// annot_value = file_data_line.substr(0,space_split_i); +// //an_annot.type = annot_value; +// annot_type = annot_value; +// if (space_split_i == std::string::npos) // no entry for name +// { +// std::cout << "seq, annots - no name\n"; +// annot_name = ""; +// } +// else // get annot name +// { +// file_data_line = file_data_line.substr(space_split_i+1); +// space_split_i = file_data_line.find(" "); +// annot_value = file_data_line.substr(0,space_split_i); +// // this seems like its wrong? 
+// annot_type = annot_value; +// } +// } +// add_annotation(annot_name, annot_type, annot_begin, annot_end); +// } +// //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end +// // << "-->" << an_annot.type << "::" << an_annot.name << std::endl; +// } +// } +// load_file.close(); +//} + +SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file) { - fs::fstream load_file; + boost::shared_ptr seq(new Sequence); std::string file_data_line; int seq_counter; //annot an_annot; @@ -725,27 +812,34 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) std::string::size_type space_split_i; std::string annot_value; - annotation_list.reset(new SeqSpanRefList); - - load_file.open(load_file_path, std::ios::in); + //seq->annotation_list.reset(new SeqSpanRefList); seq_counter = 0; - // search for the seq_num-th sequence + // search for the next sequence + int seq_num = 1; while ( (!load_file.eof()) && (seq_counter < seq_num) ) { getline(load_file,file_data_line); if (file_data_line == "") seq_counter++; } + + // Could not find next sequence + if (load_file.eof()) + { + seq.reset(); + return seq; + } + getline(load_file, file_data_line); // looks like the sequence is written as a single line - set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand); + seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand); getline(load_file, file_data_line); getline(load_file, file_data_line); if (file_data_line == "") { getline(load_file, file_data_line); - species = file_data_line; + seq->set_species(file_data_line); while ( (!load_file.eof()) && (file_data_line != "") ) { getline(load_file,file_data_line); @@ -789,13 +883,14 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) annot_type = annot_value; } } - add_annotation(annot_name, annot_type, annot_begin, annot_end); + seq->add_annotation(annot_name, annot_type, annot_begin, annot_end); 
} //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end // << "-->" << an_annot.type << "::" << an_annot.name << std::endl; } } - load_file.close(); + //load_file.close(); + return seq; } diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 1714941..1a4c9eb 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -231,7 +231,8 @@ public: SeqSpanRef seqspan() { return seq; } void save(boost::filesystem::fstream &save_file); - void load_museq(boost::filesystem::path load_file_path, int seq_num); + //void load_museq(boost::filesystem::path load_file_path, int seq_num); + static SequenceRef load_museq(boost::filesystem::fstream& load_file); protected: SeqSpanRef seq; -- 2.30.2