1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- mussa_class.cc -----------
13 // ----------------------------------------
15 #include <boost/filesystem/path.hpp>
16 #include <boost/filesystem/operations.hpp>
17 #include <boost/filesystem/fstream.hpp>
18 namespace fs = boost::filesystem;
23 #include "mussa_exceptions.hpp"
24 #include "alg/flp.hpp"
25 #include "alg/mussa.hpp"
26 #include "alg/motif_parser.hpp"
32 : color_mapper(new AnnotationColors)
35 connect(&the_paths, SIGNAL(progress(const std::string&, int, int)),
36 this, SIGNAL(progress(const std::string&, int, int)));
39 Mussa::Mussa(const Mussa& m)
40 : analysis_name(m.analysis_name),
42 threshold(m.threshold),
43 soft_thres(m.soft_thres),
45 win_append(m.win_append),
46 thres_append(m.thres_append),
47 motif_sequences(m.motif_sequences),
48 color_mapper(m.color_mapper),
49 analysis_path(m.analysis_path),
52 connect(&the_paths, SIGNAL(progress(const std::string&, int, int)),
53 this, SIGNAL(progress(const std::string&, int, int)));
56 boost::filesystem::path Mussa::get_analysis_path() const
61 void Mussa::set_analysis_path(boost::filesystem::path pathname)
63 analysis_path = pathname;
66 // set all parameters to null state
71 ana_mode = TransitiveNway;
77 motif_sequences.clear();
78 if(color_mapper) color_mapper->clear();
81 analysis_path = fs::path();
85 bool Mussa::is_dirty() const
90 bool Mussa::empty() const
92 return the_seqs.empty();
96 // these 5 simple methods manually set the parameters for doing an analysis
97 // used so that the gui can take input from user and setup the analysis
98 // note - still need a set_append(bool, bool) method...
100 Mussa::set_name(string a_name)
102 analysis_name = a_name;
106 string Mussa::get_name() const
108 return analysis_name;
111 string Mussa::get_title() const
113 fs::path analysis_path = get_analysis_path();
114 if (not analysis_path.empty()) {
115 return analysis_path.native_file_string();
116 } else if (get_name().size() > 0) {
119 return std::string("Unnamed");
126 if (the_seqs.size() > 0)
127 return the_seqs.size();
133 Mussa::set_window(int a_window)
139 int Mussa::get_window() const
145 Mussa::set_threshold(int a_threshold)
147 threshold = a_threshold;
149 if (a_threshold > soft_thres) {
150 soft_thres = a_threshold;
154 int Mussa::get_threshold() const
160 Mussa::set_soft_threshold(int new_threshold)
162 if (new_threshold < threshold) {
163 soft_thres = threshold;
164 } else if (new_threshold > window) {
167 soft_thres = new_threshold;
171 int Mussa::get_soft_threshold() const
177 Mussa::set_analysis_mode(enum analysis_modes new_ana_mode)
179 ana_mode = new_ana_mode;
183 enum Mussa::analysis_modes Mussa::get_analysis_mode() const
188 string Mussa::get_analysis_mode_name() const
193 return string("Transitive");
196 return string("Radial");
199 return string("Entropy");
202 return string("[deprecated] Recursive");
205 throw runtime_error("invalid analysis mode type");
210 const NwayPaths& Mussa::paths() const
215 //template <class IteratorT>
216 //void Mussa::createLocalAlignment(IteratorT begin, IteratorT end)
217 void Mussa::createLocalAlignment(std::list<ConservedPath>::iterator begin,
218 std::list<ConservedPath>::iterator end,
219 std::list<ConservedPath::path_type>& result,
220 std::list<std::vector<bool> >& reversed)
222 const vector_sequence_type& raw_seq = the_seqs;
223 ConservedPath::path_type aligned_path;
226 int window_length, win_i;
229 vector<bool> rc_list;
231 vector<bool> matched;
235 for(std::list<ConservedPath>::iterator pathz_i=begin; pathz_i != end; ++pathz_i)
237 ConservedPath& a_path = *pathz_i;
238 window_length = a_path.window_size;
239 // determine which parts of the path are RC relative to first species
240 rc_list = a_path.reverseComplimented();
242 // loop over each bp in the conserved region for all sequences
243 for(win_i = 0; win_i < window_length; win_i++)
245 aligned_path.clear();
246 // determine which exact base pairs match between the sequences
248 for(i2 = 0; i2 < a_path.size()-1; i2++)
250 // assume not rc as most likely, adjust below
253 // no matter the case, any RC node needs adjustments
255 rc_1 = window_length-1;
256 if (a_path[i2+1] < 0)
257 rc_2 = window_length-1;
259 x_start = (abs(a_path[i2]-rc_1+win_i));
260 x_end = (abs(a_path[i2+1]-rc_2+win_i));
262 boost::shared_ptr<Sequence> cur(raw_seq[i2]) ;
263 boost::shared_ptr<Sequence> next(raw_seq[i2+1]);
265 // ugh, and xor...only want rc coloring if just one of the nodes is rc
266 // if both nodes are rc, then they are 'normal' relative to each other
267 if((rc_list[i2] || rc_list[i2+1] )&&!(rc_list[i2] && rc_list[i2+1]))
268 { //the hideous rc matching logic - not complex, but annoying
269 if(!(( ((*cur)[x_start]=='A')&&((*next)[x_end]=='T')) ||
270 (((*cur)[x_start]=='T')&&((*next)[x_end]=='A')) ||
271 (((*cur)[x_start]=='G')&&((*next)[x_end]=='C')) ||
272 (((*cur)[x_start]=='C')&&((*next)[x_end]=='G'))) )
276 aligned_path.push_back(x_start);
281 if (!( ((*cur)[x_start] == (*next)[x_end]) &&
282 ((*cur)[x_start] != 'N') && ((*next)[x_end] != 'N') ) ) {
285 aligned_path.push_back(x_start);
289 // grab the last part of our path, assuming we matched
291 aligned_path.push_back(x_end);
293 if (aligned_path.size() == a_path.size()) {
294 result.push_back(aligned_path);
295 reversed.push_back(rc_list);
303 void Mussa::append_sequence(const Sequence& a_seq)
305 boost::shared_ptr<Sequence> seq_copy(new Sequence(a_seq));
306 the_seqs.push_back(seq_copy);
310 void Mussa::append_sequence(boost::shared_ptr<Sequence> a_seq)
312 the_seqs.push_back(a_seq);
317 const vector<boost::shared_ptr<Sequence> >&
318 Mussa::sequences() const
323 void Mussa::load_sequence(fs::path seq_file, fs::path annot_file,
324 int fasta_index, int sub_seq_start, int sub_seq_end,
327 boost::shared_ptr<Sequence> aseq(new Sequence);
328 aseq->load_fasta(seq_file, fasta_index, sub_seq_start, sub_seq_end);
329 if ( not annot_file.empty() ) {
330 aseq->load_annot(annot_file, sub_seq_start, sub_seq_end);
332 if (name != 0 and name->size() > 0 ) {
333 aseq->set_species(*name);
335 the_seqs.push_back(aseq);
340 Mussa::load_mupa_file(fs::path para_file_path)
342 fs::ifstream para_file;
343 string file_data_line;
346 int split_index, fasta_index;
347 int sub_seq_start, sub_seq_end;
348 bool seq_params, did_seq;
351 string::size_type new_index, dir_index;
356 // if file was opened, read the parameter values
357 if (not fs::exists(para_file_path))
359 throw mussa_load_error("Config File: " + para_file_path.string() + " not found");
360 } else if (fs::is_directory(para_file_path)) {
361 throw mussa_load_error("Config File: " + para_file_path.string() + " is a directory.");
362 } else if (fs::is_empty(para_file_path)) {
363 throw mussa_load_error("Config File: " + para_file_path.string() + " is empty");
365 para_file.open(para_file_path, ios::in);
367 // what directory is the mupa file in?
368 fs::path file_path_base = para_file_path.branch_path();
370 // setup loop by getting file's first line
371 getline(para_file,file_data_line);
372 split_index = file_data_line.find(" ");
373 param = file_data_line.substr(0,split_index);
374 value = file_data_line.substr(split_index+1);
379 if (param == "ANA_NAME")
380 analysis_name = value;
381 else if (param == "APPEND_WIN")
383 else if (param == "APPEND_THRES")
385 else if (param == "SEQUENCE_NUM")
386 ; // ignore sequence_num now
387 else if (param == "WINDOW")
388 window = atoi(value.c_str());
389 else if (param == "THRESHOLD")
390 threshold = atoi(value.c_str());
391 else if (param == "SEQUENCE")
393 fs::path seq_file = file_path_base / value;
394 //cout << "seq_file_name " << seq_files.back() << endl;
401 while (para_file && seq_params)
403 getline(para_file,file_data_line);
404 split_index = file_data_line.find(" ");
405 param = file_data_line.substr(0,split_index);
406 value = file_data_line.substr(split_index+1);
408 if (param == "FASTA_INDEX")
409 fasta_index = atoi(value.c_str());
410 else if (param == "ANNOTATION")
411 annot_file = file_path_base / value;
412 else if (param == "SEQ_START")
413 sub_seq_start = atoi(value.c_str());
414 else if (param == "SEQ_END")
416 sub_seq_end = atoi(value.c_str());
418 //ignore empty lines or that start with '#'
419 else if ((param == "") || (param == "#")) {}
420 else seq_params = false;
422 load_sequence(seq_file, annot_file, fasta_index, sub_seq_start,
426 //ignore empty lines or that start with '#'
427 else if ((param == "") || (param == "#")) {}
430 clog << "Illegal/misplaced mussa parameter in file\n";
431 clog << param << "\n";
436 getline(para_file,file_data_line);
437 split_index = file_data_line.find(" ");
438 param = file_data_line.substr(0,split_index);
439 value = file_data_line.substr(split_index+1);
446 soft_thres = threshold;
447 //cout << "nway mupa: analysis_name = " << analysis_name
448 // << " window = " << window
449 // << " threshold = " << threshold << endl;
451 // no file was loaded, signal error
459 if (the_seqs.size() < 2) {
460 throw mussa_analysis_error("you need to have at least 2 sequences to "
465 the_paths.setup(window, threshold);
472 vector<int> seq_lens;
473 vector<FLPs> empty_FLP_vector;
475 string save_file_string;
477 empty_FLP_vector.clear();
478 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
480 all_comps.push_back(empty_FLP_vector);
481 for(vector<Sequence>::size_type i2 = 0; i2 < the_seqs.size(); i2++)
482 all_comps[i].push_back(dummy_comp);
484 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++) {
485 seq_lens.push_back(the_seqs[i]->size());
487 int seqcomps_done = 0;
488 int seqcomps_todo = (the_seqs.size() * (the_seqs.size()-1)) / 2;
489 emit progress("seqcomp", seqcomps_done, seqcomps_todo);
491 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
492 for(vector<Sequence>::size_type i2 = i+1; i2 < the_seqs.size(); i2++)
494 //cout << "seqcomping: " << i << " v. " << i2 << endl;
495 all_comps[i][i2].setup(window, threshold);
496 all_comps[i][i2].seqcomp(*the_seqs[i], *the_seqs[i2], false);
497 all_comps[i][i2].seqcomp(*the_seqs[i], the_seqs[i2]->rev_comp(),true);
499 emit progress("seqcomp", seqcomps_done, seqcomps_todo);
507 the_paths.set_soft_threshold(soft_thres);
509 if (ana_mode == TransitiveNway) {
510 the_paths.trans_path_search(all_comps);
512 else if (ana_mode == RadialNway) {
513 the_paths.radiate_path_search(all_comps);
515 else if (ana_mode == EntropyNway)
517 vector<Sequence> some_Seqs;
518 //unlike other methods, entropy needs to look at the sequence at this stage
520 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
522 some_Seqs.push_back(*the_seqs[i]);
525 the_paths.setup_ent(ent_thres, some_Seqs); // ent analysis extra setup
526 the_paths.entropy_path_search(all_comps);
529 // old recursive transitive analysis function
530 else if (ana_mode == RecursiveNway)
531 the_paths.find_paths_r(all_comps);
533 the_paths.simple_refine();
537 Mussa::save(fs::path save_path)
539 fs::fstream save_file;
540 ostringstream append_info;
541 int dir_create_status;
543 if (save_path.empty()) {
544 if (not analysis_path.empty()) {
545 save_path = analysis_path;
546 } else if (not analysis_name.empty()) {
547 std::string save_name = analysis_name;
548 // gotta do bit with adding win & thres if to be appended
551 append_info << "_w" << window;
552 save_name += append_info.str();
557 append_info << "_t" << threshold;
558 save_name += append_info.str();
560 save_path = save_name;
562 throw mussa_save_error("Need filename or analysis name to save");
566 if (not fs::exists(save_path)) {
567 fs::create_directory(save_path);
570 std::string basename = save_path.leaf();
571 fs::path museq(basename + ".museq", fs::native);
573 // save sequence and annots to a special mussa file
574 save_file.open(save_path / museq, ios::out);
575 save_file << "<Mussa_Sequence>" << endl;
577 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
579 the_seqs[i]->save(save_file);
582 save_file << "</Mussa_Sequence>" << endl;
585 // if we have any motifs, save them.
586 if (motif_sequences.size()) {
587 fs::path mtl(basename + ".mtl", fs::native);
588 save_motifs(save_path / mtl);
591 // save nway paths to its mussa save file
592 fs::path muway(basename + ".muway", fs::native);
593 the_paths.save(save_path / muway);
595 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++) {
596 for(vector<Sequence>::size_type i2 = i+1; i2 < the_seqs.size(); i2++)
599 append_info << "_sp_" << i << "v" << i2;
600 fs::path flp(basename+append_info.str()+".flp", fs::native);
601 all_comps[i][i2].save(save_path / flp);
606 analysis_path = save_path;
610 Mussa::save_muway(fs::path save_path)
612 the_paths.save(save_path);
616 Mussa::load(fs::path ana_file)
619 fs::path file_path_base;
620 fs::path a_file_path;
621 fs::path ana_path(ana_file);
624 ostringstream append_info;
625 vector<FLPs> empty_FLP_vector;
628 analysis_path = ana_file;
629 analysis_name = ana_path.leaf();
630 file_path_base = ana_path.branch_path() / analysis_name;
631 a_file_path = file_path_base / (analysis_name + ".muway");
632 the_paths.load(a_file_path);
633 // perhaps this could be more elegent, but at least this'll let
634 // us know what our threshold and window sizes were when we load a muway
635 window = the_paths.get_window();
636 threshold = the_paths.get_threshold();
637 soft_thres = threshold;
639 int seq_num = the_paths.sequence_count();
641 a_file_path = file_path_base / (analysis_name + ".museq");
643 // this is a bit of a hack due to C++ not acting like it should with files
644 for (i = 1; i <= seq_num; i++)
646 boost::shared_ptr<Sequence> tmp_seq(new Sequence);
647 tmp_seq->load_museq(a_file_path, i);
648 the_seqs.push_back(tmp_seq);
651 fs::path motif_file = file_path_base / (analysis_name + ".mtl");
652 if (fs::exists(motif_file)) {
653 load_motifs(motif_file);
655 empty_FLP_vector.clear();
656 for(i = 0; i < seq_num; i++)
658 all_comps.push_back(empty_FLP_vector);
659 for(i2 = 0; i2 < seq_num; i2++)
660 all_comps[i].push_back(dummy_comp);
663 for(i = 0; i < seq_num; i++)
665 for(i2 = i+1; i2 < seq_num; i2++)
668 append_info << analysis_name << "_sp_" << i << "v" << i2 << ".flp";
669 //clog << append_info.str() << endl;
670 a_file_path = file_path_base / append_info.str();
671 //clog << "path " << a_file_path.string() << endl;
672 all_comps[i][i2].load(a_file_path);
673 //clog << "real size = " << all_comps[i][i2].size() << endl;
682 fs::fstream save_file;
684 save_file.open(analysis_name, ios::out);
686 for(vector<Sequence>::size_type i = 0; i < the_seqs.size(); i++)
687 save_file << *(the_seqs[i]) << endl;
689 save_file << window << endl;
691 //note more complex eventually since analysis_name may need to have
692 //window size, threshold and other stuff to modify it...
693 the_paths.save_old(analysis_name);
698 Mussa::load_old(char * load_file_path, int s_num)
701 string file_data_line;
702 int i, space_split_i, comma_split_i;
703 vector<int> loaded_path;
704 string node_pair, node;
708 the_paths.setup(0, 0);
709 save_file.open(load_file_path, ios::in);
711 // currently loads old mussa format
714 for(i = 0; i < seq_num; i++)
716 getline(save_file, file_data_line);
717 boost::shared_ptr<Sequence> a_seq(new Sequence(file_data_line));
718 the_seqs.push_back(a_seq);
722 getline(save_file, file_data_line);
723 window = atoi(file_data_line.c_str());
726 while (!save_file.eof())
729 getline(save_file, file_data_line);
730 if (file_data_line != "")
731 for(i = 0; i < seq_num; i++)
733 space_split_i = file_data_line.find(" ");
734 node_pair = file_data_line.substr(0,space_split_i);
735 //cout << "np= " << node_pair;
736 comma_split_i = node_pair.find(",");
737 node = node_pair.substr(comma_split_i+1);
738 //cout << "n= " << node << " ";
739 loaded_path.push_back(atoi (node.c_str()));
740 file_data_line = file_data_line.substr(space_split_i+1);
743 // FIXME: do we have any information about what the threshold should be?
744 the_paths.add_path(0, loaded_path);
748 //the_paths.save("tmp.save");
751 void Mussa::add_motif(const Sequence& motif, const Color& color)
753 motif_sequences.insert(motif);
754 color_mapper->appendInstanceColor("motif", motif.get_sequence(), color);
758 void Mussa::set_motifs(const vector<Sequence>& motifs,
759 const vector<Color>& colors)
761 if (motifs.size() != colors.size()) {
762 throw mussa_error("motif and color vectors must be the same size");
765 motif_sequences.clear();
766 for(size_t i = 0; i != motifs.size(); ++i)
768 add_motif(motifs[i], colors[i]);
770 update_sequences_motifs();
773 void Mussa::load_motifs(fs::path filename)
776 f.open(filename, ifstream::in);
780 void Mussa::load_motifs(std::istream &in)
783 const char *alphabet = Alphabet::nucleic_cstr;
784 motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper);
786 // slurp our data into a string
787 std::streamsize bytes_read = 1;
788 while (in.good() and bytes_read) {
789 const std::streamsize bufsiz=512;
791 bytes_read = in.readsome(buf, bufsiz);
792 data.append(buf, buf+bytes_read);
794 parsed_motifs.parse(data);
795 update_sequences_motifs();
798 void Mussa::save_motifs(fs::path filename)
800 fs::ofstream out_stream;
801 out_stream.open(filename, ofstream::out);
802 save_motifs(out_stream);
805 void Mussa::save_motifs(std::ostream& out)
807 for(motif_set::iterator motif_i = motif_sequences.begin();
808 motif_i != motif_sequences.end();
811 out << motif_i->get_sequence() << " ";
812 if (motif_i->get_name().size() > 0) {
813 out << "\"" << motif_i->get_name() << "\" ";
815 out << color_mapper->lookup("motif", motif_i->get_sequence());
820 void Mussa::update_sequences_motifs()
822 // once we've loaded all the motifs from the file,
823 // lets attach them to the sequences
824 for(vector<boost::shared_ptr<Sequence> >::iterator seq_i = the_seqs.begin();
825 seq_i != the_seqs.end();
828 // clear out old motifs
829 (*seq_i)->clear_motifs();
830 // for all the motifs in our set, attach them to the current sequence
831 for(set<Sequence>::iterator motif_i = motif_sequences.begin();
832 motif_i != motif_sequences.end();
835 (*seq_i)->add_motif(*motif_i);
840 const set<Sequence>& Mussa::motifs() const
842 return motif_sequences;
845 boost::shared_ptr<AnnotationColors> Mussa::colorMapper()