From: Diane Trout Date: Tue, 29 Aug 2006 04:06:35 +0000 (+0000) Subject: go back to sequence being its own class X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=mussa.git;a=commitdiff_plain;h=eb21e0aaafdd02783b312a01ed0a09881d0502d8 go back to sequence being its own class So my first attempt at getting the sequence tree hierarchy working ended up imploding in too much complexity. So I decided to try again in smaller steps. This version just switches back to sequence not being a string. one other problem is that iterators didn't seem to be serializable. so there might be some problem writing a sequence with a tree of annotated subsequences attached to it out. --- diff --git a/alg/flp.hpp b/alg/flp.hpp index 75298a4..8e7dbb9 100644 --- a/alg/flp.hpp +++ b/alg/flp.hpp @@ -20,6 +20,7 @@ #include #include +class Sequence; //! FLP = Fixed Length Pairs (Data) /*! * vector of linked lists of the match type struct @@ -46,7 +47,7 @@ public: void setup(int win_size, int hard_thres); //! compare two sequences and store the list of results in all_matches - void seqcomp(std::string seq1, std::string seq2, bool is_RC); + void seqcomp(const Sequence& seq1, const Sequence& seq2, bool is_RC); //bool FLPs::match_less(match *match1, match *match2); //void FLPs::sort(); //! Return all the matches for a particular window? diff --git a/alg/flp_seqcomp.cpp b/alg/flp_seqcomp.cpp index 7315028..2c7ad7a 100644 --- a/alg/flp_seqcomp.cpp +++ b/alg/flp_seqcomp.cpp @@ -14,6 +14,7 @@ #include "mussa_exceptions.hpp" #include "alg/flp.hpp" +#include "alg/sequence.hpp" #include #include using namespace std; @@ -46,12 +47,14 @@ FLPs::add(int seq1_i, int seq2_i, int a_score, int i2_offset) void -FLPs::seqcomp(string sseq1, string sseq2, bool is_RC) +FLPs::seqcomp(const Sequence& sseq1, const Sequence& sseq2, bool is_RC) { int start_i, seq1_i, seq2_i, win_i; // loop variables int matches; // number of matches in to a window int i2_offset; - char * seq1, * seq2; + Sequence::const_iterator seq1 = sseq1.begin(); + Sequence::const_iterator seq2 = sseq2.begin(); + int seq1_win_num = sseq1.size() - window_size + 1; int seq2_win_num = sseq2.size() - window_size + 1; alloc_matches(sseq1.size()); @@ -62,9 +65,6 @@ FLPs::seqcomp(string sseq1, string sseq2, bool is_RC) throw mussa_error(msg.str()); } - seq1 = (char *) sseq1.c_str(); - seq2 = (char *) sseq2.c_str(); - if (is_RC) i2_offset = window_size - sseq2.size(); else diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 1c35273..f87673a 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -479,7 +479,7 @@ Mussa::nway() } else if (ana_mode == EntropyNway) { - vector some_Seqs; + vector some_Seqs; //unlike other methods, entropy needs to look at the sequence at this stage some_Seqs.clear(); for(vector::size_type i = 0; i < the_seqs.size(); i++) @@ -700,7 +700,7 @@ Mussa::load_old(char * load_file_path, int s_num) void Mussa::add_motif(const Sequence& motif, const Color& color) { motif_sequences.insert(motif); - color_mapper->appendInstanceColor("motif", motif, color); + color_mapper->appendInstanceColor("motif", motif.get_sequence(), color); } void Mussa::set_motifs(const vector& motifs, diff --git a/alg/nway_entropy.cpp b/alg/nway_entropy.cpp index 1d9870a..94f47f5 100644 --- a/alg/nway_entropy.cpp +++ b/alg/nway_entropy.cpp @@ -20,13 +20,13 @@ using namespace std; void -NwayPaths::setup_ent(double new_entropy_thres, vector some_seqs) +NwayPaths::setup_ent(double new_entropy_thres, vector some_seqs) { ent_thres = new_entropy_thres; c_sequences.clear(); for (vector::size_type i = 0; i < some_seqs.size(); i++) - c_sequences.push_back((char *)some_seqs[i].c_str()); + c_sequences.push_back((char *)some_seqs[i].get_sequence().c_str()); } diff --git a/alg/nway_paths.hpp b/alg/nway_paths.hpp index 008b23f..3259116 100644 --- a/alg/nway_paths.hpp +++ b/alg/nway_paths.hpp @@ -23,6 +23,7 @@ #include "alg/conserved_path.hpp" #include "alg/flp.hpp" +#include "alg/sequence.hpp" class NwayPaths : public QObject { @@ -39,7 +40,7 @@ public: //! setup an nway comparison, initialize # of species, window size, //! threshold void setup(int w, int t); - void setup_ent(double new_entropy_thres, std::vector some_Seqs); + void setup_ent(double new_entropy_thres, std::vector some_Seqs); //! clear out our path void clear(); //! set the score that a match must exceed inorder to be recorded as a path diff --git a/alg/sequence.cpp b/alg/sequence.cpp index bbd6292..0d1594f 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -85,9 +85,9 @@ motif::~motif() } Sequence::Sequence() - : std::string(), - header(""), - species("") + : seq(""), + header(""), + species("") { } @@ -110,7 +110,7 @@ Sequence::Sequence(const std::string& seq) } Sequence::Sequence(const Sequence& o) - : std::string(o), + : seq(o.seq), header(o.header), species(o.species), annots(o.annots), @@ -122,7 +122,7 @@ Sequence &Sequence::operator=(const Sequence& s) { if (this != &s) { //sequence = s.sequence; - assign(s); + seq = s.seq; header = s.header; species = s.species; annots = s.annots; @@ -131,6 +131,20 @@ Sequence &Sequence::operator=(const Sequence& s) return *this; } +/* +Sequence::operator std::string() +{ + std::string s(seq.begin(), seq.end()); + return s; +} + +Sequence::operator std::string() const +{ + std::string s(seq.begin(), seq.end()); + return s; +} +*/ + static void multiplatform_getline(std::istream& in, std::string& line) { line.clear(); @@ -245,8 +259,8 @@ void Sequence::set_filtered_sequence(const std::string &old_seq, if ( count == 0) count = old_seq.size() - start; - std::string::clear(); - reserve(count); + seq.clear(); + seq.reserve(count); // Make a conversion table @@ -272,13 +286,10 @@ void Sequence::set_filtered_sequence(const std::string &old_seq, // finally, the actual conversion loop for(std::string::size_type seq_index = 0; seq_index < count; seq_index++) { - append(1, conversionTable[ (int)old_seq[seq_index+start]]); + seq.append(1, conversionTable[ (int)old_seq[seq_index+start]]); } } - // this doesn't work properly under gcc 3.x ... it can't recognize toupper - //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper); - void Sequence::load_annot(fs::path file_path, int start_index, int end_index) { @@ -464,7 +475,7 @@ Sequence::subseq(int start, int count) const if ( count == npos || start+count > size()) { count = size()-start; } - Sequence new_seq(std::string::substr(start, count)); + Sequence new_seq(seq.substr(start, count)); new_seq.set_fasta_header(get_fasta_header()); new_seq.set_species(get_species()); @@ -533,7 +544,7 @@ Sequence::rev_comp() const // finally, the actual conversion loop for(seq_i = len - 1; seq_i >= 0; seq_i--) { - table_i = (int) at(seq_i); + table_i = (int) seq.at(seq_i); rev_comp += conversionTable[table_i]; } @@ -573,15 +584,70 @@ Sequence::get_name() const return ""; } +void Sequence::set_sequence(const std::string& s) +{ + set_filtered_sequence(s); +} + +std::string Sequence::get_sequence() const +{ + return seq; +} + +Sequence::const_reference Sequence::operator[](Sequence::size_type i) const +{ + return seq[i]; +} + +Sequence::const_reference Sequence::at(Sequence::size_type n) const +{ + return seq[n]; +} + void Sequence::clear() { - std::string::clear(); - header = ""; - species = ""; + seq.clear(); + header.clear(); + species.clear(); annots.clear(); + motif_list.clear(); +} + +Sequence::iterator Sequence::begin() +{ + return seq.begin(); +} + +Sequence::const_iterator Sequence::begin() const +{ + return seq.begin(); +} + +Sequence::iterator Sequence::end() +{ + return seq.end(); } +Sequence::const_iterator Sequence::end() const +{ + return seq.end(); +} + +bool Sequence::empty() const +{ + return seq.empty(); +} + +Sequence::size_type Sequence::size() const +{ + return seq.size(); +} + +Sequence::size_type Sequence::length() const +{ + return size(); +} void Sequence::save(fs::fstream &save_file) //std::string save_file_path) @@ -631,7 +697,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) seq_counter++; } getline(load_file, file_data_line); - assign(file_data_line); + seq.assign(file_data_line); getline(load_file, file_data_line); getline(load_file, file_data_line); if (file_data_line == "") @@ -689,7 +755,7 @@ Sequence::load_museq(fs::path load_file_path, int seq_num) } std::string -Sequence::rc_motif(std::string a_motif) +Sequence::rc_motif(std::string a_motif) const { std::string rev_comp; char conversionTable[257]; @@ -737,7 +803,7 @@ Sequence::rc_motif(std::string a_motif) } std::string -Sequence::motif_normalize(std::string a_motif) +Sequence::motif_normalize(const std::string& a_motif) { std::string valid_motif; int seq_i, len; @@ -798,7 +864,7 @@ void Sequence::add_motif(const Sequence& a_motif) motif_start_i != motif_starts.end(); ++motif_start_i) { - motif_list.push_back(motif(*motif_start_i, a_motif)); + motif_list.push_back(motif(*motif_start_i, a_motif.get_sequence())); } } @@ -813,40 +879,44 @@ const std::list& Sequence::motifs() const } std::vector -Sequence::find_motif(std::string a_motif) +Sequence::find_motif(const std::string& a_motif) const { std::vector motif_match_starts; - std::string a_motif_rc; + std::string norm_motif_rc; motif_match_starts.clear(); //std::cout << "motif is: " << a_motif << std::endl; - a_motif = motif_normalize(a_motif); + std::string norm_motif = motif_normalize(a_motif); //std::cout << "motif is: " << a_motif << std::endl; - if (a_motif.size() > 0) + if (norm_motif.size() > 0) { //std::cout << "Sequence: none blank motif\n"; - motif_scan(a_motif, &motif_match_starts); + motif_scan(norm_motif, &motif_match_starts); - a_motif_rc = rc_motif(a_motif); + norm_motif_rc = rc_motif(a_motif); // make sure not to do search again if it is a palindrome - if (a_motif_rc != a_motif) { - motif_scan(a_motif_rc, &motif_match_starts); + if (norm_motif_rc != norm_motif) { + motif_scan(norm_motif_rc, &motif_match_starts); } } return motif_match_starts; } +std::vector +Sequence::find_motif(const Sequence& a_motif) const +{ + return find_motif(a_motif.get_sequence()); +} + void -Sequence::motif_scan(std::string a_motif, std::vector * motif_match_starts) +Sequence::motif_scan(std::string a_motif, std::vector * motif_match_starts) const { - const char * seq_c; + std::string::const_iterator seq_c = seq.begin(); std::string::size_type seq_i; int motif_i, motif_len; - // faster to loop thru the sequence as a old c std::string (ie char array) - seq_c = c_str(); //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl; motif_len = a_motif.length(); @@ -946,9 +1016,48 @@ void Sequence::find_sequences(std::list::iterator start, std::list::iterator end) { while (start != end) { - add_string_annotation(*start, start->get_fasta_header()); + add_string_annotation(start->get_sequence(), start->get_fasta_header()); ++start; } } +std::ostream& operator<<(std::ostream& out, const Sequence& s) +{ + for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) { + out << *s_i; + } + return out; +} + +bool operator<(const Sequence& x, const Sequence& y) +{ + Sequence::const_iterator x_i = x.begin(); + Sequence::const_iterator y_i = y.begin(); + + while(1) { + if( x_i == x.end() and y_i == y.end() ) { + return false; + } else if ( x_i == x.end() ) { + return true; + } else if ( y_i == y.end() ) { + return false; + } else if ( (*x_i) < (*y_i)) { + return true; + } else if ( (*x_i) > (*y_i) ) { + return false; + } else { + ++x_i; + ++y_i; + } + } +} + +bool operator==(const Sequence& x, const Sequence& y) +{ + if (x.seq == y.seq and x.annots == y.annots and x.motif_list == y.motif_list) { + return true; + } else { + return false; + } +} diff --git a/alg/sequence.hpp b/alg/sequence.hpp index 39d6aed..67f6c9f 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -82,111 +82,145 @@ private: BOOST_CLASS_EXPORT(motif); //! sequence track for mussa. -class Sequence : public std::string +class Sequence { - private: - std::string header; - std::string species; - - std::list annots; - //! a seperate list for motifs since we're currently not saving them - std::list motif_list; - - void motif_scan(std::string a_motif, std::vector * motif_match_starts); - std::string rc_motif(std::string a_motif); - //! look for a string sequence type and and it to an annotation list - void add_string_annotation(std::string a_seq, std::string name); - - // boost::serialization support - friend class boost::serialization::access; - template - void serialize(Archive& ar, const unsigned int /*version*/) { - ar & boost::serialization::make_nvp( - "seq_text", - boost::serialization::base_object(*this) - ); - ar & BOOST_SERIALIZATION_NVP(header); - ar & BOOST_SERIALIZATION_NVP(species); - ar & BOOST_SERIALIZATION_NVP(annots); - ar & BOOST_SERIALIZATION_NVP(motif_list); - } - - public: - Sequence(); - ~Sequence(); - Sequence(const char* seq); - Sequence(const std::string& seq); - Sequence(const Sequence& seq); - //! assignment to constant sequences - Sequence &operator=(const Sequence&); - - //! set sequence to a (sub)string containing nothing but AGCTN - void set_filtered_sequence(const std::string& seq, - std::string::size_type start=0, - std::string::size_type count=0); - - //! load sequence AGCT from fasta file - //! \throw mussa_load_error - //! \throw sequence_empty_error - //! \throw sequence_empty_file_error - void load_fasta(const boost::filesystem::path file_path, int seq_num=1, - int start_index=0, int end_index=0); - //! load sequence from stream - //! \throw mussa_load_error - //! \throw sequence_empty_error - //! \throw sequence_empty_file_error - void load_fasta(std::iostream& file, int seq_num=1, - int start_index=0, int end_index=0); - //! load sequence annotations - //! \throws mussa_load_error - void load_annot(const boost::filesystem::path file_path, int start_index, int end_index); - //! load sequence annotations - //! \throws mussa_load_error - void load_annot(std::fstream& data_stream, int start_index, int end_index); - bool parse_annot(std::string data, int start_index=0, int end_index=0); - //! add an annotation to our list of annotations - void add_annotation(const annot& a); - const std::list& annotations() const; - const std::list& motifs() const; - - //! return a subsequence, copying over any appropriate annotation - Sequence subseq(int start=0, int count = std::string::npos) const; - //! return a reverse compliment - std::string rev_comp() const; - - //! clear the sequence and its annotations - void clear(); - - //! set species name - void set_species(const std::string &); - //! get species name - std::string get_species() const; - //! set the fasta header - void set_fasta_header(std::string header); - //! get the fasta header - std::string get_fasta_header() const; - //! get name (will return the first non-empty, of fasta_header, species) - std::string get_name() const; - - //! add a motif to our list of motifs - //! \throws motif_normalize_error if there's something wrong with a_motif - void add_motif(const Sequence& a_motif); - //! clear our list of found motifs - void clear_motifs(); - //! search a sequence for a_motif - //! \throws motif_normalize_error if there's something wrong with a_motif - std::vector find_motif(std::string a_motif); - //! convert IUPAC symbols to upperase - //! \throws motif_normalize_error if there is an invalid symbol - static std::string motif_normalize(std::string a_motif); - - //! annotate the current sequence with other sequences - void find_sequences(std::list::iterator start, - std::list::iterator end); - - void save(boost::filesystem::fstream &save_file); - void load_museq(boost::filesystem::path load_file_path, int seq_num); - +public: + typedef std::string::difference_type difference_type; + typedef std::string::iterator iterator; + typedef std::string::const_iterator const_iterator; + typedef std::string::reference reference; + typedef std::string::const_reference const_reference; + typedef std::string::size_type size_type; + static const size_type npos = std::string::npos; + + Sequence(); + ~Sequence(); + Sequence(const char* seq); + Sequence(const std::string& seq); + Sequence(const Sequence& seq); + //! assignment to constant sequences + Sequence &operator=(const Sequence&); + + // dangerous since they create large copies + //operator std::string(); + //operator std::string() const; + + friend std::ostream& operator<<(std::ostream&, const Sequence&); + friend bool operator<(const Sequence&, const Sequence&); + friend bool operator==(const Sequence&, const Sequence&); + const_reference operator[](size_type) const; + + //! set sequence to a (sub)string containing nothing but AGCTN + void set_filtered_sequence(const std::string& seq, + std::string::size_type start=0, + std::string::size_type count=0); + + //! retrive element at specific position + const_reference at(size_type n) const; + //! clear the sequence and its annotations + void clear(); + //! forward iterator + iterator begin(); + const_iterator begin() const; + iterator end(); + const_iterator end() const; + //! is our sequence empty? + bool empty() const; + //! how many base pairs are there in our sequence + size_type size() const; + //! alias for size (used by string) + size_type length() const; + + //! return a subsequence, copying over any appropriate annotation + Sequence subseq(int start=0, int count = std::string::npos) const; + //! return a reverse compliment + std::string rev_comp() const; + + //! set sequence (filtered) + void set_sequence(const std::string &); + //! get sequence + std::string get_sequence() const; + //! set species name + void set_species(const std::string &); + //! get species name + std::string get_species() const; + //! set the fasta header + void set_fasta_header(std::string header); + //! get the fasta header + std::string get_fasta_header() const; + //! get name (will return the first non-empty, of fasta_header, species) + std::string get_name() const; + + //! load sequence AGCT from fasta file + //! \throw mussa_load_error + //! \throw sequence_empty_error + //! \throw sequence_empty_file_error + void load_fasta(const boost::filesystem::path file_path, int seq_num=1, + int start_index=0, int end_index=0); + //! load sequence from stream + //! \throw mussa_load_error + //! \throw sequence_empty_error + //! \throw sequence_empty_file_error + void load_fasta(std::iostream& file, int seq_num=1, + int start_index=0, int end_index=0); + //! load sequence annotations + //! \throws mussa_load_error + void load_annot(const boost::filesystem::path file_path, int start_index, int end_index); + //! load sequence annotations + //! \throws mussa_load_error + void load_annot(std::fstream& data_stream, int start_index, int end_index); + bool parse_annot(std::string data, int start_index=0, int end_index=0); + //! add an annotation to our list of annotations + void add_annotation(const annot& a); + const std::list& annotations() const; + const std::list& motifs() const; + + //! add a motif to our list of motifs + //! \throws motif_normalize_error if there's something wrong with a_motif + void add_motif(const Sequence& a_motif); + //! clear our list of found motifs + void clear_motifs(); + //! search a sequence for a_motif + //! \throws motif_normalize_error if there's something wrong with a_motif + std::vector find_motif(const std::string& a_motif) const; + //! search a sequence for a_motif + //! \throws motif_normalize_error if there's something wrong with a_motif + std::vector find_motif(const Sequence& a_motif) const; + //! convert IUPAC symbols to upperase + //! \throws motif_normalize_error if there is an invalid symbol + static std::string motif_normalize(const std::string& a_motif); + + //! annotate the current sequence with other sequences + void find_sequences(std::list::iterator start, + std::list::iterator end); + + void save(boost::filesystem::fstream &save_file); + void load_museq(boost::filesystem::path load_file_path, int seq_num); + +private: + std::string seq; + std::string header; + std::string species; + + std::list annots; + //! a seperate list for motifs since we're currently not saving them + std::list motif_list; + + void motif_scan(std::string a_motif, std::vector * motif_match_starts) const; + std::string rc_motif(std::string a_motif) const; + //! look for a string sequence type and and it to an annotation list + void add_string_annotation(std::string a_seq, std::string name); + + // boost::serialization support + friend class boost::serialization::access; + template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & BOOST_SERIALIZATION_NVP(seq); + ar & BOOST_SERIALIZATION_NVP(header); + ar & BOOST_SERIALIZATION_NVP(species); + ar & BOOST_SERIALIZATION_NVP(annots); + ar & BOOST_SERIALIZATION_NVP(motif_list); + } }; BOOST_CLASS_EXPORT(Sequence); diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 70c585e..a0d7045 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE( sequence_filter ) BOOST_CHECK_EQUAL(s2, "AATTGGCC"); BOOST_CHECK_EQUAL(s2.rev_comp(), "GGCCAATT"); BOOST_CHECK_EQUAL(s2.size(), s2.size()); - BOOST_CHECK_EQUAL(s2.c_str(), core_seq); + BOOST_CHECK_EQUAL(s2.get_sequence(), core_seq); Sequence s3("asdfg"); BOOST_CHECK_EQUAL(s3, "ANNNG"); diff --git a/py/sequence.cpp b/py/sequence.cpp index 494690c..e340fe6 100644 --- a/py/sequence.cpp +++ b/py/sequence.cpp @@ -11,16 +11,12 @@ using namespace boost::python; #include #include "alg/glsequence.hpp" -PyObject* seq_to_string(const Sequence& s) { - ::PyString_FromStringAndSize(s.data(), s.size()); -} - void export_sequence() { class_("Sequence") .def(init()) - .def("__str__", &seq_to_string, "cast to string") - .def("__repr__", &seq_to_string, "display as string") + .def("__str__", &Sequence::get_sequence, "cast to string") + .def("__repr__", &Sequence::get_sequence, "display as string") .def("size", &Sequence::size, "return the length of the sequence") .def("__len__", &Sequence::size, "return the length of the sequence") .def("clear", &Sequence::clear, "clear the sequence and its annotations") diff --git a/qui/motif_editor/MotifEditor.cpp b/qui/motif_editor/MotifEditor.cpp index 0fd80f0..b27967d 100644 --- a/qui/motif_editor/MotifEditor.cpp +++ b/qui/motif_editor/MotifEditor.cpp @@ -13,7 +13,7 @@ MotifEditor::MotifEditor(Mussa *m, QWidget *parent) { assert (m != 0); const set &motif = analysis->motifs(); - vector motif_seq(motif.begin(), motif.end()); + vector motif_seq(motif.begin(), motif.end()); connect(&applyButton, SIGNAL(clicked()), this, SLOT(updateMotifs())); layout.addWidget(&applyButton); @@ -22,8 +22,8 @@ MotifEditor::MotifEditor(Mussa *m, QWidget *parent) { MotifDetail *detail = new MotifDetail; if (i < motif_seq.size()) { - detail->setMotif(motif_seq[i]); - detail->setColor(analysis->colorMapper()->lookup("motif", motif_seq[i])); + detail->setMotif(motif_seq[i].get_sequence()); + detail->setColor(analysis->colorMapper()->lookup("motif", motif_seq[i].get_sequence())); } motif_details.push_back(detail); layout.addWidget(detail);