From b99f9cb5ab110260ec476acf09a564255ecc7b1a Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 8 Sep 2006 22:48:54 +0000 Subject: [PATCH] subsequences now use offsets into the shared_ptr seq_string A subsequence created with the Sequence::subseq call will use the creating sequences shared_ptr, and then just maintains a seq_start and seq_count to indicate the start and number of bytes in the subseq. (Yes this means that sometiems the c_str will let you read more than you should actually be able to. But try not to depend on that). --- alg/sequence.cpp | 85 +++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/alg/sequence.cpp b/alg/sequence.cpp index ae670c7..64bbfbd 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -83,6 +83,8 @@ const std::string Sequence::nucleic_iupac_alphabet("AaCcGgTtUuRrYyMmKkSsWwBbDdHh const std::string Sequence::protein_alphabet("AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"); Sequence::Sequence() + : seq_start(0), + seq_count(0) { } @@ -91,14 +93,18 @@ Sequence::~Sequence() } Sequence::Sequence(const char *seq) - : header(""), + : seq_start(0), + seq_count(0), + header(""), species("") { set_filtered_sequence(seq); } Sequence::Sequence(const std::string& seq) - : header(""), + : seq_start(0), + seq_count(0), + header(""), species("") { set_filtered_sequence(seq); @@ -106,6 +112,8 @@ Sequence::Sequence(const std::string& seq) Sequence::Sequence(const Sequence& o) : seq(o.seq), + seq_start(o.seq_start), + seq_count(o.seq_count), header(o.header), species(o.species), annots(o.annots), @@ -116,8 +124,9 @@ Sequence::Sequence(const Sequence& o) Sequence &Sequence::operator=(const Sequence& s) { if (this != &s) { - //sequence = s.sequence; seq = s.seq; + seq_start = s.seq_start; + seq_count = s.seq_count; header = s.header; species = s.species; annots = s.annots; @@ -188,7 +197,7 @@ Sequence::load_fasta(std::iostream& data_file, int seq_num, bool read_seq = true; std::string rev_comp; std::string sequence_raw; - std::string seq_tmp; // holds sequence during basic filtering + std::string seq_tmp; // holds sequence during basic filtering if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); @@ -270,6 +279,8 @@ void Sequence::set_filtered_sequence(const std::string &old_seq, new_seq->append(1, conversionTable[ (int)old_seq[seq_index+start]]); } seq = new_seq; + seq_start = 0; + seq_count = count; } void @@ -432,8 +443,7 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) *spirit::space_p ) //end grammar - ) /*, - spirit::space_p*/).full; + )).full; // go seearch for query sequences find_sequences(query_seqs.begin(), query_seqs.end()); @@ -462,11 +472,12 @@ Sequence::subseq(int start, int count) const if ( count == npos || start+count > size()) { count = size()-start; } - Sequence new_seq(seq->substr(start, count)); - new_seq.set_fasta_header(get_fasta_header()); - new_seq.set_species(get_species()); + Sequence new_seq(*this); + new_seq.seq_start = start; + new_seq.seq_count = count; new_seq.motif_list = motif_list; + new_seq.annots.clear(); // attempt to copy & reannotate position based annotations int end = start+count; @@ -531,7 +542,7 @@ Sequence::rev_comp() const // finally, the actual conversion loop for(seq_i = len - 1; seq_i >= 0; seq_i--) { - table_i = (int) seq->at(seq_i); + table_i = (int) at(seq_i); rev_comp += conversionTable[table_i]; } @@ -589,13 +600,15 @@ Sequence::const_reference Sequence::operator[](Sequence::size_type i) const Sequence::const_reference Sequence::at(Sequence::size_type i) const { if (!seq) throw std::out_of_range("empty sequence"); - return seq->at(i); + return seq->at(i+seq_start); } void Sequence::clear() { seq.reset(); + seq_start = 0; + seq_count = 0; header.clear(); species.clear(); annots.clear(); @@ -605,41 +618,36 @@ Sequence::clear() const char *Sequence::c_str() const { if (seq) - return seq->c_str(); + return seq->c_str()+seq_start; else return 0; } Sequence::const_iterator Sequence::begin() const { - if (seq) - return seq->begin(); + if (seq and seq_count != 0) + return seq->begin()+seq_start; else return Sequence::const_iterator(0); } Sequence::const_iterator Sequence::end() const { - if (seq) - return seq->end(); - else + if (seq and seq_count != 0) { + return seq->begin() + seq_start + seq_count; + } else { return Sequence::const_iterator(0); + } } bool Sequence::empty() const { - if (seq) - return seq->empty(); - else - return true; + return (seq_count == 0) ? true : false; } Sequence::size_type Sequence::size() const { - if (seq) - return seq->size(); - else - return 0; + return seq_count; } Sequence::size_type Sequence::length() const @@ -1037,13 +1045,16 @@ bool operator<(const Sequence& x, const Sequence& y) { Sequence::const_iterator x_i = x.begin(); Sequence::const_iterator y_i = y.begin(); - + // for sequences there's some computation associated with computing .end + // so lets cache it. + Sequence::const_iterator xend = x.end(); + Sequence::const_iterator yend = y.end(); while(1) { - if( x_i == x.end() and y_i == y.end() ) { + if( x_i == xend and y_i == yend ) { return false; - } else if ( x_i == x.end() ) { + } else if ( x_i == xend ) { return true; - } else if ( y_i == y.end() ) { + } else if ( y_i == yend ) { return false; } else if ( (*x_i) < (*y_i)) { return true; @@ -1066,10 +1077,18 @@ bool operator==(const Sequence& x, const Sequence& y) // we know they can't be equal. (and we need to do this // to prevent dereferencing an empty pointer) return false; - } else if ( *(x.seq) == *(y.seq)) { - // and x.annots == y.annots and x.motif_list == y.motif_list) { - return true; - } else { + } else if (x.seq_count != y.seq_count) { + // if they're of different lenghts, they're not equal return false; } + Sequence::const_iterator xseq_i = x.begin(); + Sequence::const_iterator yseq_i = y.begin(); + // since the length of the two sequences is equal, we only need to + // test one. + for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) { + if (*xseq_i != *yseq_i) { + return false; + } + } + return true; } -- 2.30.2