// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+#include <boost/filesystem/operations.hpp>
namespace fs = boost::filesystem;
#include <boost/spirit/core.hpp>
#include <boost/spirit/actor/push_back_actor.hpp>
#include <boost/spirit/iterator/file_iterator.hpp>
+#include <boost/spirit/utility/chset.hpp>
namespace spirit = boost::spirit;
#include "alg/sequence.hpp"
+#include "io.hpp"
#include "mussa_exceptions.hpp"
#include <string>
+#include <stdexcept>
#include <iostream>
#include <sstream>
+#include <set>
-annot::annot()
- : start(0),
- end(0),
- type(""),
- name("")
+bool operator==(const motif& left, const motif& right)
{
+ return ((left.begin== right.begin) and
+ (left.end == right.end) and
+ (left.type == right.type) and
+ (left.name == right.name));
}
-annot::annot(int start, int end, std::string type, std::string name)
- : start(start),
- end(end),
- type(type),
- name(name)
+motif::motif()
+ : begin(0),
+ end(0),
+ type("motif"),
+ name(""),
+ sequence("")
{
}
-motif::motif(int start, std::string motif)
- : annot(start, start+motif.size(), "motif", motif),
+motif::motif(int begin_, std::string motif)
+ : begin(begin_),
+ end(begin_+motif.size()),
+ type("motif"),
+ name(motif),
sequence(motif)
{
}
-
-Sequence::Sequence()
- : sequence(""),
- header(""),
- species("")
+
+motif::~motif()
+{
+}
+
+Sequence::Sequence(AlphabetRef alphabet)
+ : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
+{
+}
+
+Sequence::~Sequence()
+{
+}
+
+Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- annots.clear();
- motif_list.clear();
+ set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
+}
+
+Sequence::Sequence(const std::string& seq,
+ AlphabetRef alphabet_,
+ SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
+{
+ set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
+}
+
+Sequence::Sequence(const Sequence& o)
+ : seq(o.seq),
+ header(o.header),
+ species(o.species),
+ annotation_list(o.annotation_list),
+ motif_list(o.motif_list)
+{
+}
+
+Sequence::Sequence(const Sequence* o)
+ : seq(o->seq),
+ header(o->header),
+ species(o->species),
+ annotation_list(o->annotation_list),
+ motif_list(o->motif_list)
+{
+}
+
+Sequence::Sequence(const SequenceRef o)
+ : seq(new SeqSpan(o->seq)),
+ header(o->header),
+ species(o->species),
+ annotation_list(new SeqSpanRefList),
+ motif_list(o->motif_list)
+{
+ // copy over the annotations in the other sequence ref,
+ // attaching them to our current sequence ref
+ for(SeqSpanRefList::const_iterator annot_i = o->annotation_list->begin();
+ annot_i != o->annotation_list->end();
+ ++annot_i)
+ {
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_count = (*annot_i)->size();
+
+ SeqSpanRef new_annot(seq->subseq(annot_begin, annot_count));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ annotation_list->push_back(new_annot);
+ }
}
-Sequence::Sequence(std::string seq)
+Sequence::Sequence(const SeqSpanRef& seq_ref)
+ : seq(seq_ref),
+ header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- set_filtered_sequence(seq);
}
Sequence &Sequence::operator=(const Sequence& s)
{
if (this != &s) {
- sequence = s.sequence;
+ seq = s.seq;
header = s.header;
species = s.species;
- annots = s.annots;
+ annotation_list = s.annotation_list;
+ motif_list = s.motif_list;
}
return *this;
}
-Sequence &Sequence::operator=(const std::string& s)
+void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
{
- set_filtered_sequence(s);
- return *this;
+ load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
}
-char Sequence::operator[](int index) const
+//! load a fasta file into a sequence
+void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
+ int seq_num, int start_index, int end_index)
{
- return sequence[index];
+ fs::fstream data_file;
+ data_file.open(file_path, std::ios::in);
+
+ if (!data_file.good())
+ {
+ throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
+ } else {
+ try {
+ load_fasta(data_file, a, seq_num, start_index, end_index);
+ } catch(sequence_empty_error e) {
+ // there doesn't appear to be any sequence
+ // catch and rethrow to include the filename
+ std::stringstream msg;
+ msg << "The selected sequence in "
+ << file_path.native_file_string()
+ << " appears to be empty";
+ throw sequence_empty_error(msg.str());
+ } catch(sequence_empty_file_error e) {
+ std::stringstream errormsg;
+ errormsg << file_path.native_file_string()
+ << " did not have any fasta sequences" << std::endl;
+ throw sequence_empty_file_error(errormsg.str());
+ } catch(sequence_invalid_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string();
+ msg << " " << e.what();
+ throw sequence_invalid_load_error(msg.str());
+ }
+ }
}
-std::ostream& operator<<(std::ostream& out, const Sequence& seq)
+void Sequence::load_fasta(std::istream& file,
+ int seq_num, int start_index, int end_index)
{
- out << "Sequence(" << seq.get_seq() << ")";
- return out;
+ load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
}
-//! load a fasta file into a sequence
-/*!
- * \param file_path the location of the fasta file in the filesystem
- * \param seq_num which sequence in the file to load
- * \param start_index starting position in the fasta sequence, 0 for beginning
- * \param end_index ending position in the fasta sequence, 0 for end
- * \return error message, empty string if no error. (gag!)
- */
void
-Sequence::load_fasta(fs::path file_path, int seq_num,
+Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
+ int seq_num,
int start_index, int end_index)
{
- fs::fstream data_file;
std::string file_data_line;
int header_counter = 0;
+ size_t line_counter = 0;
bool read_seq = true;
std::string rev_comp;
std::string sequence_raw;
- std::string seq_tmp; // holds sequence during basic filtering
-
- data_file.open(file_path, std::ios::in);
+ std::string seq_tmp; // holds sequence during basic filtering
+ const Alphabet &alpha = Alphabet::get_alphabet(a);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
}
- if (!data_file)
- {
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
- }
- // if file opened okay, read it
- else
+
+ // search for the header of the fasta sequence we want
+ while ( (!data_file.eof()) && (header_counter < seq_num) )
{
- // search for the header of the fasta sequence we want
- while ( (!data_file.eof()) && (header_counter < seq_num) )
- {
- getline(data_file,file_data_line);
- if (file_data_line.substr(0,1) == ">")
- header_counter++;
- }
+ multiplatform_getline(data_file, file_data_line);
+ ++line_counter;
+ if (file_data_line.substr(0,1) == ">")
+ header_counter++;
+ }
- if (header_counter > 0) {
- header = file_data_line.substr(1);
+ if (header_counter > 0) {
+ header = file_data_line.substr(1);
- sequence_raw = "";
+ sequence_raw = "";
- while ( !data_file.eof() && read_seq ) {
- getline(data_file,file_data_line);
- if (file_data_line.substr(0,1) == ">")
- read_seq = false;
- else sequence_raw += file_data_line;
+ while ( !data_file.eof() && read_seq ) {
+ multiplatform_getline(data_file,file_data_line);
+ ++line_counter;
+ if (file_data_line.substr(0,1) == ">")
+ read_seq = false;
+ else {
+ for (std::string::const_iterator line_i = file_data_line.begin();
+ line_i != file_data_line.end();
+ ++line_i)
+ {
+ if(alpha.exists(*line_i)) {
+ sequence_raw += *line_i;
+ } else {
+ std::ostringstream msg;
+ msg << "Unrecognized characters in fasta sequence at line ";
+ msg << line_counter;
+ throw sequence_invalid_load_error(msg.str());
+ }
+ }
}
+ }
- // Lastly, if subselection of the sequence was specified we keep cut out
- // and only keep that part
- // end_index = 0 means no end was specified, so cut to the end
- if (end_index == 0)
- end_index = sequence_raw.size();
+ // Lastly, if subselection of the sequence was specified we keep cut out
+ // and only keep that part
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = sequence_raw.size();
- // sequence filtering for upcasing agctn and convert non AGCTN to N
- set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
- } else {
- std::stringstream errormsg;
- errormsg << file_path.native_file_string()
- << " did not have any fasta sequences" << std::endl;
- throw mussa_load_error(errormsg.str());
+ // sequence filtering for upcasing agctn and convert non AGCTN to N
+ if (end_index-start_index <= 0) {
+ std::string msg("The selected sequence appears to be empty");
+ throw sequence_empty_error(msg);
}
- data_file.close();
+ set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
+ } else {
+ std::string errormsg("There were no fasta sequences");
+ throw sequence_empty_file_error(errormsg);
}
}
-void Sequence::set_filtered_sequence(const std::string &old_seq,
- std::string::size_type start,
- std::string::size_type count)
+void Sequence::set_filtered_sequence(const std::string &in_seq,
+ AlphabetRef alphabet_,
+ size_type start,
+ size_type count,
+ SeqSpan::strand_type strand_)
{
- char conversionTable[257];
-
- if ( count == 0)
- count = old_seq.size() - start;
- sequence.clear();
- sequence.reserve(count);
-
- // Make a conversion table
-
- // everything we don't specify below will become 'N'
- for(int table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = 'N';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // we want these to map to themselves - ie not to change
- conversionTable[(int)'A'] = 'A';
- conversionTable[(int)'T'] = 'T';
- conversionTable[(int)'G'] = 'G';
- conversionTable[(int)'C'] = 'C';
- // this is to upcase
- conversionTable[(int)'a'] = 'A';
- conversionTable[(int)'t'] = 'T';
- conversionTable[(int)'g'] = 'G';
- conversionTable[(int)'c'] = 'C';
+ if ( count == npos)
+ count = in_seq.size() - start;
+ std::string new_seq;
+ new_seq.reserve(count);
// finally, the actual conversion loop
- for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
+ const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
+ std::string::const_iterator seq_i = in_seq.begin()+start;
+ for(size_type i = 0; i != count; ++i, ++seq_i)
{
- sequence += conversionTable[ (int)old_seq[seq_index+start]];
+ if (alpha_impl.exists(*seq_i)) {
+ new_seq.append(1, toupper(*seq_i));
+ } else {
+ new_seq.append(1, 'N');
+ }
}
+ SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
+ seq = new_seq_ref;
}
- // this doesn't work properly under gcc 3.x ... it can't recognize toupper
- //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
-
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
+ if (not fs::exists(file_path)) {
+ throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
+ }
+ if (fs::is_directory(file_path)) {
+ throw mussa_load_error(file_path.string() +
+ " is a directory, please provide a file for annotations."
+ );
+ }
fs::fstream data_stream(file_path, std::ios::in);
if (!data_stream)
{
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
+ throw mussa_load_error("Error loading annotation file " + file_path.string());
+ }
+
+ try {
+ load_annot(data_stream, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
}
+ data_stream.close();
+}
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
// so i should probably be passing the parse function some iterators
// but the annotations files are (currently) small, so i think i can
// get away with loading the whole file into memory
data_stream.get(c);
data.push_back(c);
}
- data_stream.close();
-
+
parse_annot(data, start_index, end_index);
}
*/
struct push_back_annot {
- std::list<annot>& annot_list;
+ Sequence* parent;
+ SeqSpanRefListRef children;
int& begin;
int& end;
std::string& name;
std::string& type;
+ int &parsed;
- push_back_annot(std::list<annot>& annot_list_,
+ push_back_annot(Sequence* parent_seq,
+ SeqSpanRefListRef children_list,
int& begin_,
int& end_,
std::string& name_,
- std::string& type_)
- : annot_list(annot_list_),
+ std::string& type_,
+ int &parsed_)
+ : parent(parent_seq),
+ children(children_list),
begin(begin_),
end(end_),
name(name_),
- type(type_)
+ type(type_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
- //std::cout << "adding annot: " << begin << " " << end << " " << name << " " << type << std::endl;
- annot_list.push_back(annot(begin, end, name, type));
+ children->push_back(parent->make_annotation(name, type, begin, end));
+ ++parsed;
};
};
std::list<Sequence>& seq_list;
std::string& name;
std::string& seq;
+ int &parsed;
push_back_seq(std::list<Sequence>& seq_list_,
std::string& name_,
- std::string& seq_)
+ std::string& seq_,
+ int &parsed_)
: seq_list(seq_list_),
name(name_),
- seq(seq_)
+ seq(seq_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
+ std::string::iterator seq_i = seq.begin();
+ std::string::iterator seq_end = seq.end();
+
+ // this if block is a hack, for some reason spirit was
+ // duplicating the last character if the file didn't end
+ // with a new line.
+ // this checks for the trailing newline, and if it is missing
+ // removes the last character ( which should be the duplicated character.
+ // check test_sequence.cpp:sequence_no_trailing_newline for test case
+ // also see ticket:265 for more information
+ if (seq.size() > 0) {
+ std::string::value_type c = seq[seq.size()-1];
+ if (not (c == '\015' or c == '\012')) {
+ // doesn't end with a new line character
+ seq_end--;
+ }
+ }
+ // end hack
+
// filter out newlines from our sequence
std::string new_seq;
- for(std::string::const_iterator seq_i = seq.begin();
- seq_i != seq.end();
- ++seq_i)
+ for(; seq_i != seq_end; ++seq_i)
{
- if (*seq_i != '\n') new_seq += *seq_i;
+ if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
}
//std::cout << "adding seq: " << name << " " << new_seq << std::endl;
-
Sequence s(new_seq);
- s.set_header(name);
+ s.set_fasta_header(name);
seq_list.push_back(s);
+ ++parsed;
};
};
int end=0;
std::string name;
std::string type;
- std::string seq;
+ std::string seqstr;
+ SeqSpanRefListRef parsed_annots(new SeqSpanRefList);
std::list<Sequence> query_seqs;
-
- bool status = spirit::parse(data.begin(), data.end(),
- (
- //begin grammar
- (+(spirit::alpha_p))[spirit::assign_a(species)] >>
- +(spirit::space_p) >>
- *(
- ( // parse an absolute location name
- (spirit::uint_p[spirit::assign_a(start)] >>
- +spirit::space_p >>
- spirit::uint_p[spirit::assign_a(end)] >>
- +spirit::space_p >>
- (*(spirit::alpha_p|spirit::digit_p))[spirit::assign_a(name)] >>
- // optional type
- !(
- +spirit::space_p >>
- (*(spirit::alpha_p))[spirit::assign_a(type)]
- )
- // to understand how this group gets set
- // read the comment above struct push_back_annot
- )[push_back_annot(annots, start, end, type, name)]
- |
- (spirit::ch_p('>') >>
- (*(~spirit::chlit<char>('\n')))[spirit::assign_a(name)] >>
- +spirit::space_p >>
- (+(spirit::ch_p('A')|
- spirit::ch_p('G')|
- spirit::ch_p('C')|
- spirit::ch_p('T')|
- spirit::ch_p('N')|
- spirit::ch_p('\n')))[spirit::assign_a(seq)]
- )[push_back_seq(query_seqs, name, seq)]
- ) >>
- *spirit::space_p
+ int parsed=0;
+
+ bool ok = spirit::parse(data.begin(), data.end(),
+ (
+ //begin grammar
+ !(
+ (
+ spirit::alpha_p >>
+ +(spirit::graph_p)
+ )[spirit::assign_a(species)] >>
+ +(spirit::space_p)
+ ) >>
+ *(
+ ( // ignore html tags
+ *(spirit::space_p) >>
+ spirit::ch_p('<') >>
+ +(~spirit::ch_p('>')) >>
+ spirit::ch_p('>') >>
+ *(spirit::space_p)
)
- //end grammar
- ) /*,
- spirit::space_p*/).full;
-
- // go seearch for query sequences
+ |
+ ( // parse an absolute location name
+ (spirit::uint_p[spirit::assign_a(start)] >>
+ +spirit::space_p >>
+ spirit::uint_p[spirit::assign_a(end)] >>
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(name)] >>
+ // optional type
+ !(
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(type)]
+ )
+ // to understand how this group gets set
+ // read the comment above struct push_back_annot
+ )[push_back_annot(this, parsed_annots, start, end, name, type, parsed)]
+ |
+ ((spirit::ch_p('>')|spirit::str_p(">")) >>
+ (*(spirit::print_p))[spirit::assign_a(name)] >>
+ spirit::eol_p >>
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seqstr)]
+ )[push_back_seq(query_seqs, name, seqstr, parsed)]
+ ) >>
+ *spirit::space_p
+ )
+ //end grammar
+ )).full;
+ if (not ok) {
+ std::stringstream msg;
+ msg << "Error parsing annotation #" << parsed;
+ throw annotation_load_error(msg.str());
+ }
+ // If everything loaded correctly add the sequences to our annotation list
+ // add newly parsed annotations to our sequence
+ std::copy(parsed_annots->begin(), parsed_annots->end(), std::back_inserter(*annotation_list));
+ // go search for query sequences
find_sequences(query_seqs.begin(), query_seqs.end());
}
-/*
-void
-Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+void Sequence::add_annotation(const SeqSpanRef a)
{
- std::string file_data_line;
- annot an_annot;
- std::string::size_type space_split_i;
- std::string annot_value;
- std::list<annot>::iterator list_i;
- std::string err_msg;
+ annotation_list->push_back(a);
+}
- annots.clear();
+void Sequence::add_annotation(std::string name, std::string type, size_type start, size_type stop)
+{
+ add_annotation(make_annotation(name, type, start, stop));
+}
- getline(data_stream,file_data_line);
- species = file_data_line;
+SeqSpanRef
+Sequence::make_annotation(std::string name, std::string type, size_type start, size_type stop) const
+{
+ // we want things to be in the positive direction
+ if (stop < start) {
+ size_type tmp = start;
+ start = stop;
+ stop = tmp;
+ }
+ size_type count = stop - start;
+ SeqSpanRef new_annot(seq->subseq(start, count, SeqSpan::UnknownStrand));
+ AnnotationsRef metadata(new Annotations(name));
+ metadata->set("type", type);
+ new_annot->setAnnotations(metadata);
+ return new_annot;
+}
- // end_index = 0 means no end was specified, so cut to the end
- if (end_index == 0)
- end_index = sequence.length();
+const SeqSpanRefList& Sequence::annotations() const
+{
+ return *annotation_list;
+}
- //std::cout << "START: " << start_index << " END: " << end_index << std::endl;
+void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
+{
+ new_seq.motif_list = motif_list;
+ new_seq.annotation_list.reset(new SeqSpanRefList);
- while ( !data_stream.eof() )
+ for(SeqSpanRefList::const_iterator annot_i = annotation_list->begin();
+ annot_i != annotation_list->end();
+ ++annot_i)
{
- getline(data_stream,file_data_line);
- if (file_data_line != "")
- {
- // need to get 4 values...almost same code 4 times...
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.start = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot end index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
-
- //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
- // << std::endl;
-
- // get annot name
- space_split_i = file_data_line.find(" ");
- if (space_split_i == std::string::npos) // no entries for name & type
- {
- std::cout << "seq, annots - no name or type\n";
- an_annot.name = "";
- an_annot.type = "";
- }
- else
- {
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.name = annot_value;
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot type
- space_split_i = file_data_line.find(" ");
- if (space_split_i == std::string::npos) // no entry for type
- an_annot.type = "";
- else
- {
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
- }
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_end = (*annot_i)->stop();
+
+ if (annot_begin < start+count) {
+ if (annot_begin >= start) {
+ annot_begin -= start;
+ } else {
+ annot_begin = 0;
}
-
- // add annot to list if it falls within the range of sequence specified
- if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
- {
- an_annot.start -= start_index;
- an_annot.end -= start_index;
- annots.push_back(an_annot);
+ if (annot_end < start+count) {
+ annot_end -= start;
+ } else {
+ annot_end = count;
}
- // else no (or bogus) annotations
+ SeqSpanRef new_annot(new_seq.seq->subseq(annot_begin, annot_end));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ new_seq.annotation_list->push_back(new_annot);
}
}
}
-*/
-
-const std::string& Sequence::get_species() const
-{
- return species;
-}
-
-bool Sequence::empty() const
-{
- return (size() == 0);
-}
-const std::list<annot>& Sequence::annotations() const
+Sequence
+Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
{
- return annots;
-}
+ // FIXME: should i really allow a subsequence of an empty sequence?
+ if (!seq) {
+ Sequence new_seq;
+ return new_seq;
+ }
-std::string::size_type Sequence::length() const
-{
- return size();
+ Sequence new_seq(*this);
+ new_seq.seq = seq->subseq(start, count, strand);
+ if (seq->annotations()) {
+ AnnotationsRef a(new Annotations(*(seq->annotations())));
+ new_seq.seq->setAnnotations(a);
+ }
+ copy_children(new_seq, start, count);
+
+ return new_seq;
}
-std::string::size_type Sequence::size() const
-{
- return sequence.size();
-}
-Sequence::iterator Sequence::begin()
+// FIXME: This needs to be moved into SeqSpan
+Sequence Sequence::rev_comp() const
{
- return sequence.begin();
+ // a reverse complement is the whole opposite strand
+ return subseq(0, npos, SeqSpan::OppositeStrand);
}
-Sequence::const_iterator Sequence::begin() const
+const Alphabet& Sequence::get_alphabet() const
{
- return sequence.begin();
+ return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
}
-Sequence::iterator Sequence::end()
+void Sequence::set_fasta_header(std::string header_)
{
- return sequence.end();
+ header = header_;
}
-Sequence::const_iterator Sequence::end() const
+void Sequence::set_species(const std::string& name)
{
- return sequence.end();
+ species = name;
}
-
-const std::string&
-Sequence::get_seq() const
+std::string Sequence::get_species() const
{
- return sequence;
+ return species;
}
std::string
-Sequence::subseq(int start, int end) const
-{
- return sequence.substr(start, end);
-}
-
-
-const char *
-Sequence::c_seq() const
+Sequence::get_fasta_header() const
{
- return sequence.c_str();
+ return header;
}
std::string
-Sequence::rev_comp() const
-{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = sequence.length();
- rev_comp.reserve(len);
- // make a conversion table
- // init all parts of conversion table to '~' character
- // '~' I doubt will ever appear in a sequence file (jeez, I hope)
- // and may the fleas of 1000 camels infest the genitals of any biologist (and
- // seven generations of their progeny) who decides to make it mean
- // something special!!!
- // PS - double the curse for any smartass non-biologist who tries it as well
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // add in the characters for the bases we want to convert
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
-
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
- {
- table_i = (int) sequence[seq_i];
- rev_comp += conversionTable[table_i];
- }
-
- return rev_comp;
-}
-
-void Sequence::set_header(std::string header_)
+Sequence::get_name() const
{
- header = header_;
+ if (header.size() > 0)
+ return header;
+ else if (species.size() > 0)
+ return species;
+ else
+ return "";
}
-std::string
-Sequence::get_header() const
+void Sequence::set_sequence(const std::string& s, AlphabetRef a)
{
- return header;
-}
-/*
-//FIXME: i don't think this code is callable
-std::string
-Sequence::sp_name() const
-{
- return species;
+ set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
}
-*/
-void
-Sequence::set_seq(const std::string& a_seq)
+std::string Sequence::get_sequence() const
{
- set_filtered_sequence(a_seq);
+ return seq->sequence();
}
-
-/*
-std::string
-Sequence::species()
+Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
{
- return species;
+ return at(i);
}
-*/
void
Sequence::clear()
{
- sequence = "";
- header = "";
- species = "";
- annots.clear();
+ seq.reset();
+ header.clear();
+ species.clear();
+ annotation_list.reset(new SeqSpanRefList);
+ motif_list.reset(new MotifList);
}
void
Sequence::save(fs::fstream &save_file)
- //std::string save_file_path)
{
+ std::string type("type");
+ std::string empty_str("");
//fstream save_file;
- std::list<annot>::iterator annots_i;
-
- // not sure why, or if i'm doing something wrong, but can't seem to pass
- // file pointers down to this method from the mussa control class
- // so each call to save a sequence appends to the file started by mussa_class
- //save_file.open(save_file_path.c_str(), std::ios::app);
+ SeqSpanRefList::iterator annots_i;
+ AnnotationsRef metadata;
save_file << "<Sequence>" << std::endl;
- save_file << sequence << std::endl;
+ save_file << *this << std::endl;
save_file << "</Sequence>" << std::endl;
save_file << "<Annotations>" << std::endl;
save_file << species << std::endl;
- for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
+ for (annots_i = annotation_list->begin();
+ annots_i != annotation_list->end();
+ ++annots_i)
{
- save_file << annots_i->start << " " << annots_i->end << " " ;
- save_file << annots_i->name << " " << annots_i->type << std::endl;
+ metadata = (*annots_i)->annotations();
+ save_file << (*annots_i)->parentStart() << " " << (*annots_i)->parentStop() << " " ;
+ save_file << metadata->name() << " "
+ << metadata->getdefault(type, empty_str) << std::endl;
}
save_file << "</Annotations>" << std::endl;
//save_file.close();
}
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+// fs::fstream load_file;
+// std::string file_data_line;
+// int seq_counter;
+// //annot an_annot;
+// int annot_begin;
+// int annot_end;
+// std::string annot_name;
+// std::string annot_type;
+//
+// std::string::size_type space_split_i;
+// std::string annot_value;
+//
+// annotation_list.reset(new SeqSpanRefList);
+//
+// load_file.open(load_file_path, std::ios::in);
+//
+// seq_counter = 0;
+// // search for the seq_num-th sequence
+// while ( (!load_file.eof()) && (seq_counter < seq_num) )
+// {
+// getline(load_file,file_data_line);
+// if (file_data_line == "<Sequence>")
+// seq_counter++;
+// }
+// getline(load_file, file_data_line);
+// // looks like the sequence is written as a single line
+// set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+// getline(load_file, file_data_line);
+// getline(load_file, file_data_line);
+// if (file_data_line == "<Annotations>")
+// {
+// getline(load_file, file_data_line);
+// species = file_data_line;
+// while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+// {
+// getline(load_file,file_data_line);
+// if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+// {
+// // need to get 4 values...almost same code 4 times...
+// // get annot start index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_begin = atoi (annot_value.c_str());
+// file_data_line = file_data_line.substr(space_split_i+1);
+// // get annot end index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_end = atoi (annot_value.c_str());
+//
+// if (space_split_i == std::string::npos) // no entry for type or name
+// {
+// std::cout << "seq, annots - no type or name\n";
+// annot_name = "";
+// annot_type = "";
+// }
+// else // else get annot type
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// //an_annot.type = annot_value;
+// annot_type = annot_value;
+// if (space_split_i == std::string::npos) // no entry for name
+// {
+// std::cout << "seq, annots - no name\n";
+// annot_name = "";
+// }
+// else // get annot name
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// // this seems like its wrong?
+// annot_type = annot_value;
+// }
+// }
+// add_annotation(annot_name, annot_type, annot_begin, annot_end);
+// }
+// //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+// // << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+// }
+// }
+// load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
{
- fs::fstream load_file;
+ boost::shared_ptr<Sequence> seq(new Sequence);
std::string file_data_line;
int seq_counter;
- annot an_annot;
+ //annot an_annot;
+ int annot_begin;
+ int annot_end;
+ std::string annot_name;
+ std::string annot_type;
+
std::string::size_type space_split_i;
std::string annot_value;
- annots.clear();
- load_file.open(load_file_path, std::ios::in);
+ //seq->annotation_list.reset(new SeqSpanRefList);
seq_counter = 0;
- // search for the seq_num-th sequence
+ // search for the next sequence
+ int seq_num = 1;
while ( (!load_file.eof()) && (seq_counter < seq_num) )
{
getline(load_file,file_data_line);
if (file_data_line == "<Sequence>")
seq_counter++;
}
+
+ // Could not find next sequence
+ if (load_file.eof())
+ {
+ seq.reset();
+ return seq;
+ }
+
getline(load_file, file_data_line);
- sequence = file_data_line;
+ // looks like the sequence is written as a single line
+ seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
{
getline(load_file, file_data_line);
- species = file_data_line;
+ seq->set_species(file_data_line);
while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
{
getline(load_file,file_data_line);
// get annot start index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.start = atoi (annot_value.c_str());
+ annot_begin = atoi (annot_value.c_str());
file_data_line = file_data_line.substr(space_split_i+1);
// get annot end index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
+ annot_end = atoi (annot_value.c_str());
if (space_split_i == std::string::npos) // no entry for type or name
{
std::cout << "seq, annots - no type or name\n";
- an_annot.type = "";
- an_annot.name = "";
+ annot_name = "";
+ annot_type = "";
}
else // else get annot type
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ //an_annot.type = annot_value;
+ annot_type = annot_value;
if (space_split_i == std::string::npos) // no entry for name
{
std::cout << "seq, annots - no name\n";
- an_annot.name = "";
+ annot_name = "";
}
else // get annot name
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ // this seems like its wrong?
+ annot_type = annot_value;
}
}
- annots.push_back(an_annot); // don't forget to actually add the annot
+ seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
}
//std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
// << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
}
}
- load_file.close();
+ //load_file.close();
+ return seq;
}
-std::string
-Sequence::rc_motif(std::string a_motif)
-{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = a_motif.length();
- rev_comp.reserve(len);
-
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
- }
- // add end of std::string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // add in the characters for the bases we want to convert (IUPAC)
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
- conversionTable[(int)'M'] = 'K';
- conversionTable[(int)'R'] = 'Y';
- conversionTable[(int)'W'] = 'W';
- conversionTable[(int)'S'] = 'S';
- conversionTable[(int)'Y'] = 'R';
- conversionTable[(int)'K'] = 'M';
- conversionTable[(int)'V'] = 'B';
- conversionTable[(int)'H'] = 'D';
- conversionTable[(int)'D'] = 'H';
- conversionTable[(int)'B'] = 'V';
-
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
- {
- //std::cout << "** i = " << seq_i << " bp = " <<
- table_i = (int) a_motif[seq_i];
- rev_comp += conversionTable[table_i];
- }
-
- //std::cout << "seq: " << a_motif << std::endl;
- //std::cout << "rc: " << rev_comp << std::endl;
-
- return rev_comp;
-}
-
-std::string
-Sequence::motif_normalize(std::string a_motif)
-{
- std::string valid_motif;
- int seq_i, len;
-
- len = a_motif.length();
- valid_motif.reserve(len);
-
- // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
- // current nonIUPAC symbols are omitted, which is not reported atm
- for(seq_i = 0; seq_i < len; seq_i++)
- {
- if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
- valid_motif += 'A';
- else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
- valid_motif += 'T';
- else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
- valid_motif += 'C';
- else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
- valid_motif += 'N';
- else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
- valid_motif += 'M';
- else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
- valid_motif += 'R';
- else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
- valid_motif += 'W';
- else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
- valid_motif += 'S';
- else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
- valid_motif += 'Y';
- else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
- valid_motif += 'V';
- else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
- valid_motif += 'H';
- else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
- valid_motif += 'D';
- else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
- valid_motif += 'B';
- else {
- std::string msg = "Letter ";
- msg += a_motif[seq_i];
- msg += " is not a valid IUPAC symbol";
- throw motif_normalize_error(msg);
- }
- }
- //std::cout << "valid_motif is: " << valid_motif << std::endl;
- return valid_motif;
-}
-
-void Sequence::add_motif(std::string a_motif)
+void Sequence::add_motif(const Sequence& a_motif)
{
std::vector<int> motif_starts = find_motif(a_motif);
motif_start_i != motif_starts.end();
++motif_start_i)
{
- motif_list.push_back(motif(*motif_start_i, a_motif));
+ motif_list->push_back(motif(*motif_start_i, a_motif.get_sequence()));
}
}
void Sequence::clear_motifs()
{
- motif_list.clear();
+ if (motif_list)
+ motif_list->clear();
+ else
+ motif_list.reset(new MotifList);
}
-const std::list<motif>& Sequence::motifs() const
+const Sequence::MotifList& Sequence::motifs() const
{
- return motif_list;
+ return *motif_list;
}
std::vector<int>
-Sequence::find_motif(std::string a_motif)
+Sequence::find_motif(const Sequence& a_motif) const
{
std::vector<int> motif_match_starts;
- std::string a_motif_rc;
+ Sequence norm_motif_rc;
motif_match_starts.clear();
+ // std::cout << "motif is: " << norm_motif << std::endl;
- //std::cout << "motif is: " << a_motif << std::endl;
- a_motif = motif_normalize(a_motif);
- //std::cout << "motif is: " << a_motif << std::endl;
-
- if (a_motif != "")
+ if (a_motif.size() > 0)
{
//std::cout << "Sequence: none blank motif\n";
motif_scan(a_motif, &motif_match_starts);
- a_motif_rc = rc_motif(a_motif);
+ norm_motif_rc = a_motif.rev_comp();;
// make sure not to do search again if it is a palindrome
- if (a_motif_rc != a_motif)
- motif_scan(a_motif_rc, &motif_match_starts);
+ if (norm_motif_rc != a_motif) {
+ motif_scan(norm_motif_rc, &motif_match_starts);
+ }
}
return motif_match_starts;
}
void
-Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts)
+Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
{
- char * seq_c;
- std::string::size_type seq_i;
- int motif_i, motif_len;
-
- // faster to loop thru the sequence as a old c std::string (ie char array)
- seq_c = (char*)sequence.c_str();
- //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl;
- motif_len = a_motif.length();
-
- //std::cout << "motif_length: " << motif_len << std::endl;
- //std::cout << "RAAARRRRR\n";
-
- motif_i = 0;
+ // if there's no sequence we can't scan for it?
+ // should this throw an exception?
+ if (!seq) return;
- //std::cout << "motif: " << a_motif << std::endl;
+ std::string::size_type seq_i = 0;
+ Sequence::size_type motif_i = 0;
+ Sequence::size_type motif_len = a_motif.length();
+ Sequence::value_type motif_char;
+ Sequence::value_type seq_char;
- //std::cout << "Sequence: motif, length= " << length << std::endl;
- seq_i = 0;
- while (seq_i < sequence.length())
+ while (seq_i < size())
{
- //std::cout << seq_c[seq_i];
- //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
- if (a_motif[motif_i] =='N')
+ motif_char = toupper(a_motif[motif_i]);
+ seq_char = toupper(seq->at(seq_i));
+ if (motif_char =='N')
motif_i++;
- else if (a_motif[motif_i] == seq_c[seq_i])
+ else if (motif_char == seq_char)
motif_i++;
- else if ((a_motif[motif_i] =='M') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+ else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
motif_i++;
- else if ((a_motif[motif_i] =='R') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='W') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='S') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='Y') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='K') &&
- ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='V') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='G')))
+ else if ((motif_char =='V') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[seq_i] =='H') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='H') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='D') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='D') &&
+ ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='B') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='B') &&
+ ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
-
else
{
+ // if a motif doesn't match, erase our current trial and try again
seq_i -= motif_i;
motif_i = 0;
}
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- //std::cout << "!!";
- annot new_motif;
motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
}
{
std::vector<int> seq_starts = find_motif(a_seq);
- //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
-
for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
seq_start_i != seq_starts.end();
++seq_start_i)
{
- annots.push_back(annot(*seq_start_i,
- *seq_start_i+a_seq.size(),
- "",
- name));
+ add_annotation(name, "", *seq_start_i, *seq_start_i+a_seq.size());
}
}
std::list<Sequence>::iterator end)
{
while (start != end) {
- add_string_annotation(start->get_seq(), start->get_header());
+ add_string_annotation(start->get_sequence(), start->get_fasta_header());
++start;
}
}
+
+std::ostream& operator<<(std::ostream& out, const Sequence& s)
+{
+ if (s.seq) {
+ for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
+ out << *s_i;
+ }
+ }
+ return out;
+}
+
+bool operator<(const Sequence& x, const Sequence& y)
+{
+ Sequence::const_iterator x_i = x.begin();
+ Sequence::const_iterator y_i = y.begin();
+ // for sequences there's some computation associated with computing .end
+ // so lets cache it.
+ Sequence::const_iterator xend = x.end();
+ Sequence::const_iterator yend = y.end();
+ while(1) {
+ if( x_i == xend and y_i == yend ) {
+ return false;
+ } else if ( x_i == xend ) {
+ return true;
+ } else if ( y_i == yend ) {
+ return false;
+ } else if ( (*x_i) < (*y_i)) {
+ return true;
+ } else if ( (*x_i) > (*y_i) ) {
+ return false;
+ } else {
+ ++x_i;
+ ++y_i;
+ }
+ }
+}
+
+template <typename Iter1, typename Iter2>
+static
+bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
+{
+ Iter1 aseq_i = abegin;
+ Iter2 bseq_i = bbegin;
+ if (aend-abegin == bend-bbegin) {
+ // since the length of the two sequences is equal, we only need to
+ // test one.
+ for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
+ if (toupper(*aseq_i) != toupper(*bseq_i)) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool operator==(const Sequence& x, const Sequence& y)
+{
+ if (x.seq and y.seq) {
+ // both x and y are defined
+ if (SeqSpan::isFamily(x.seq, y.seq)) {
+ // both are part of the same SeqSpan tree
+ return *(x.seq) == *(y.seq);
+ } else {
+ // we'll have to do a real comparison
+ return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
+ x.begin(), x.end(),
+ y.begin(), y.end()
+ );
+ }
+ } else {
+ // true if they're both empty (with either a null SeqSpanRef or
+ // a zero length string
+ return (x.size() == y.size());
+ }
+}
+
+bool operator!=(const Sequence& x, const Sequence& y)
+{
+ return not operator==(x, y);
+}
+