// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+#include <boost/filesystem/operations.hpp>
namespace fs = boost::filesystem;
#include <boost/spirit/core.hpp>
namespace spirit = boost::spirit;
#include "alg/sequence.hpp"
+#include "io.hpp"
#include "mussa_exceptions.hpp"
#include <string>
#include <sstream>
#include <set>
-annot::annot()
- : begin(0),
- end(0),
- type(""),
- name("")
-{
-}
-
-annot::annot(int begin, int end, std::string type, std::string name)
- : begin(begin),
- end(end),
- type(type),
- name(name)
-{
-}
-
-annot::~annot()
-{
-}
-
-bool operator==(const annot& left, const annot& right)
+bool operator==(const motif& left, const motif& right)
{
return ((left.begin== right.begin) and
(left.end == right.end) and
(left.name == right.name));
}
-motif::motif(int begin, std::string motif)
- : annot(begin, begin+motif.size(), "motif", motif),
+motif::motif()
+ : begin(0),
+ end(0),
+ type("motif"),
+ name(""),
+ sequence("")
+{
+}
+
+motif::motif(int begin_, std::string motif)
+ : begin(begin_),
+ end(begin_+motif.size()),
+ type("motif"),
+ name(motif),
sequence(motif)
{
}
{
}
-
-Sequence::Sequence(alphabet_ref alphabet_)
- : parent(0),
- alphabet(alphabet_),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand)
+Sequence::Sequence(AlphabetRef alphabet)
+ : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
}
{
}
-Sequence::Sequence(const char *seq, alphabet_ref alphabet_)
- : parent(0),
- alphabet(alphabet_),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand),
- header(""),
- species("")
+Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- set_filtered_sequence(seq, alphabet);
+ set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
}
-Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_)
- : parent(0),
- alphabet(alphabet_),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand),
- header(""),
- species("")
+Sequence::Sequence(const std::string& seq,
+ AlphabetRef alphabet_,
+ SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- set_filtered_sequence(seq, alphabet);
+ set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
}
Sequence::Sequence(const Sequence& o)
- : parent(o.parent),
- seq(o.seq),
- alphabet(o.alphabet),
- seq_start(o.seq_start),
- seq_count(o.seq_count),
- strand(o.strand),
+ : seq(o.seq),
header(o.header),
species(o.species),
- annots(o.annots),
+ annotation_list(o.annotation_list),
motif_list(o.motif_list)
{
}
+Sequence::Sequence(const Sequence* o)
+ : seq(o->seq),
+ header(o->header),
+ species(o->species),
+ annotation_list(o->annotation_list),
+ motif_list(o->motif_list)
+{
+}
+
+Sequence::Sequence(const SequenceRef o)
+ : seq(new SeqSpan(o->seq)),
+ header(o->header),
+ species(o->species),
+ annotation_list(new SeqSpanRefList),
+ motif_list(o->motif_list)
+{
+ // copy over the annotations in the other sequence ref,
+ // attaching them to our current sequence ref
+ for(SeqSpanRefList::const_iterator annot_i = o->annotation_list->begin();
+ annot_i != o->annotation_list->end();
+ ++annot_i)
+ {
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_count = (*annot_i)->size();
+
+ SeqSpanRef new_annot(seq->subseq(annot_begin, annot_count));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ annotation_list->push_back(new_annot);
+ }
+}
+
+Sequence::Sequence(const SeqSpanRef& seq_ref)
+ : seq(seq_ref),
+ header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
+{
+}
+
Sequence &Sequence::operator=(const Sequence& s)
{
if (this != &s) {
- parent = s.parent;
seq = s.seq;
- alphabet = s.alphabet;
- seq_start = s.seq_start;
- seq_count = s.seq_count;
- strand = s.strand;
header = s.header;
species = s.species;
- annots = s.annots;
+ annotation_list = s.annotation_list;
motif_list = s.motif_list;
}
return *this;
}
-static void multiplatform_getline(std::istream& in, std::string& line)
-{
- line.clear();
- char c;
- in.get(c);
- while(in.good() and !(c == '\012' or c == '\015') ) {
- line.push_back(c);
- in.get(c);
- }
- // if we have cr-lf eat it
- c = in.peek();
- if (c=='\012' or c == '\015') {
- in.get();
- }
-}
-
void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
{
- load_fasta(file_path, alphabet, seq_num, start_index, end_index);
+ load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
}
//! load a fasta file into a sequence
-void Sequence::load_fasta(fs::path file_path, alphabet_ref a,
+void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
int seq_num, int start_index, int end_index)
{
fs::fstream data_file;
errormsg << file_path.native_file_string()
<< " did not have any fasta sequences" << std::endl;
throw sequence_empty_file_error(errormsg.str());
+ } catch(sequence_invalid_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string();
+ msg << " " << e.what();
+ throw sequence_invalid_load_error(msg.str());
}
}
}
-void Sequence::load_fasta(std::iostream& file,
+void Sequence::load_fasta(std::istream& file,
int seq_num, int start_index, int end_index)
{
- load_fasta(file, alphabet, seq_num, start_index, end_index);
+ load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
}
void
-Sequence::load_fasta(std::iostream& data_file, alphabet_ref a,
+Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
int seq_num,
int start_index, int end_index)
{
std::string file_data_line;
int header_counter = 0;
+ size_t line_counter = 0;
bool read_seq = true;
std::string rev_comp;
std::string sequence_raw;
std::string seq_tmp; // holds sequence during basic filtering
- const Alphabet &alpha = get_alphabet(a);
+ const Alphabet &alpha = Alphabet::get_alphabet(a);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
while ( (!data_file.eof()) && (header_counter < seq_num) )
{
multiplatform_getline(data_file, file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
header_counter++;
}
while ( !data_file.eof() && read_seq ) {
multiplatform_getline(data_file,file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
read_seq = false;
else {
if(alpha.exists(*line_i)) {
sequence_raw += *line_i;
} else {
- throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
+ std::ostringstream msg;
+ msg << "Unrecognized characters in fasta sequence at line ";
+ msg << line_counter;
+ throw sequence_invalid_load_error(msg.str());
}
}
}
std::string msg("The selected sequence appears to be empty");
throw sequence_empty_error(msg);
}
- set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
+ set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
} else {
std::string errormsg("There were no fasta sequences");
throw sequence_empty_file_error(errormsg);
}
void Sequence::set_filtered_sequence(const std::string &in_seq,
- alphabet_ref alphabet_,
+ AlphabetRef alphabet_,
size_type start,
size_type count,
- strand_type strand_)
+ SeqSpan::strand_type strand_)
{
- alphabet = alphabet_;
if ( count == npos)
count = in_seq.size() - start;
- boost::shared_ptr<seq_string> new_seq(new seq_string);
- new_seq->reserve(count);
+ std::string new_seq;
+ new_seq.reserve(count);
// finally, the actual conversion loop
- const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets
+ const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
std::string::const_iterator seq_i = in_seq.begin()+start;
for(size_type i = 0; i != count; ++i, ++seq_i)
{
if (alpha_impl.exists(*seq_i)) {
- new_seq->append(1, *seq_i);
+ new_seq.append(1, toupper(*seq_i));
} else {
- new_seq->append(1, 'N');
+ new_seq.append(1, 'N');
}
}
- parent = 0;
- seq = new_seq;
- seq_start = 0;
- seq_count = count;
- strand = strand_;
+ SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
+ seq = new_seq_ref;
}
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
+ if (not fs::exists(file_path)) {
+ throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
+ }
+ if (fs::is_directory(file_path)) {
+ throw mussa_load_error(file_path.string() +
+ " is a directory, please provide a file for annotations."
+ );
+ }
fs::fstream data_stream(file_path, std::ios::in);
if (!data_stream)
{
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
+ throw mussa_load_error("Error loading annotation file " + file_path.string());
+ }
+
+ try {
+ load_annot(data_stream, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
}
+ data_stream.close();
+}
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
// so i should probably be passing the parse function some iterators
// but the annotations files are (currently) small, so i think i can
// get away with loading the whole file into memory
data_stream.get(c);
data.push_back(c);
}
- data_stream.close();
-
+
parse_annot(data, start_index, end_index);
}
*/
struct push_back_annot {
- std::list<annot>& annot_list;
+ Sequence* parent;
+ SeqSpanRefListRef children;
int& begin;
int& end;
std::string& name;
std::string& type;
+ int &parsed;
- push_back_annot(std::list<annot>& annot_list_,
+ push_back_annot(Sequence* parent_seq,
+ SeqSpanRefListRef children_list,
int& begin_,
int& end_,
std::string& name_,
- std::string& type_)
- : annot_list(annot_list_),
+ std::string& type_,
+ int &parsed_)
+ : parent(parent_seq),
+ children(children_list),
begin(begin_),
end(end_),
name(name_),
- type(type_)
+ type(type_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
- //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
- annot_list.push_back(annot(begin, end, name, type));
+ children->push_back(parent->make_annotation(name, type, begin, end));
+ ++parsed;
};
};
std::list<Sequence>& seq_list;
std::string& name;
std::string& seq;
+ int &parsed;
push_back_seq(std::list<Sequence>& seq_list_,
std::string& name_,
- std::string& seq_)
+ std::string& seq_,
+ int &parsed_)
: seq_list(seq_list_),
name(name_),
- seq(seq_)
+ seq(seq_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
+ std::string::iterator seq_i = seq.begin();
+ std::string::iterator seq_end = seq.end();
+
+ // this if block is a hack, for some reason spirit was
+ // duplicating the last character if the file didn't end
+ // with a new line.
+ // this checks for the trailing newline, and if it is missing
+ // removes the last character ( which should be the duplicated character.
+ // check test_sequence.cpp:sequence_no_trailing_newline for test case
+ // also see ticket:265 for more information
+ if (seq.size() > 0) {
+ std::string::value_type c = seq[seq.size()-1];
+ if (not (c == '\015' or c == '\012')) {
+ // doesn't end with a new line character
+ seq_end--;
+ }
+ }
+ // end hack
+
// filter out newlines from our sequence
std::string new_seq;
- for(std::string::const_iterator seq_i = seq.begin();
- seq_i != seq.end();
- ++seq_i)
+ for(; seq_i != seq_end; ++seq_i)
{
if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
}
//std::cout << "adding seq: " << name << " " << new_seq << std::endl;
-
Sequence s(new_seq);
s.set_fasta_header(name);
seq_list.push_back(s);
+ ++parsed;
};
};
-bool
+void
Sequence::parse_annot(std::string data, int start_index, int end_index)
{
int start=0;
int end=0;
std::string name;
std::string type;
- std::string seq;
+ std::string seqstr;
+ SeqSpanRefListRef parsed_annots(new SeqSpanRefList);
std::list<Sequence> query_seqs;
-
- bool status = spirit::parse(data.begin(), data.end(),
- (
- //begin grammar
- !(
- (
- spirit::alpha_p >>
- +(spirit::graph_p)
- )[spirit::assign_a(species)] >>
- +(spirit::space_p)
- ) >>
- *(
- ( // ignore html tags
- *(spirit::space_p) >>
- spirit::ch_p('<') >>
- +(~spirit::ch_p('>')) >>
- spirit::ch_p('>') >>
- *(spirit::space_p)
- )
- |
- ( // parse an absolute location name
- (spirit::uint_p[spirit::assign_a(start)] >>
- +spirit::space_p >>
- spirit::uint_p[spirit::assign_a(end)] >>
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(name)] >>
- // optional type
- !(
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(type)]
- )
- // to understand how this group gets set
- // read the comment above struct push_back_annot
- )[push_back_annot(annots, start, end, type, name)]
- |
- ((spirit::ch_p('>')|spirit::str_p(">")) >>
- (*(spirit::print_p))[spirit::assign_a(name)] >>
- spirit::eol_p >>
- (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
- )[push_back_seq(query_seqs, name, seq)]
- ) >>
- *spirit::space_p
+ int parsed=0;
+
+ bool ok = spirit::parse(data.begin(), data.end(),
+ (
+ //begin grammar
+ !(
+ (
+ spirit::alpha_p >>
+ +(spirit::graph_p)
+ )[spirit::assign_a(species)] >>
+ +(spirit::space_p)
+ ) >>
+ *(
+ ( // ignore html tags
+ *(spirit::space_p) >>
+ spirit::ch_p('<') >>
+ +(~spirit::ch_p('>')) >>
+ spirit::ch_p('>') >>
+ *(spirit::space_p)
)
- //end grammar
- )).full;
-
- // go seearch for query sequences
+ |
+ ( // parse an absolute location name
+ (spirit::uint_p[spirit::assign_a(start)] >>
+ +spirit::space_p >>
+ spirit::uint_p[spirit::assign_a(end)] >>
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(name)] >>
+ // optional type
+ !(
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(type)]
+ )
+ // to understand how this group gets set
+ // read the comment above struct push_back_annot
+ )[push_back_annot(this, parsed_annots, start, end, name, type, parsed)]
+ |
+ ((spirit::ch_p('>')|spirit::str_p(">")) >>
+ (*(spirit::print_p))[spirit::assign_a(name)] >>
+ spirit::eol_p >>
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seqstr)]
+ )[push_back_seq(query_seqs, name, seqstr, parsed)]
+ ) >>
+ *spirit::space_p
+ )
+ //end grammar
+ )).full;
+ if (not ok) {
+ std::stringstream msg;
+ msg << "Error parsing annotation #" << parsed;
+ throw annotation_load_error(msg.str());
+ }
+ // If everything loaded correctly add the sequences to our annotation list
+ // add newly parsed annotations to our sequence
+ std::copy(parsed_annots->begin(), parsed_annots->end(), std::back_inserter(*annotation_list));
+ // go search for query sequences
find_sequences(query_seqs.begin(), query_seqs.end());
- return status;
}
-void Sequence::add_annotation(const annot& a)
+void Sequence::add_annotation(const SeqSpanRef a)
{
- annots.push_back(a);
+ annotation_list->push_back(a);
}
-const std::list<annot>& Sequence::annotations() const
+void Sequence::add_annotation(std::string name, std::string type, size_type start, size_type stop)
{
- return annots;
+ add_annotation(make_annotation(name, type, start, stop));
}
-Sequence
-Sequence::subseq(int start, int count)
+SeqSpanRef
+Sequence::make_annotation(std::string name, std::string type, size_type start, size_type stop) const
{
- if (!seq) {
- Sequence new_seq;
- return new_seq;
+ // we want things to be in the positive direction
+ if (stop < start) {
+ size_type tmp = start;
+ start = stop;
+ stop = tmp;
}
+ size_type count = stop - start;
+ SeqSpanRef new_annot(seq->subseq(start, count, SeqSpan::UnknownStrand));
+ AnnotationsRef metadata(new Annotations(name));
+ metadata->set("type", type);
+ new_annot->setAnnotations(metadata);
+ return new_annot;
+}
- // there might be an off by one error with start+count > size()
- if ( count == npos || start+count > size()) {
- count = size()-start;
- }
- Sequence new_seq(*this);
- new_seq.parent = this;
- new_seq.seq_start = seq_start+start;
- new_seq.seq_count = count;
+const SeqSpanRefList& Sequence::annotations() const
+{
+ return *annotation_list;
+}
+void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
+{
new_seq.motif_list = motif_list;
- new_seq.annots.clear();
- // attempt to copy & reannotate position based annotations
- int end = start+count;
+ new_seq.annotation_list.reset(new SeqSpanRefList);
- for(std::list<annot>::const_iterator annot_i = annots.begin();
- annot_i != annots.end();
+ for(SeqSpanRefList::const_iterator annot_i = annotation_list->begin();
+ annot_i != annotation_list->end();
++annot_i)
{
- int annot_begin= annot_i->begin;
- int annot_end = annot_i->end;
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_end = (*annot_i)->stop();
- if (annot_begin < end) {
+ if (annot_begin < start+count) {
if (annot_begin >= start) {
annot_begin -= start;
} else {
annot_begin = 0;
}
- if (annot_end < end) {
+ if (annot_end < start+count) {
annot_end -= start;
} else {
annot_end = count;
}
-
- annot new_annot(annot_begin, annot_end, annot_i->type, annot_i->name);
- new_seq.annots.push_back(new_annot);
+ SeqSpanRef new_annot(new_seq.seq->subseq(annot_begin, annot_end));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ new_seq.annotation_list->push_back(new_annot);
}
}
+}
+
+Sequence
+Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
+{
+ // FIXME: should i really allow a subsequence of an empty sequence?
+ if (!seq) {
+ Sequence new_seq;
+ return new_seq;
+ }
+ Sequence new_seq(*this);
+ new_seq.seq = seq->subseq(start, count, strand);
+ if (seq->annotations()) {
+ AnnotationsRef a(new Annotations(*(seq->annotations())));
+ new_seq.seq->setAnnotations(a);
+ }
+ copy_children(new_seq, start, count);
+
return new_seq;
}
-std::string Sequence::create_reverse_map() const
+
+// FIXME: This needs to be moved into SeqSpan
+Sequence Sequence::rev_comp() const
{
- std::string rc_map(256, '~');
- // if we're rna, use U instead of T
- // we might want to add an "is_rna" to sequence at somepoint
- char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T';
- char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't';
- rc_map['A'] = TU ; rc_map['a'] = tu ;
- rc_map['T'] = 'A'; rc_map['t'] = 'a';
- rc_map['U'] = 'A'; rc_map['u'] = 'a';
- rc_map['G'] = 'C'; rc_map['g'] = 'c';
- rc_map['C'] = 'G'; rc_map['c'] = 'g';
- rc_map['M'] = 'K'; rc_map['m'] = 'k';
- rc_map['R'] = 'Y'; rc_map['r'] = 'y';
- rc_map['W'] = 'W'; rc_map['w'] = 'w';
- rc_map['S'] = 'S'; rc_map['s'] = 's';
- rc_map['Y'] = 'R'; rc_map['y'] = 'r';
- rc_map['K'] = 'M'; rc_map['k'] = 'm';
- rc_map['V'] = 'B'; rc_map['v'] = 'b';
- rc_map['H'] = 'D'; rc_map['h'] = 'd';
- rc_map['D'] = 'H'; rc_map['d'] = 'h';
- rc_map['B'] = 'V'; rc_map['b'] = 'v';
- rc_map['N'] = 'N'; rc_map['n'] = 'n';
- rc_map['X'] = 'X'; rc_map['x'] = 'x';
- rc_map['?'] = '?';
- rc_map['.'] = '.';
- rc_map['-'] = '-';
- rc_map['~'] = '~'; // not really needed, but perhaps it's clearer.
- return rc_map;
+ // a reverse complement is the whole opposite strand
+ return subseq(0, npos, SeqSpan::OppositeStrand);
}
-Sequence Sequence::rev_comp() const
+const Alphabet& Sequence::get_alphabet() const
{
- std::string rev_comp;
- rev_comp.reserve(length());
-
- std::string rc_map = create_reverse_map();
-
- // reverse and convert
- seq_string::const_reverse_iterator seq_i;
- seq_string::const_reverse_iterator seq_end = seq->rend();
- for(seq_i = seq->rbegin();
- seq_i != seq_end;
- ++seq_i)
- {
- rev_comp.append(1, rc_map[*seq_i]);
- }
- return Sequence(rev_comp, alphabet);
+ return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
}
void Sequence::set_fasta_header(std::string header_)
return "";
}
-const Alphabet& Sequence::get_alphabet() const
-{
- return get_alphabet(alphabet);
-}
-
-const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const
-{
- switch (alpha) {
- case reduced_dna_alphabet:
- return Alphabet::reduced_dna_alphabet;
- case reduced_rna_alphabet:
- return Alphabet::reduced_rna_alphabet;
- case reduced_nucleic_alphabet:
- return Alphabet::reduced_nucleic_alphabet;
- case nucleic_alphabet:
- return Alphabet::nucleic_alphabet;
- case protein_alphabet:
- return Alphabet::protein_alphabet;
- default:
- throw std::runtime_error("unrecognized alphabet type");
- break;
- }
-}
-
-void Sequence::set_sequence(const std::string& s, alphabet_ref a)
+void Sequence::set_sequence(const std::string& s, AlphabetRef a)
{
- set_filtered_sequence(s, a);
+ set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
}
std::string Sequence::get_sequence() const
{
- if (seq)
- return *seq;
- else
- return std::string();
+ return seq->sequence();
}
Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
return at(i);
}
-Sequence::const_reference Sequence::at(Sequence::size_type i) const
-{
- if (!seq) throw std::out_of_range("empty sequence");
- return seq->at(i+seq_start);
-}
-
void
Sequence::clear()
{
- parent = 0;
seq.reset();
- seq_start = 0;
- seq_count = 0;
- strand = UnknownStrand;
header.clear();
species.clear();
- annots.clear();
- motif_list.clear();
-}
-
-const char *Sequence::c_str() const
-{
- if (seq)
- return seq->c_str()+seq_start;
- else
- return 0;
-}
-
-Sequence::const_iterator Sequence::begin() const
-{
- if (seq and seq_count != 0)
- return seq->begin()+seq_start;
- else
- return Sequence::const_iterator(0);
-}
-
-Sequence::const_iterator Sequence::end() const
-{
- if (seq and seq_count != 0) {
- return seq->begin() + seq_start + seq_count;
- } else {
- return Sequence::const_iterator(0);
- }
-}
-
-bool Sequence::empty() const
-{
- return (seq_count == 0) ? true : false;
-}
-
-Sequence::size_type Sequence::start() const
-{
- if (parent)
- return seq_start - parent->start();
- else
- return seq_start;
-}
-
-Sequence::size_type Sequence::stop() const
-{
- return start() + seq_count;
-}
-
-Sequence::size_type Sequence::size() const
-{
- return seq_count;
-}
-
-Sequence::size_type Sequence::length() const
-{
- return size();
+ annotation_list.reset(new SeqSpanRefList);
+ motif_list.reset(new MotifList);
}
void
Sequence::save(fs::fstream &save_file)
{
+ std::string type("type");
+ std::string empty_str("");
//fstream save_file;
- std::list<annot>::iterator annots_i;
-
- // not sure why, or if i'm doing something wrong, but can't seem to pass
- // file pointers down to this method from the mussa control class
- // so each call to save a sequence appends to the file started by mussa_class
- //save_file.open(save_file_path.c_str(), std::ios::app);
+ SeqSpanRefList::iterator annots_i;
+ AnnotationsRef metadata;
save_file << "<Sequence>" << std::endl;
save_file << *this << std::endl;
save_file << "<Annotations>" << std::endl;
save_file << species << std::endl;
- for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
+ for (annots_i = annotation_list->begin();
+ annots_i != annotation_list->end();
+ ++annots_i)
{
- save_file << annots_i->begin << " " << annots_i->end << " " ;
- save_file << annots_i->name << " " << annots_i->type << std::endl;
+ metadata = (*annots_i)->annotations();
+ save_file << (*annots_i)->parentStart() << " " << (*annots_i)->parentStop() << " " ;
+ save_file << metadata->name() << " "
+ << metadata->getdefault(type, empty_str) << std::endl;
}
save_file << "</Annotations>" << std::endl;
//save_file.close();
}
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+// fs::fstream load_file;
+// std::string file_data_line;
+// int seq_counter;
+// //annot an_annot;
+// int annot_begin;
+// int annot_end;
+// std::string annot_name;
+// std::string annot_type;
+//
+// std::string::size_type space_split_i;
+// std::string annot_value;
+//
+// annotation_list.reset(new SeqSpanRefList);
+//
+// load_file.open(load_file_path, std::ios::in);
+//
+// seq_counter = 0;
+// // search for the seq_num-th sequence
+// while ( (!load_file.eof()) && (seq_counter < seq_num) )
+// {
+// getline(load_file,file_data_line);
+// if (file_data_line == "<Sequence>")
+// seq_counter++;
+// }
+// getline(load_file, file_data_line);
+// // looks like the sequence is written as a single line
+// set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+// getline(load_file, file_data_line);
+// getline(load_file, file_data_line);
+// if (file_data_line == "<Annotations>")
+// {
+// getline(load_file, file_data_line);
+// species = file_data_line;
+// while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+// {
+// getline(load_file,file_data_line);
+// if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+// {
+// // need to get 4 values...almost same code 4 times...
+// // get annot start index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_begin = atoi (annot_value.c_str());
+// file_data_line = file_data_line.substr(space_split_i+1);
+// // get annot end index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_end = atoi (annot_value.c_str());
+//
+// if (space_split_i == std::string::npos) // no entry for type or name
+// {
+// std::cout << "seq, annots - no type or name\n";
+// annot_name = "";
+// annot_type = "";
+// }
+// else // else get annot type
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// //an_annot.type = annot_value;
+// annot_type = annot_value;
+// if (space_split_i == std::string::npos) // no entry for name
+// {
+// std::cout << "seq, annots - no name\n";
+// annot_name = "";
+// }
+// else // get annot name
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// // this seems like its wrong?
+// annot_type = annot_value;
+// }
+// }
+// add_annotation(annot_name, annot_type, annot_begin, annot_end);
+// }
+// //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+// // << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+// }
+// }
+// load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
{
- fs::fstream load_file;
+ boost::shared_ptr<Sequence> seq(new Sequence);
std::string file_data_line;
int seq_counter;
- annot an_annot;
+ //annot an_annot;
+ int annot_begin;
+ int annot_end;
+ std::string annot_name;
+ std::string annot_type;
+
std::string::size_type space_split_i;
std::string annot_value;
- annots.clear();
- load_file.open(load_file_path, std::ios::in);
+ //seq->annotation_list.reset(new SeqSpanRefList);
seq_counter = 0;
- // search for the seq_num-th sequence
+ // search for the next sequence
+ int seq_num = 1;
while ( (!load_file.eof()) && (seq_counter < seq_num) )
{
getline(load_file,file_data_line);
if (file_data_line == "<Sequence>")
seq_counter++;
}
+
+ // Could not find next sequence
+ if (load_file.eof())
+ {
+ seq.reset();
+ return seq;
+ }
+
getline(load_file, file_data_line);
// looks like the sequence is written as a single line
- set_filtered_sequence(file_data_line, reduced_dna_alphabet);
+ seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
{
getline(load_file, file_data_line);
- species = file_data_line;
+ seq->set_species(file_data_line);
while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
{
getline(load_file,file_data_line);
// get annot start index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.begin = atoi (annot_value.c_str());
+ annot_begin = atoi (annot_value.c_str());
file_data_line = file_data_line.substr(space_split_i+1);
// get annot end index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
+ annot_end = atoi (annot_value.c_str());
if (space_split_i == std::string::npos) // no entry for type or name
{
std::cout << "seq, annots - no type or name\n";
- an_annot.type = "";
- an_annot.name = "";
+ annot_name = "";
+ annot_type = "";
}
else // else get annot type
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ //an_annot.type = annot_value;
+ annot_type = annot_value;
if (space_split_i == std::string::npos) // no entry for name
{
std::cout << "seq, annots - no name\n";
- an_annot.name = "";
+ annot_name = "";
}
else // get annot name
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ // this seems like its wrong?
+ annot_type = annot_value;
}
}
- annots.push_back(an_annot); // don't forget to actually add the annot
+ seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
}
//std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
// << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
}
}
- load_file.close();
+ //load_file.close();
+ return seq;
}
motif_start_i != motif_starts.end();
++motif_start_i)
{
- motif_list.push_back(motif(*motif_start_i, a_motif.get_sequence()));
+ motif_list->push_back(motif(*motif_start_i, a_motif.get_sequence()));
}
}
void Sequence::clear_motifs()
{
- motif_list.clear();
+ if (motif_list)
+ motif_list->clear();
+ else
+ motif_list.reset(new MotifList);
}
-const std::list<motif>& Sequence::motifs() const
+const Sequence::MotifList& Sequence::motifs() const
{
- return motif_list;
+ return *motif_list;
}
std::vector<int>
Sequence::value_type motif_char;
Sequence::value_type seq_char;
- while (seq_i < seq->length())
+ while (seq_i < size())
{
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- annot new_motif;
motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
}
{
std::vector<int> seq_starts = find_motif(a_seq);
- //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
-
for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
seq_start_i != seq_starts.end();
++seq_start_i)
{
- annots.push_back(annot(*seq_start_i,
- *seq_start_i+a_seq.size(),
- "",
- name));
+ add_annotation(name, "", *seq_start_i, *seq_start_i+a_seq.size());
}
}
std::ostream& operator<<(std::ostream& out, const Sequence& s)
{
- for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
- out << *s_i;
+ if (s.seq) {
+ for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
+ out << *s_i;
+ }
}
return out;
}
}
}
-bool operator==(const Sequence& x, const Sequence& y)
+template <typename Iter1, typename Iter2>
+static
+bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
{
- if (x.empty() and y.empty()) {
- // if there's no sequence in either sequence structure, they're equal
+ Iter1 aseq_i = abegin;
+ Iter2 bseq_i = bbegin;
+ if (aend-abegin == bend-bbegin) {
+ // since the length of the two sequences is equal, we only need to
+ // test one.
+ for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
+ if (toupper(*aseq_i) != toupper(*bseq_i)) {
+ return false;
+ }
+ }
return true;
- } else if (x.empty() or y.empty()) {
- // if we fail the first test, and we discover one is empty,
- // we know they can't be equal. (and we need to do this
- // to prevent dereferencing an empty pointer)
- return false;
- } else if (x.seq_count != y.seq_count) {
- // if they're of different lenghts, they're not equal
+ } else {
return false;
}
- Sequence::const_iterator xseq_i = x.begin();
- Sequence::const_iterator yseq_i = y.begin();
- // since the length of the two sequences is equal, we only need to
- // test one.
- for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) {
- if (toupper(*xseq_i) != toupper(*yseq_i)) {
- return false;
+}
+
+bool operator==(const Sequence& x, const Sequence& y)
+{
+ if (x.seq and y.seq) {
+ // both x and y are defined
+ if (SeqSpan::isFamily(x.seq, y.seq)) {
+ // both are part of the same SeqSpan tree
+ return *(x.seq) == *(y.seq);
+ } else {
+ // we'll have to do a real comparison
+ return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
+ x.begin(), x.end(),
+ y.begin(), y.end()
+ );
}
+ } else {
+ // true if they're both empty (with either a null SeqSpanRef or
+ // a zero length string
+ return (x.size() == y.size());
}
- return true;
}
bool operator!=(const Sequence& x, const Sequence& y)
{
return not operator==(x, y);
}
+