// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+#include <boost/filesystem/operations.hpp>
namespace fs = boost::filesystem;
#include <boost/spirit/core.hpp>
namespace spirit = boost::spirit;
#include "alg/sequence.hpp"
+#include "io.hpp"
#include "mussa_exceptions.hpp"
#include <string>
+#include <stdexcept>
#include <iostream>
#include <sstream>
+#include <set>
-annot::annot()
- : begin(0),
- end(0),
- type(""),
- name("")
-{
-}
-
-annot::annot(int begin, int end, std::string type, std::string name)
- : begin(begin),
- end(end),
- type(type),
- name(name)
-{
-}
-
-annot::~annot()
-{
-}
-
-bool operator==(const annot& left, const annot& right)
+bool operator==(const motif& left, const motif& right)
{
return ((left.begin== right.begin) and
(left.end == right.end) and
(left.name == right.name));
}
-motif::motif(int begin, std::string motif)
- : annot(begin, begin+motif.size(), "motif", motif),
+motif::motif()
+ : begin(0),
+ end(0),
+ type("motif"),
+ name(""),
+ sequence("")
+{
+}
+
+motif::motif(int begin_, std::string motif)
+ : begin(begin_),
+ end(begin_+motif.size()),
+ type("motif"),
+ name(motif),
sequence(motif)
{
}
{
}
-const std::string Sequence::dna_alphabet("AaCcGgTtNn\012\015");
-const std::string Sequence::rna_alphabet("AaCcGgNnUu\012\015");
- //! this is the general iupac alphabet for nucleotides
-const std::string Sequence::nucleic_iupac_alphabet("AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015");
- //! the protein alphabet
-const std::string Sequence::protein_alphabet("AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015");
-
-Sequence::Sequence()
- : parent(0),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand)
+Sequence::Sequence(AlphabetRef alphabet)
+ : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
}
{
}
-Sequence::Sequence(const char *seq)
- : parent(0),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand),
- header(""),
- species("")
+Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- set_filtered_sequence(seq);
+ set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
}
-Sequence::Sequence(const std::string& seq)
- : parent(0),
- seq_start(0),
- seq_count(0),
- strand(UnknownStrand),
- header(""),
- species("")
+Sequence::Sequence(const std::string& seq,
+ AlphabetRef alphabet_,
+ SeqSpan::strand_type strand_)
+ : header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
{
- set_filtered_sequence(seq);
+ set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
}
Sequence::Sequence(const Sequence& o)
- : parent(o.parent),
- seq(o.seq),
- seq_start(o.seq_start),
- seq_count(o.seq_count),
- strand(o.strand),
+ : seq(o.seq),
header(o.header),
species(o.species),
- annots(o.annots),
+ annotation_list(o.annotation_list),
motif_list(o.motif_list)
{
}
+Sequence::Sequence(const Sequence* o)
+ : seq(o->seq),
+ header(o->header),
+ species(o->species),
+ annotation_list(o->annotation_list),
+ motif_list(o->motif_list)
+{
+}
+
+Sequence::Sequence(const SequenceRef o)
+ : seq(new SeqSpan(o->seq)),
+ header(o->header),
+ species(o->species),
+ annotation_list(new SeqSpanRefList),
+ motif_list(o->motif_list)
+{
+ // copy over the annotations in the other sequence ref,
+ // attaching them to our current sequence ref
+ for(SeqSpanRefList::const_iterator annot_i = o->annotation_list->begin();
+ annot_i != o->annotation_list->end();
+ ++annot_i)
+ {
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_count = (*annot_i)->size();
+
+ SeqSpanRef new_annot(seq->subseq(annot_begin, annot_count));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ annotation_list->push_back(new_annot);
+ }
+}
+
+Sequence::Sequence(const SeqSpanRef& seq_ref)
+ : seq(seq_ref),
+ header(""),
+ species(""),
+ annotation_list(new SeqSpanRefList),
+ motif_list(new MotifList)
+{
+}
+
Sequence &Sequence::operator=(const Sequence& s)
{
if (this != &s) {
- parent = s.parent;
seq = s.seq;
- seq_start = s.seq_start;
- seq_count = s.seq_count;
- strand = s.strand;
header = s.header;
species = s.species;
- annots = s.annots;
+ annotation_list = s.annotation_list;
motif_list = s.motif_list;
}
return *this;
}
-static void multiplatform_getline(std::istream& in, std::string& line)
+void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
{
- line.clear();
- char c;
- in.get(c);
- while(in.good() and !(c == '\012' or c == '\015') ) {
- line.push_back(c);
- in.get(c);
- }
- // if we have cr-lf eat it
- c = in.peek();
- if (c=='\012' or c == '\015') {
- in.get();
- }
+ load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
}
//! load a fasta file into a sequence
-/*!
- * \param file_path the location of the fasta file in the filesystem
- * \param seq_num which sequence in the file to load
- * \param start_index starting position in the fasta sequence, 0 for beginning
- * \param end_index ending position in the fasta sequence, 0 for end
- * \return error message, empty string if no error. (gag!)
- */
-void Sequence::load_fasta(fs::path file_path, int seq_num,
- int start_index, int end_index)
+void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
+ int seq_num, int start_index, int end_index)
{
fs::fstream data_file;
data_file.open(file_path, std::ios::in);
throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
} else {
try {
- load_fasta(data_file, seq_num, start_index, end_index);
+ load_fasta(data_file, a, seq_num, start_index, end_index);
} catch(sequence_empty_error e) {
// there doesn't appear to be any sequence
// catch and rethrow to include the filename
errormsg << file_path.native_file_string()
<< " did not have any fasta sequences" << std::endl;
throw sequence_empty_file_error(errormsg.str());
+ } catch(sequence_invalid_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string();
+ msg << " " << e.what();
+ throw sequence_invalid_load_error(msg.str());
}
}
}
+void Sequence::load_fasta(std::istream& file,
+ int seq_num, int start_index, int end_index)
+{
+ load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
+}
+
void
-Sequence::load_fasta(std::iostream& data_file, int seq_num,
+Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
+ int seq_num,
int start_index, int end_index)
{
std::string file_data_line;
int header_counter = 0;
+ size_t line_counter = 0;
bool read_seq = true;
std::string rev_comp;
std::string sequence_raw;
std::string seq_tmp; // holds sequence during basic filtering
+ const Alphabet &alpha = Alphabet::get_alphabet(a);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
while ( (!data_file.eof()) && (header_counter < seq_num) )
{
multiplatform_getline(data_file, file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
header_counter++;
}
while ( !data_file.eof() && read_seq ) {
multiplatform_getline(data_file,file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
read_seq = false;
- else sequence_raw += file_data_line;
+ else {
+ for (std::string::const_iterator line_i = file_data_line.begin();
+ line_i != file_data_line.end();
+ ++line_i)
+ {
+ if(alpha.exists(*line_i)) {
+ sequence_raw += *line_i;
+ } else {
+ std::ostringstream msg;
+ msg << "Unrecognized characters in fasta sequence at line ";
+ msg << line_counter;
+ throw sequence_invalid_load_error(msg.str());
+ }
+ }
+ }
}
// Lastly, if subselection of the sequence was specified we keep cut out
std::string msg("The selected sequence appears to be empty");
throw sequence_empty_error(msg);
}
- set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
+ set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
} else {
std::string errormsg("There were no fasta sequences");
throw sequence_empty_file_error(errormsg);
}
}
-void Sequence::set_filtered_sequence(const std::string &old_seq,
+void Sequence::set_filtered_sequence(const std::string &in_seq,
+ AlphabetRef alphabet_,
size_type start,
size_type count,
- strand_type strand_)
+ SeqSpan::strand_type strand_)
{
- char conversionTable[257];
-
if ( count == npos)
- count = old_seq.size() - start;
- boost::shared_ptr<seq_string> new_seq(new seq_string);
- new_seq->reserve(count);
-
- // Make a conversion table
-
- // everything we don't specify below will become 'N'
- for(int table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = 'N';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // we want these to map to themselves - ie not to change
- conversionTable[(int)'A'] = 'A';
- conversionTable[(int)'T'] = 'T';
- conversionTable[(int)'G'] = 'G';
- conversionTable[(int)'C'] = 'C';
- // this is to upcase
- conversionTable[(int)'a'] = 'A';
- conversionTable[(int)'t'] = 'T';
- conversionTable[(int)'g'] = 'G';
- conversionTable[(int)'c'] = 'C';
+ count = in_seq.size() - start;
+ std::string new_seq;
+ new_seq.reserve(count);
// finally, the actual conversion loop
- for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
+ const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
+ std::string::const_iterator seq_i = in_seq.begin()+start;
+ for(size_type i = 0; i != count; ++i, ++seq_i)
{
- new_seq->append(1, conversionTable[ (int)old_seq[seq_index+start]]);
+ if (alpha_impl.exists(*seq_i)) {
+ new_seq.append(1, toupper(*seq_i));
+ } else {
+ new_seq.append(1, 'N');
+ }
}
- parent = 0;
- seq = new_seq;
- seq_start = 0;
- seq_count = count;
- strand = strand_;
+ SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
+ seq = new_seq_ref;
}
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
+ if (not fs::exists(file_path)) {
+ throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
+ }
+ if (fs::is_directory(file_path)) {
+ throw mussa_load_error(file_path.string() +
+ " is a directory, please provide a file for annotations."
+ );
+ }
fs::fstream data_stream(file_path, std::ios::in);
if (!data_stream)
{
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
+ throw mussa_load_error("Error loading annotation file " + file_path.string());
}
+ try {
+ load_annot(data_stream, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
+ }
+ data_stream.close();
+}
+
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
// so i should probably be passing the parse function some iterators
// but the annotations files are (currently) small, so i think i can
// get away with loading the whole file into memory
data_stream.get(c);
data.push_back(c);
}
- data_stream.close();
-
+
parse_annot(data, start_index, end_index);
}
*/
struct push_back_annot {
- std::list<annot>& annot_list;
+ Sequence* parent;
+ SeqSpanRefListRef children;
int& begin;
int& end;
std::string& name;
std::string& type;
+ int &parsed;
- push_back_annot(std::list<annot>& annot_list_,
+ push_back_annot(Sequence* parent_seq,
+ SeqSpanRefListRef children_list,
int& begin_,
int& end_,
std::string& name_,
- std::string& type_)
- : annot_list(annot_list_),
+ std::string& type_,
+ int &parsed_)
+ : parent(parent_seq),
+ children(children_list),
begin(begin_),
end(end_),
name(name_),
- type(type_)
+ type(type_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
- //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
- annot_list.push_back(annot(begin, end, name, type));
+ children->push_back(parent->make_annotation(name, type, begin, end));
+ ++parsed;
};
};
std::list<Sequence>& seq_list;
std::string& name;
std::string& seq;
+ int &parsed;
push_back_seq(std::list<Sequence>& seq_list_,
std::string& name_,
- std::string& seq_)
+ std::string& seq_,
+ int &parsed_)
: seq_list(seq_list_),
name(name_),
- seq(seq_)
+ seq(seq_),
+ parsed(parsed_)
{
}
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
+ std::string::iterator seq_i = seq.begin();
+ std::string::iterator seq_end = seq.end();
+
+ // this if block is a hack, for some reason spirit was
+ // duplicating the last character if the file didn't end
+ // with a new line.
+ // this checks for the trailing newline, and if it is missing
+ // removes the last character ( which should be the duplicated character.
+ // check test_sequence.cpp:sequence_no_trailing_newline for test case
+ // also see ticket:265 for more information
+ if (seq.size() > 0) {
+ std::string::value_type c = seq[seq.size()-1];
+ if (not (c == '\015' or c == '\012')) {
+ // doesn't end with a new line character
+ seq_end--;
+ }
+ }
+ // end hack
+
// filter out newlines from our sequence
std::string new_seq;
- for(std::string::const_iterator seq_i = seq.begin();
- seq_i != seq.end();
- ++seq_i)
+ for(; seq_i != seq_end; ++seq_i)
{
if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
}
//std::cout << "adding seq: " << name << " " << new_seq << std::endl;
-
Sequence s(new_seq);
s.set_fasta_header(name);
seq_list.push_back(s);
+ ++parsed;
};
};
-bool
+void
Sequence::parse_annot(std::string data, int start_index, int end_index)
{
int start=0;
int end=0;
std::string name;
std::string type;
- std::string seq;
+ std::string seqstr;
+ SeqSpanRefListRef parsed_annots(new SeqSpanRefList);
std::list<Sequence> query_seqs;
-
- bool status = spirit::parse(data.begin(), data.end(),
- (
- //begin grammar
- !(
- (
- spirit::alpha_p >>
- +(spirit::graph_p)
- )[spirit::assign_a(species)] >>
- +(spirit::space_p)
- ) >>
- *(
- ( // ignore html tags
- *(spirit::space_p) >>
- spirit::ch_p('<') >>
- +(~spirit::ch_p('>')) >>
- spirit::ch_p('>') >>
- *(spirit::space_p)
- )
- |
- ( // parse an absolute location name
- (spirit::uint_p[spirit::assign_a(start)] >>
- +spirit::space_p >>
- spirit::uint_p[spirit::assign_a(end)] >>
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(name)] >>
- // optional type
- !(
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(type)]
- )
- // to understand how this group gets set
- // read the comment above struct push_back_annot
- )[push_back_annot(annots, start, end, type, name)]
- |
- ((spirit::ch_p('>')|spirit::str_p(">")) >>
- (*(spirit::print_p))[spirit::assign_a(name)] >>
- spirit::eol_p >>
- (+(spirit::chset<>(nucleic_iupac_alphabet.c_str())))[spirit::assign_a(seq)]
- )[push_back_seq(query_seqs, name, seq)]
- ) >>
- *spirit::space_p
+ int parsed=0;
+
+ bool ok = spirit::parse(data.begin(), data.end(),
+ (
+ //begin grammar
+ !(
+ (
+ spirit::alpha_p >>
+ +(spirit::graph_p)
+ )[spirit::assign_a(species)] >>
+ +(spirit::space_p)
+ ) >>
+ *(
+ ( // ignore html tags
+ *(spirit::space_p) >>
+ spirit::ch_p('<') >>
+ +(~spirit::ch_p('>')) >>
+ spirit::ch_p('>') >>
+ *(spirit::space_p)
)
- //end grammar
- )).full;
-
- // go seearch for query sequences
+ |
+ ( // parse an absolute location name
+ (spirit::uint_p[spirit::assign_a(start)] >>
+ +spirit::space_p >>
+ spirit::uint_p[spirit::assign_a(end)] >>
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(name)] >>
+ // optional type
+ !(
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(type)]
+ )
+ // to understand how this group gets set
+ // read the comment above struct push_back_annot
+ )[push_back_annot(this, parsed_annots, start, end, name, type, parsed)]
+ |
+ ((spirit::ch_p('>')|spirit::str_p(">")) >>
+ (*(spirit::print_p))[spirit::assign_a(name)] >>
+ spirit::eol_p >>
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seqstr)]
+ )[push_back_seq(query_seqs, name, seqstr, parsed)]
+ ) >>
+ *spirit::space_p
+ )
+ //end grammar
+ )).full;
+ if (not ok) {
+ std::stringstream msg;
+ msg << "Error parsing annotation #" << parsed;
+ throw annotation_load_error(msg.str());
+ }
+ // If everything loaded correctly add the sequences to our annotation list
+ // add newly parsed annotations to our sequence
+ std::copy(parsed_annots->begin(), parsed_annots->end(), std::back_inserter(*annotation_list));
+ // go search for query sequences
find_sequences(query_seqs.begin(), query_seqs.end());
- return status;
}
-void Sequence::add_annotation(const annot& a)
+void Sequence::add_annotation(const SeqSpanRef a)
{
- annots.push_back(a);
+ annotation_list->push_back(a);
}
-const std::list<annot>& Sequence::annotations() const
+void Sequence::add_annotation(std::string name, std::string type, size_type start, size_type stop)
{
- return annots;
+ add_annotation(make_annotation(name, type, start, stop));
}
-Sequence
-Sequence::subseq(int start, int count)
+SeqSpanRef
+Sequence::make_annotation(std::string name, std::string type, size_type start, size_type stop) const
{
- if (!seq) {
- Sequence new_seq;
- return new_seq;
+ // we want things to be in the positive direction
+ if (stop < start) {
+ size_type tmp = start;
+ start = stop;
+ stop = tmp;
}
+ size_type count = stop - start;
+ SeqSpanRef new_annot(seq->subseq(start, count, SeqSpan::UnknownStrand));
+ AnnotationsRef metadata(new Annotations(name));
+ metadata->set("type", type);
+ new_annot->setAnnotations(metadata);
+ return new_annot;
+}
- // there might be an off by one error with start+count > size()
- if ( count == npos || start+count > size()) {
- count = size()-start;
- }
- Sequence new_seq(*this);
- new_seq.parent = this;
- new_seq.seq_start = seq_start+start;
- new_seq.seq_count = count;
+const SeqSpanRefList& Sequence::annotations() const
+{
+ return *annotation_list;
+}
+void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
+{
new_seq.motif_list = motif_list;
- new_seq.annots.clear();
- // attempt to copy & reannotate position based annotations
- int end = start+count;
+ new_seq.annotation_list.reset(new SeqSpanRefList);
- for(std::list<annot>::const_iterator annot_i = annots.begin();
- annot_i != annots.end();
+ for(SeqSpanRefList::const_iterator annot_i = annotation_list->begin();
+ annot_i != annotation_list->end();
++annot_i)
{
- int annot_begin= annot_i->begin;
- int annot_end = annot_i->end;
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_end = (*annot_i)->stop();
- if (annot_begin < end) {
+ if (annot_begin < start+count) {
if (annot_begin >= start) {
annot_begin -= start;
} else {
annot_begin = 0;
}
- if (annot_end < end) {
+ if (annot_end < start+count) {
annot_end -= start;
} else {
annot_end = count;
}
-
- annot new_annot(annot_begin, annot_end, annot_i->type, annot_i->name);
- new_seq.annots.push_back(new_annot);
+ SeqSpanRef new_annot(new_seq.seq->subseq(annot_begin, annot_end));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ new_seq.annotation_list->push_back(new_annot);
}
}
-
- return new_seq;
}
-std::string
-Sequence::rev_comp() const
+Sequence
+Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = length();
- rev_comp.reserve(len);
- // make a conversion table
- // init all parts of conversion table to '~' character
- // '~' I doubt will ever appear in a sequence file (jeez, I hope)
- // and may the fleas of 1000 camels infest the genitals of any biologist (and
- // seven generations of their progeny) who decides to make it mean
- // something special!!!
- // PS - double the curse for any smartass non-biologist who tries it as well
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
+ // FIXME: should i really allow a subsequence of an empty sequence?
+ if (!seq) {
+ Sequence new_seq;
+ return new_seq;
}
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
- // add in the characters for the bases we want to convert
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
-
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
- {
- table_i = (int) at(seq_i);
- rev_comp += conversionTable[table_i];
+ Sequence new_seq(*this);
+ new_seq.seq = seq->subseq(start, count, strand);
+ if (seq->annotations()) {
+ AnnotationsRef a(new Annotations(*(seq->annotations())));
+ new_seq.seq->setAnnotations(a);
}
+ copy_children(new_seq, start, count);
+
+ return new_seq;
+}
+
- return rev_comp;
+// FIXME: This needs to be moved into SeqSpan
+Sequence Sequence::rev_comp() const
+{
+ // a reverse complement is the whole opposite strand
+ return subseq(0, npos, SeqSpan::OppositeStrand);
+}
+
+const Alphabet& Sequence::get_alphabet() const
+{
+ return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
}
void Sequence::set_fasta_header(std::string header_)
return "";
}
-void Sequence::set_sequence(const std::string& s)
+void Sequence::set_sequence(const std::string& s, AlphabetRef a)
{
- set_filtered_sequence(s);
+ set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
}
std::string Sequence::get_sequence() const
{
- return *seq;
+ return seq->sequence();
}
Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
return at(i);
}
-Sequence::const_reference Sequence::at(Sequence::size_type i) const
-{
- if (!seq) throw std::out_of_range("empty sequence");
- return seq->at(i+seq_start);
-}
-
void
Sequence::clear()
{
- parent = 0;
seq.reset();
- seq_start = 0;
- seq_count = 0;
- strand = UnknownStrand;
header.clear();
species.clear();
- annots.clear();
- motif_list.clear();
-}
-
-const char *Sequence::c_str() const
-{
- if (seq)
- return seq->c_str()+seq_start;
- else
- return 0;
-}
-
-Sequence::const_iterator Sequence::begin() const
-{
- if (seq and seq_count != 0)
- return seq->begin()+seq_start;
- else
- return Sequence::const_iterator(0);
-}
-
-Sequence::const_iterator Sequence::end() const
-{
- if (seq and seq_count != 0) {
- return seq->begin() + seq_start + seq_count;
- } else {
- return Sequence::const_iterator(0);
- }
-}
-
-bool Sequence::empty() const
-{
- return (seq_count == 0) ? true : false;
-}
-
-Sequence::size_type Sequence::start() const
-{
- if (parent)
- return seq_start - parent->start();
- else
- return seq_start;
-}
-
-Sequence::size_type Sequence::stop() const
-{
- return start() + seq_count;
-}
-
-Sequence::size_type Sequence::size() const
-{
- return seq_count;
-}
-
-Sequence::size_type Sequence::length() const
-{
- return size();
+ annotation_list.reset(new SeqSpanRefList);
+ motif_list.reset(new MotifList);
}
void
Sequence::save(fs::fstream &save_file)
{
+ std::string type("type");
+ std::string empty_str("");
//fstream save_file;
- std::list<annot>::iterator annots_i;
-
- // not sure why, or if i'm doing something wrong, but can't seem to pass
- // file pointers down to this method from the mussa control class
- // so each call to save a sequence appends to the file started by mussa_class
- //save_file.open(save_file_path.c_str(), std::ios::app);
+ SeqSpanRefList::iterator annots_i;
+ AnnotationsRef metadata;
save_file << "<Sequence>" << std::endl;
save_file << *this << std::endl;
save_file << "<Annotations>" << std::endl;
save_file << species << std::endl;
- for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
+ for (annots_i = annotation_list->begin();
+ annots_i != annotation_list->end();
+ ++annots_i)
{
- save_file << annots_i->begin << " " << annots_i->end << " " ;
- save_file << annots_i->name << " " << annots_i->type << std::endl;
+ metadata = (*annots_i)->annotations();
+ save_file << (*annots_i)->parentStart() << " " << (*annots_i)->parentStop() << " " ;
+ save_file << metadata->name() << " "
+ << metadata->getdefault(type, empty_str) << std::endl;
}
save_file << "</Annotations>" << std::endl;
//save_file.close();
}
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+// fs::fstream load_file;
+// std::string file_data_line;
+// int seq_counter;
+// //annot an_annot;
+// int annot_begin;
+// int annot_end;
+// std::string annot_name;
+// std::string annot_type;
+//
+// std::string::size_type space_split_i;
+// std::string annot_value;
+//
+// annotation_list.reset(new SeqSpanRefList);
+//
+// load_file.open(load_file_path, std::ios::in);
+//
+// seq_counter = 0;
+// // search for the seq_num-th sequence
+// while ( (!load_file.eof()) && (seq_counter < seq_num) )
+// {
+// getline(load_file,file_data_line);
+// if (file_data_line == "<Sequence>")
+// seq_counter++;
+// }
+// getline(load_file, file_data_line);
+// // looks like the sequence is written as a single line
+// set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+// getline(load_file, file_data_line);
+// getline(load_file, file_data_line);
+// if (file_data_line == "<Annotations>")
+// {
+// getline(load_file, file_data_line);
+// species = file_data_line;
+// while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+// {
+// getline(load_file,file_data_line);
+// if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+// {
+// // need to get 4 values...almost same code 4 times...
+// // get annot start index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_begin = atoi (annot_value.c_str());
+// file_data_line = file_data_line.substr(space_split_i+1);
+// // get annot end index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_end = atoi (annot_value.c_str());
+//
+// if (space_split_i == std::string::npos) // no entry for type or name
+// {
+// std::cout << "seq, annots - no type or name\n";
+// annot_name = "";
+// annot_type = "";
+// }
+// else // else get annot type
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// //an_annot.type = annot_value;
+// annot_type = annot_value;
+// if (space_split_i == std::string::npos) // no entry for name
+// {
+// std::cout << "seq, annots - no name\n";
+// annot_name = "";
+// }
+// else // get annot name
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// // this seems like its wrong?
+// annot_type = annot_value;
+// }
+// }
+// add_annotation(annot_name, annot_type, annot_begin, annot_end);
+// }
+// //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+// // << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+// }
+// }
+// load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
{
- fs::fstream load_file;
+ boost::shared_ptr<Sequence> seq(new Sequence);
std::string file_data_line;
int seq_counter;
- annot an_annot;
+ //annot an_annot;
+ int annot_begin;
+ int annot_end;
+ std::string annot_name;
+ std::string annot_type;
+
std::string::size_type space_split_i;
std::string annot_value;
- annots.clear();
- load_file.open(load_file_path, std::ios::in);
+ //seq->annotation_list.reset(new SeqSpanRefList);
seq_counter = 0;
- // search for the seq_num-th sequence
+ // search for the next sequence
+ int seq_num = 1;
while ( (!load_file.eof()) && (seq_counter < seq_num) )
{
getline(load_file,file_data_line);
if (file_data_line == "<Sequence>")
seq_counter++;
}
+
+ // Could not find next sequence
+ if (load_file.eof())
+ {
+ seq.reset();
+ return seq;
+ }
+
getline(load_file, file_data_line);
// looks like the sequence is written as a single line
- set_filtered_sequence(file_data_line);
+ seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
{
getline(load_file, file_data_line);
- species = file_data_line;
+ seq->set_species(file_data_line);
while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
{
getline(load_file,file_data_line);
// get annot start index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.begin = atoi (annot_value.c_str());
+ annot_begin = atoi (annot_value.c_str());
file_data_line = file_data_line.substr(space_split_i+1);
// get annot end index
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
+ annot_end = atoi (annot_value.c_str());
if (space_split_i == std::string::npos) // no entry for type or name
{
std::cout << "seq, annots - no type or name\n";
- an_annot.type = "";
- an_annot.name = "";
+ annot_name = "";
+ annot_type = "";
}
else // else get annot type
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ //an_annot.type = annot_value;
+ annot_type = annot_value;
if (space_split_i == std::string::npos) // no entry for name
{
std::cout << "seq, annots - no name\n";
- an_annot.name = "";
+ annot_name = "";
}
else // get annot name
{
file_data_line = file_data_line.substr(space_split_i+1);
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
+ // this seems like its wrong?
+ annot_type = annot_value;
}
}
- annots.push_back(an_annot); // don't forget to actually add the annot
+ seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
}
//std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
// << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
}
}
- load_file.close();
-}
-
-std::string
-Sequence::rc_motif(std::string a_motif) const
-{
- std::string rev_comp;
- char conversionTable[257];
- int seq_i, table_i, len;
-
- len = a_motif.length();
- rev_comp.reserve(len);
-
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = '~';
- }
- // add end of std::string character for printing out table for testing purposes
- conversionTable[256] = '\0';
-
- // add in the characters for the bases we want to convert (IUPAC)
- conversionTable[(int)'A'] = 'T';
- conversionTable[(int)'T'] = 'A';
- conversionTable[(int)'G'] = 'C';
- conversionTable[(int)'C'] = 'G';
- conversionTable[(int)'N'] = 'N';
- conversionTable[(int)'M'] = 'K';
- conversionTable[(int)'R'] = 'Y';
- conversionTable[(int)'W'] = 'W';
- conversionTable[(int)'S'] = 'S';
- conversionTable[(int)'Y'] = 'R';
- conversionTable[(int)'K'] = 'M';
- conversionTable[(int)'V'] = 'B';
- conversionTable[(int)'H'] = 'D';
- conversionTable[(int)'D'] = 'H';
- conversionTable[(int)'B'] = 'V';
-
- // finally, the actual conversion loop
- for(seq_i = len - 1; seq_i >= 0; seq_i--)
- {
- //std::cout << "** i = " << seq_i << " bp = " <<
- table_i = (int) a_motif[seq_i];
- rev_comp += conversionTable[table_i];
- }
-
- //std::cout << "seq: " << a_motif << std::endl;
- //std::cout << "rc: " << rev_comp << std::endl;
-
- return rev_comp;
+ //load_file.close();
+ return seq;
}
-std::string
-Sequence::motif_normalize(const std::string& a_motif)
-{
- std::string valid_motif;
- int seq_i, len;
-
- len = a_motif.length();
- valid_motif.reserve(len);
-
- // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
- // current nonIUPAC symbols are omitted, which is not reported atm
- for(seq_i = 0; seq_i < len; seq_i++)
- {
- if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
- valid_motif += 'A';
- else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
- valid_motif += 'T';
- else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
- valid_motif += 'C';
- else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
- valid_motif += 'N';
- else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
- valid_motif += 'M';
- else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
- valid_motif += 'R';
- else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
- valid_motif += 'W';
- else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
- valid_motif += 'S';
- else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
- valid_motif += 'Y';
- else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
- valid_motif += 'G';
- else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
- valid_motif += 'V';
- else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
- valid_motif += 'H';
- else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
- valid_motif += 'D';
- else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
- valid_motif += 'B';
- else {
- std::string msg = "Letter ";
- msg += a_motif[seq_i];
- msg += " is not a valid IUPAC symbol";
- throw motif_normalize_error(msg);
- }
- }
- //std::cout << "valid_motif is: " << valid_motif << std::endl;
- return valid_motif;
-}
void Sequence::add_motif(const Sequence& a_motif)
{
motif_start_i != motif_starts.end();
++motif_start_i)
{
- motif_list.push_back(motif(*motif_start_i, a_motif.get_sequence()));
+ motif_list->push_back(motif(*motif_start_i, a_motif.get_sequence()));
}
}
void Sequence::clear_motifs()
{
- motif_list.clear();
+ if (motif_list)
+ motif_list->clear();
+ else
+ motif_list.reset(new MotifList);
}
-const std::list<motif>& Sequence::motifs() const
+const Sequence::MotifList& Sequence::motifs() const
{
- return motif_list;
+ return *motif_list;
}
std::vector<int>
-Sequence::find_motif(const std::string& a_motif) const
+Sequence::find_motif(const Sequence& a_motif) const
{
std::vector<int> motif_match_starts;
- std::string norm_motif_rc;
+ Sequence norm_motif_rc;
motif_match_starts.clear();
+ // std::cout << "motif is: " << norm_motif << std::endl;
- //std::cout << "motif is: " << a_motif << std::endl;
- std::string norm_motif = motif_normalize(a_motif);
- //std::cout << "motif is: " << a_motif << std::endl;
-
- if (norm_motif.size() > 0)
+ if (a_motif.size() > 0)
{
//std::cout << "Sequence: none blank motif\n";
- motif_scan(norm_motif, &motif_match_starts);
+ motif_scan(a_motif, &motif_match_starts);
- norm_motif_rc = rc_motif(a_motif);
+ norm_motif_rc = a_motif.rev_comp();;
// make sure not to do search again if it is a palindrome
- if (norm_motif_rc != norm_motif) {
+ if (norm_motif_rc != a_motif) {
motif_scan(norm_motif_rc, &motif_match_starts);
}
}
return motif_match_starts;
}
-std::vector<int>
-Sequence::find_motif(const Sequence& a_motif) const
-{
- return find_motif(a_motif.get_sequence());
-}
-
void
-Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts) const
+Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
{
// if there's no sequence we can't scan for it?
// should this throw an exception?
if (!seq) return;
- std::string::const_iterator seq_c = seq->begin();
- std::string::size_type seq_i;
- int motif_i, motif_len;
+ std::string::size_type seq_i = 0;
+ Sequence::size_type motif_i = 0;
+ Sequence::size_type motif_len = a_motif.length();
+ Sequence::value_type motif_char;
+ Sequence::value_type seq_char;
- //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl;
- motif_len = a_motif.length();
-
- //std::cout << "motif_length: " << motif_len << std::endl;
- //std::cout << "RAAARRRRR\n";
-
- motif_i = 0;
-
- //std::cout << "motif: " << a_motif << std::endl;
-
- //std::cout << "Sequence: motif, length= " << length << std::endl;
- seq_i = 0;
- while (seq_i < length())
+ while (seq_i < size())
{
- //std::cout << seq_c[seq_i];
- //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
- if (a_motif[motif_i] =='N')
+ motif_char = toupper(a_motif[motif_i]);
+ seq_char = toupper(seq->at(seq_i));
+ if (motif_char =='N')
motif_i++;
- else if (a_motif[motif_i] == seq_c[seq_i])
+ else if (motif_char == seq_char)
motif_i++;
- else if ((a_motif[motif_i] =='M') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+ else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
motif_i++;
- else if ((a_motif[motif_i] =='R') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='W') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='S') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+ else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[motif_i] =='Y') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='K') &&
- ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+ else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='V') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='G')))
+ else if ((motif_char =='V') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
motif_i++;
- else if ((a_motif[seq_i] =='H') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='H') &&
+ ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='D') &&
- ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='D') &&
+ ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
- else if ((a_motif[motif_i] =='B') &&
- ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
- (seq_c[seq_i]=='T')))
+ else if ((motif_char =='B') &&
+ ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
motif_i++;
-
else
{
+ // if a motif doesn't match, erase our current trial and try again
seq_i -= motif_i;
motif_i = 0;
}
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- //std::cout << "!!";
- annot new_motif;
motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
}
{
std::vector<int> seq_starts = find_motif(a_seq);
- //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
-
for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
seq_start_i != seq_starts.end();
++seq_start_i)
{
- annots.push_back(annot(*seq_start_i,
- *seq_start_i+a_seq.size(),
- "",
- name));
+ add_annotation(name, "", *seq_start_i, *seq_start_i+a_seq.size());
}
}
std::ostream& operator<<(std::ostream& out, const Sequence& s)
{
- for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
- out << *s_i;
+ if (s.seq) {
+ for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
+ out << *s_i;
+ }
}
return out;
}
}
}
-bool operator==(const Sequence& x, const Sequence& y)
+template <typename Iter1, typename Iter2>
+static
+bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
{
- if (x.empty() and y.empty()) {
- // if there's no sequence in either sequence structure, they're equal
+ Iter1 aseq_i = abegin;
+ Iter2 bseq_i = bbegin;
+ if (aend-abegin == bend-bbegin) {
+ // since the length of the two sequences is equal, we only need to
+ // test one.
+ for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
+ if (toupper(*aseq_i) != toupper(*bseq_i)) {
+ return false;
+ }
+ }
return true;
- } else if (x.empty() or y.empty()) {
- // if we fail the first test, and we discover one is empty,
- // we know they can't be equal. (and we need to do this
- // to prevent dereferencing an empty pointer)
- return false;
- } else if (x.seq_count != y.seq_count) {
- // if they're of different lenghts, they're not equal
+ } else {
return false;
}
- Sequence::const_iterator xseq_i = x.begin();
- Sequence::const_iterator yseq_i = y.begin();
- // since the length of the two sequences is equal, we only need to
- // test one.
- for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) {
- if (*xseq_i != *yseq_i) {
- return false;
+}
+
+bool operator==(const Sequence& x, const Sequence& y)
+{
+ if (x.seq and y.seq) {
+ // both x and y are defined
+ if (SeqSpan::isFamily(x.seq, y.seq)) {
+ // both are part of the same SeqSpan tree
+ return *(x.seq) == *(y.seq);
+ } else {
+ // we'll have to do a real comparison
+ return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
+ x.begin(), x.end(),
+ y.begin(), y.end()
+ );
}
+ } else {
+ // true if they're both empty (with either a null SeqSpanRef or
+ // a zero length string
+ return (x.size() == y.size());
}
- return true;
}
+
+bool operator!=(const Sequence& x, const Sequence& y)
+{
+ return not operator==(x, y);
+}
+