// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+#include <boost/filesystem/operations.hpp>
namespace fs = boost::filesystem;
#include <boost/spirit/core.hpp>
errormsg << file_path.native_file_string()
<< " did not have any fasta sequences" << std::endl;
throw sequence_empty_file_error(errormsg.str());
+ } catch(sequence_invalid_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string();
+ msg << " " << e.what();
+ throw sequence_invalid_load_error(msg.str());
}
}
}
-void Sequence::load_fasta(std::iostream& file,
+void Sequence::load_fasta(std::istream& file,
int seq_num, int start_index, int end_index)
{
load_fasta(file, alphabet, seq_num, start_index, end_index);
}
void
-Sequence::load_fasta(std::iostream& data_file, alphabet_ref a,
+Sequence::load_fasta(std::istream& data_file, alphabet_ref a,
int seq_num,
int start_index, int end_index)
{
std::string file_data_line;
int header_counter = 0;
+ size_t line_counter = 0;
bool read_seq = true;
std::string rev_comp;
std::string sequence_raw;
while ( (!data_file.eof()) && (header_counter < seq_num) )
{
multiplatform_getline(data_file, file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
header_counter++;
}
while ( !data_file.eof() && read_seq ) {
multiplatform_getline(data_file,file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
read_seq = false;
else {
if(alpha.exists(*line_i)) {
sequence_raw += *line_i;
} else {
- throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
+ std::ostringstream msg;
+ msg << "Unrecognized characters in fasta sequence at line ";
+ msg << line_counter;
+ throw sequence_invalid_load_error(msg.str());
}
}
}
for(size_type i = 0; i != count; ++i, ++seq_i)
{
if (alpha_impl.exists(*seq_i)) {
- new_seq->append(1, *seq_i);
+ new_seq->append(1, toupper(*seq_i));
} else {
new_seq->append(1, 'N');
}
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
+ if (not fs::exists(file_path)) {
+ throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
+ }
+ if (fs::is_directory(file_path)) {
+ throw mussa_load_error(file_path.string() +
+ " is a directory, please provide a file for annotations."
+ );
+ }
fs::fstream data_stream(file_path, std::ios::in);
if (!data_stream)
{
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
+ throw mussa_load_error("Error loading annotation file " + file_path.string());
}
// so i should probably be passing the parse function some iterators
data.push_back(c);
}
data_stream.close();
-
- parse_annot(data, start_index, end_index);
+
+ try {
+ parse_annot(data, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
+ }
}
/* If this works, yikes, this is some brain hurting code.
int& end;
std::string& name;
std::string& type;
+ int &parsed;
push_back_annot(std::list<annot>& annot_list_,
int& begin_,
int& end_,
std::string& name_,
- std::string& type_)
+ std::string& type_,
+ int &parsed_)
: annot_list(annot_list_),
begin(begin_),
end(end_),
name(name_),
- type(type_)
+ type(type_),
+ parsed(parsed_)
{
}
{
//std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
annot_list.push_back(annot(begin, end, name, type));
+ ++parsed;
};
};
std::list<Sequence>& seq_list;
std::string& name;
std::string& seq;
+ int &parsed;
push_back_seq(std::list<Sequence>& seq_list_,
std::string& name_,
- std::string& seq_)
+ std::string& seq_,
+ int &parsed_)
: seq_list(seq_list_),
name(name_),
- seq(seq_)
+ seq(seq_),
+ parsed(parsed_)
{
}
Sequence s(new_seq);
s.set_fasta_header(name);
seq_list.push_back(s);
+ ++parsed;
};
};
-bool
+void
Sequence::parse_annot(std::string data, int start_index, int end_index)
{
int start=0;
std::string name;
std::string type;
std::string seq;
+ std::list<annot> parsed_annots;
std::list<Sequence> query_seqs;
-
- bool status = spirit::parse(data.begin(), data.end(),
- (
- //begin grammar
- !(
- (
- spirit::alpha_p >>
- +(spirit::graph_p)
- )[spirit::assign_a(species)] >>
- +(spirit::space_p)
- ) >>
- *(
- ( // ignore html tags
- *(spirit::space_p) >>
- spirit::ch_p('<') >>
- +(~spirit::ch_p('>')) >>
- spirit::ch_p('>') >>
- *(spirit::space_p)
- )
- |
- ( // parse an absolute location name
- (spirit::uint_p[spirit::assign_a(start)] >>
- +spirit::space_p >>
- spirit::uint_p[spirit::assign_a(end)] >>
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(name)] >>
- // optional type
- !(
- +spirit::space_p >>
- (
- spirit::alpha_p >>
- *spirit::graph_p
- )[spirit::assign_a(type)]
- )
- // to understand how this group gets set
- // read the comment above struct push_back_annot
- )[push_back_annot(annots, start, end, type, name)]
- |
- ((spirit::ch_p('>')|spirit::str_p(">")) >>
- (*(spirit::print_p))[spirit::assign_a(name)] >>
- spirit::eol_p >>
- (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
- )[push_back_seq(query_seqs, name, seq)]
- ) >>
- *spirit::space_p
+ int parsed=0;
+
+ bool ok = spirit::parse(data.begin(), data.end(),
+ (
+ //begin grammar
+ !(
+ (
+ spirit::alpha_p >>
+ +(spirit::graph_p)
+ )[spirit::assign_a(species)] >>
+ +(spirit::space_p)
+ ) >>
+ *(
+ ( // ignore html tags
+ *(spirit::space_p) >>
+ spirit::ch_p('<') >>
+ +(~spirit::ch_p('>')) >>
+ spirit::ch_p('>') >>
+ *(spirit::space_p)
)
- //end grammar
- )).full;
-
+ |
+ ( // parse an absolute location name
+ (spirit::uint_p[spirit::assign_a(start)] >>
+ +spirit::space_p >>
+ spirit::uint_p[spirit::assign_a(end)] >>
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(name)] >>
+ // optional type
+ !(
+ +spirit::space_p >>
+ (
+ spirit::alpha_p >>
+ *spirit::graph_p
+ )[spirit::assign_a(type)]
+ )
+ // to understand how this group gets set
+ // read the comment above struct push_back_annot
+ )[push_back_annot(parsed_annots, start, end, type, name, parsed)]
+ |
+ ((spirit::ch_p('>')|spirit::str_p(">")) >>
+ (*(spirit::print_p))[spirit::assign_a(name)] >>
+ spirit::eol_p >>
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seq)]
+ )[push_back_seq(query_seqs, name, seq, parsed)]
+ ) >>
+ *spirit::space_p
+ )
+ //end grammar
+ )).full;
+ if (not ok) {
+ std::stringstream msg;
+ msg << "Error parsing annotation #" << parsed;
+ throw annotation_load_error(msg.str());
+ }
+ // add newly parsed annotations to our sequence
+ std::copy(parsed_annots.begin(), parsed_annots.end(), std::back_inserter(annots));
// go seearch for query sequences
find_sequences(query_seqs.begin(), query_seqs.end());
- return status;
}
void Sequence::add_annotation(const annot& a)
std::string rc_map = create_reverse_map();
// reverse and convert
- seq_string::const_reverse_iterator seq_i;
- seq_string::const_reverse_iterator seq_end = seq->rend();
- for(seq_i = seq->rbegin();
+ Sequence::const_reverse_iterator seq_i;
+ Sequence::const_reverse_iterator seq_end = rend();
+ for(seq_i = rbegin();
seq_i != seq_end;
++seq_i)
{
{
switch (alpha) {
case reduced_dna_alphabet:
- return Alphabet::reduced_dna_alphabet;
+ return Alphabet::reduced_dna_alphabet();
case reduced_rna_alphabet:
- return Alphabet::reduced_rna_alphabet;
+ return Alphabet::reduced_rna_alphabet();
case reduced_nucleic_alphabet:
- return Alphabet::reduced_nucleic_alphabet;
+ return Alphabet::reduced_nucleic_alphabet();
case nucleic_alphabet:
- return Alphabet::nucleic_alphabet;
+ return Alphabet::nucleic_alphabet();
case protein_alphabet:
- return Alphabet::protein_alphabet;
+ return Alphabet::protein_alphabet();
default:
throw std::runtime_error("unrecognized alphabet type");
break;
}
}
+Sequence::const_reverse_iterator Sequence::rbegin() const
+{
+ if (seq and seq_count != 0)
+ return seq->rbegin()+(seq->size()-(seq_start+seq_count));
+ else
+ return Sequence::const_reverse_iterator();
+}
+
+Sequence::const_reverse_iterator Sequence::rend() const
+{
+ if (seq and seq_count != 0) {
+ return rbegin() + seq_count;
+ } else {
+ return Sequence::const_reverse_iterator();
+ }
+}
+
bool Sequence::empty() const
{
return (seq_count == 0) ? true : false;
}
+Sequence::size_type Sequence::find_first_not_of(
+ const std::string& query,
+ Sequence::size_type index)
+{
+ typedef std::set<std::string::value_type> sequence_set;
+ sequence_set match_set;
+
+ for(const_iterator query_item = query.begin();
+ query_item != query.end();
+ ++query_item)
+ {
+ match_set.insert(*query_item);
+ }
+ for(const_iterator base = begin();
+ base != end();
+ ++base)
+ {
+ if(match_set.find(*base) == match_set.end()) {
+ return base-begin();
+ }
+ }
+ return Sequence::npos;
+}
+
Sequence::size_type Sequence::start() const
{
if (parent)
Sequence::value_type motif_char;
Sequence::value_type seq_char;
- while (seq_i < seq->length())
+ while (seq_i < size())
{
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
motif_char = toupper(a_motif[motif_i]);
- seq_char = toupper(seq->at(seq_i));
+ seq_char = toupper(seq->at(seq_start+seq_i));
if (motif_char =='N')
motif_i++;
else if (motif_char == seq_char)