// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+#include <boost/filesystem/operations.hpp>
namespace fs = boost::filesystem;
#include <boost/spirit/core.hpp>
errormsg << file_path.native_file_string()
<< " did not have any fasta sequences" << std::endl;
throw sequence_empty_file_error(errormsg.str());
+ } catch(sequence_invalid_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string();
+ msg << " " << e.what();
+ throw sequence_invalid_load_error(msg.str());
}
}
}
-void Sequence::load_fasta(std::iostream& file,
+void Sequence::load_fasta(std::istream& file,
int seq_num, int start_index, int end_index)
{
load_fasta(file, alphabet, seq_num, start_index, end_index);
}
void
-Sequence::load_fasta(std::iostream& data_file, alphabet_ref a,
+Sequence::load_fasta(std::istream& data_file, alphabet_ref a,
int seq_num,
int start_index, int end_index)
{
std::string file_data_line;
int header_counter = 0;
+ size_t line_counter = 0;
bool read_seq = true;
std::string rev_comp;
std::string sequence_raw;
while ( (!data_file.eof()) && (header_counter < seq_num) )
{
multiplatform_getline(data_file, file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
header_counter++;
}
while ( !data_file.eof() && read_seq ) {
multiplatform_getline(data_file,file_data_line);
+ ++line_counter;
if (file_data_line.substr(0,1) == ">")
read_seq = false;
else {
if(alpha.exists(*line_i)) {
sequence_raw += *line_i;
} else {
- throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
+ std::ostringstream msg;
+ msg << "Unrecognized characters in fasta sequence at line ";
+ msg << line_counter;
+ throw sequence_invalid_load_error(msg.str());
}
}
}
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
+ if (not fs::exists(file_path)) {
+ throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
+ }
+ if (fs::is_directory(file_path)) {
+ throw mussa_load_error(file_path.string() +
+ " is a directory, please provide a file for annotations."
+ );
+ }
fs::fstream data_stream(file_path, std::ios::in);
if (!data_stream)
{
- throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
+ throw mussa_load_error("Error loading annotation file " + file_path.string());
}
// so i should probably be passing the parse function some iterators
data.push_back(c);
}
data_stream.close();
-
- parse_annot(data, start_index, end_index);
+
+ try {
+ parse_annot(data, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
+ }
}
/* If this works, yikes, this is some brain hurting code.
std::string seq;
std::list<annot> parsed_annots;
std::list<Sequence> query_seqs;
- int parsed=1;
+ int parsed=0;
bool ok = spirit::parse(data.begin(), data.end(),
(
((spirit::ch_p('>')|spirit::str_p(">")) >>
(*(spirit::print_p))[spirit::assign_a(name)] >>
spirit::eol_p >>
- (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seq)]
)[push_back_seq(query_seqs, name, seq, parsed)]
) >>
*spirit::space_p
std::string rc_map = create_reverse_map();
// reverse and convert
- seq_string::const_reverse_iterator seq_i;
- seq_string::const_reverse_iterator seq_end = seq->rend();
- for(seq_i = seq->rbegin();
+ Sequence::const_reverse_iterator seq_i;
+ Sequence::const_reverse_iterator seq_end = rend();
+ for(seq_i = rbegin();
seq_i != seq_end;
++seq_i)
{
{
switch (alpha) {
case reduced_dna_alphabet:
- return Alphabet::reduced_dna_alphabet;
+ return Alphabet::reduced_dna_alphabet();
case reduced_rna_alphabet:
- return Alphabet::reduced_rna_alphabet;
+ return Alphabet::reduced_rna_alphabet();
case reduced_nucleic_alphabet:
- return Alphabet::reduced_nucleic_alphabet;
+ return Alphabet::reduced_nucleic_alphabet();
case nucleic_alphabet:
- return Alphabet::nucleic_alphabet;
+ return Alphabet::nucleic_alphabet();
case protein_alphabet:
- return Alphabet::protein_alphabet;
+ return Alphabet::protein_alphabet();
default:
throw std::runtime_error("unrecognized alphabet type");
break;
}
}
+Sequence::const_reverse_iterator Sequence::rbegin() const
+{
+ if (seq and seq_count != 0)
+ return seq->rbegin()+(seq->size()-(seq_start+seq_count));
+ else
+ return Sequence::const_reverse_iterator();
+}
+
+Sequence::const_reverse_iterator Sequence::rend() const
+{
+ if (seq and seq_count != 0) {
+ return rbegin() + seq_count;
+ } else {
+ return Sequence::const_reverse_iterator();
+ }
+}
+
bool Sequence::empty() const
{
return (seq_count == 0) ? true : false;
}
+Sequence::size_type Sequence::find_first_not_of(
+ const std::string& query,
+ Sequence::size_type index)
+{
+ typedef std::set<std::string::value_type> sequence_set;
+ sequence_set match_set;
+
+ for(const_iterator query_item = query.begin();
+ query_item != query.end();
+ ++query_item)
+ {
+ match_set.insert(*query_item);
+ }
+ for(const_iterator base = begin();
+ base != end();
+ ++base)
+ {
+ if(match_set.find(*base) == match_set.end()) {
+ return base-begin();
+ }
+ }
+ return Sequence::npos;
+}
+
Sequence::size_type Sequence::start() const
{
if (parent)
Sequence::value_type motif_char;
Sequence::value_type seq_char;
- while (seq_i < seq->length())
+ while (seq_i < size())
{
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
motif_char = toupper(a_motif[motif_i]);
- seq_char = toupper(seq->at(seq_i));
+ seq_char = toupper(seq->at(seq_start+seq_i));
if (motif_char =='N')
motif_i++;
else if (motif_char == seq_char)