// ---------- sequence.cc -----------
// ----------------------------------------
#include <boost/filesystem/fstream.hpp>
+namespace fs = boost::filesystem;
+
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/actor/push_back_actor.hpp>
+#include <boost/spirit/iterator/file_iterator.hpp>
+namespace spirit = boost::spirit;
#include "alg/sequence.hpp"
#include "mussa_exceptions.hpp"
#include <iostream>
#include <sstream>
-namespace fs = boost::filesystem;
-using namespace std;
-
annot::annot()
: start(0),
end(0),
motif_list.clear();
}
-Sequence::Sequence(string seq)
+Sequence::Sequence(std::string seq)
{
set_filtered_sequence(seq);
}
return sequence[index];
}
-ostream& operator<<(ostream& out, const Sequence& seq)
+std::ostream& operator<<(std::ostream& out, const Sequence& seq)
{
out << "Sequence(" << seq.get_seq() << ")";
return out;
int start_index, int end_index)
{
fs::fstream data_file;
- string file_data_line;
+ std::string file_data_line;
int header_counter = 0;
bool read_seq = true;
- string rev_comp;
- string sequence_raw;
- string seq_tmp; // holds sequence during basic filtering
+ std::string rev_comp;
+ std::string sequence_raw;
+ std::string seq_tmp; // holds sequence during basic filtering
- data_file.open(file_path, ios::in);
+ data_file.open(file_path, std::ios::in);
if (seq_num == 0) {
throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
// sequence filtering for upcasing agctn and convert non AGCTN to N
set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
} else {
- stringstream errormsg;
+ std::stringstream errormsg;
errormsg << file_path.native_file_string()
- << " did not have any fasta sequences" << endl;
+ << " did not have any fasta sequences" << std::endl;
throw mussa_load_error(errormsg.str());
}
data_file.close();
}
}
-void Sequence::set_filtered_sequence(const string &old_seq,
- string::size_type start,
- string::size_type count)
+void Sequence::set_filtered_sequence(const std::string &old_seq,
+ std::string::size_type start,
+ std::string::size_type count)
{
char conversionTable[257];
conversionTable[(int)'c'] = 'C';
// finally, the actual conversion loop
- for(string::size_type seq_index = 0; seq_index < count; seq_index++)
+ for(std::string::size_type seq_index = 0; seq_index < count; seq_index++)
{
sequence += conversionTable[ (int)old_seq[seq_index+start]];
}
// this doesn't work properly under gcc 3.x ... it can't recognize toupper
//transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
-
void
Sequence::load_annot(fs::path file_path, int start_index, int end_index)
{
- fs::fstream data_file;
- string file_data_line;
- annot an_annot;
- string::size_type space_split_i;
- string annot_value;
- list<annot>::iterator list_i;
- string err_msg;
-
-
- annots.clear();
- data_file.open(file_path, ios::in);
-
- if (!data_file)
+ fs::fstream data_stream(file_path, std::ios::in);
+ if (!data_stream)
{
throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
}
- // if file opened okay, read it
- else
+ // so i should probably be passing the parse function some iterators
+ // but the annotations files are (currently) small, so i think i can
+ // get away with loading the whole file into memory
+ std::string data;
+ char c;
+ while(data_stream.good()) {
+ data_stream.get(c);
+ data.push_back(c);
+ }
+ data_stream.close();
+
+ parse_annot(data, start_index, end_index);
+}
+
+/* If this works, yikes, this is some brain hurting code.
+ *
+ * what's going on is that when pb_annot is instantiated it stores references
+ * to begin, end, name, type, declared in the parse function, then
+ * when operator() is called it grabs values from those references
+ * and uses that to instantiate an annot object and append that to our
+ * annotation list.
+ *
+ * This weirdness is because the spirit library requires that actions
+ * conform to a specific prototype operator()(IteratorT, IteratorT)
+ * which doesn't provide any useful opportunity for me to actually
+ * grab the results of our parsing.
+ *
+ * so I instantiate this structure in order to have a place to grab
+ * my data from.
+ */
+
+struct push_back_annot {
+ std::list<annot>& annot_list;
+ int& begin;
+ int& end;
+ std::string& name;
+ std::string& type;
+
+ push_back_annot(std::list<annot>& annot_list_,
+ int& begin_,
+ int& end_,
+ std::string& name_,
+ std::string& type_)
+ : annot_list(annot_list_),
+ begin(begin_),
+ end(end_),
+ name(name_),
+ type(type_)
{
- getline(data_file,file_data_line);
- species = file_data_line;
+ }
- // end_index = 0 means no end was specified, so cut to the end
- if (end_index == 0)
- end_index = sequence.length();
+ void operator()(std::string::const_iterator,
+ std::string::const_iterator) const
+ {
+ annot_list.push_back(annot(begin, end, name, type));
+ };
+};
- //cout << "START: " << start_index << " END: " << end_index << endl;
- while ( !data_file.eof() )
+void
+Sequence::parse_annot(std::string data, int start_index, int end_index)
+{
+ std::string species_name;
+ int start=0;
+ int end=0;
+ std::string name;
+ std::string type;
+
+
+ bool status = spirit::parse(data.begin(), data.end(),
+ //begin grammar
+ (
+ (+(spirit::alpha_p))[spirit::assign_a(species_name)] >>
+ *((spirit::uint_p[spirit::assign_a(start)] >>
+ spirit::uint_p[spirit::assign_a(end)] >>
+ (*(spirit::alpha_p))[spirit::assign_a(name)] >>
+ (*(spirit::alpha_p))[spirit::assign_a(type)]
+ // to understand, read the comment above
+ // struct push_back_annot
+ )[push_back_annot(annots, start, end, name, type)])
+ ),
+ //end grammar
+ spirit::space_p).full;
+}
+
+/*
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
+ std::string file_data_line;
+ annot an_annot;
+ std::string::size_type space_split_i;
+ std::string annot_value;
+ std::list<annot>::iterator list_i;
+ std::string err_msg;
+
+ annots.clear();
+
+ getline(data_stream,file_data_line);
+ species = file_data_line;
+
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = sequence.length();
+
+ //std::cout << "START: " << start_index << " END: " << end_index << std::endl;
+
+ while ( !data_stream.eof() )
+ {
+ getline(data_stream,file_data_line);
+ if (file_data_line != "")
{
- getline(data_file,file_data_line);
- if (file_data_line != "")
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+
+ //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ // << std::endl;
+
+ // get annot name
+ space_split_i = file_data_line.find(" ");
+ if (space_split_i == std::string::npos) // no entries for name & type
+ {
+ std::cout << "seq, annots - no name or type\n";
+ an_annot.name = "";
+ an_annot.type = "";
+ }
+ else
{
- // need to get 4 values...almost same code 4 times...
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.start = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot end index
- space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
+ an_annot.name = annot_value;
file_data_line = file_data_line.substr(space_split_i+1);
-
- //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
- // << endl;
-
- // get annot name
+ // get annot type
space_split_i = file_data_line.find(" ");
- if (space_split_i == string::npos) // no entries for name & type
- {
- cout << "seq, annots - no name or type\n";
- an_annot.name = "";
+ if (space_split_i == std::string::npos) // no entry for type
an_annot.type = "";
- }
else
{
annot_value = file_data_line.substr(0,space_split_i);
- an_annot.name = annot_value;
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot type
- space_split_i = file_data_line.find(" ");
- if (space_split_i == string::npos) // no entry for type
- an_annot.type = "";
- else
- {
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
- }
+ an_annot.type = annot_value;
}
+ }
- // add annot to list if it falls within the range of sequence specified
- if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
- {
- an_annot.start -= start_index;
- an_annot.end -= start_index;
- annots.push_back(an_annot);
- }
- // else no (or bogus) annotations
+ // add annot to list if it falls within the range of sequence specified
+ if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
+ {
+ an_annot.start -= start_index;
+ an_annot.end -= start_index;
+ annots.push_back(an_annot);
}
+ // else no (or bogus) annotations
}
-
- data_file.close();
- /*
- // debugging check
- for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
- {
- cout << (*list_i).start << "," << (*list_i).end << "\t";
- cout << (*list_i).name << "\t" << (*list_i).type << endl;
- }
- */
}
}
+*/
const std::string& Sequence::get_species() const
{
return annots;
}
-string::size_type Sequence::length() const
+std::string::size_type Sequence::length() const
{
return size();
}
-string::size_type Sequence::size() const
+std::string::size_type Sequence::size() const
{
return sequence.size();
}
}
-const string&
+const std::string&
Sequence::get_seq() const
{
return sequence;
}
-string
+std::string
Sequence::subseq(int start, int end) const
{
return sequence.substr(start, end);
return sequence.c_str();
}
-string
+std::string
Sequence::rev_comp() const
{
- string rev_comp;
+ std::string rev_comp;
char conversionTable[257];
int seq_i, table_i, len;
}
-const string&
+const std::string&
Sequence::get_header() const
{
return header;
}
/*
//FIXME: i don't think this code is callable
-string
+std::string
Sequence::sp_name() const
{
return species;
*/
void
-Sequence::set_seq(const string& a_seq)
+Sequence::set_seq(const std::string& a_seq)
{
set_filtered_sequence(a_seq);
}
/*
-string
+std::string
Sequence::species()
{
return species;
void
Sequence::save(fs::fstream &save_file)
- //string save_file_path)
+ //std::string save_file_path)
{
//fstream save_file;
- list<annot>::iterator annots_i;
+ std::list<annot>::iterator annots_i;
// not sure why, or if i'm doing something wrong, but can't seem to pass
// file pointers down to this method from the mussa control class
// so each call to save a sequence appends to the file started by mussa_class
- //save_file.open(save_file_path.c_str(), ios::app);
+ //save_file.open(save_file_path.c_str(), std::ios::app);
- save_file << "<Sequence>" << endl;
- save_file << sequence << endl;
- save_file << "</Sequence>" << endl;
+ save_file << "<Sequence>" << std::endl;
+ save_file << sequence << std::endl;
+ save_file << "</Sequence>" << std::endl;
- save_file << "<Annotations>" << endl;
- save_file << species << endl;
+ save_file << "<Annotations>" << std::endl;
+ save_file << species << std::endl;
for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
{
save_file << annots_i->start << " " << annots_i->end << " " ;
- save_file << annots_i->name << " " << annots_i->type << endl;
+ save_file << annots_i->name << " " << annots_i->type << std::endl;
}
- save_file << "</Annotations>" << endl;
+ save_file << "</Annotations>" << std::endl;
//save_file.close();
}
Sequence::load_museq(fs::path load_file_path, int seq_num)
{
fs::fstream load_file;
- string file_data_line;
+ std::string file_data_line;
int seq_counter;
annot an_annot;
- string::size_type space_split_i;
- string annot_value;
+ std::string::size_type space_split_i;
+ std::string annot_value;
annots.clear();
- load_file.open(load_file_path, ios::in);
+ load_file.open(load_file_path, std::ios::in);
seq_counter = 0;
// search for the seq_num-th sequence
annot_value = file_data_line.substr(0,space_split_i);
an_annot.end = atoi (annot_value.c_str());
- if (space_split_i == string::npos) // no entry for type or name
+ if (space_split_i == std::string::npos) // no entry for type or name
{
- cout << "seq, annots - no type or name\n";
+ std::cout << "seq, annots - no type or name\n";
an_annot.type = "";
an_annot.name = "";
}
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
an_annot.type = annot_value;
- if (space_split_i == string::npos) // no entry for name
+ if (space_split_i == std::string::npos) // no entry for name
{
- cout << "seq, annots - no name\n";
+ std::cout << "seq, annots - no name\n";
an_annot.name = "";
}
else // get annot name
}
annots.push_back(an_annot); // don't forget to actually add the annot
}
- //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
- // << "-->" << an_annot.type << "::" << an_annot.name << endl;
+ //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ // << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
}
}
load_file.close();
}
-string
-Sequence::rc_motif(string a_motif)
+std::string
+Sequence::rc_motif(std::string a_motif)
{
- string rev_comp;
+ std::string rev_comp;
char conversionTable[257];
int seq_i, table_i, len;
{
conversionTable[table_i] = '~';
}
- // add end of string character for printing out table for testing purposes
+ // add end of std::string character for printing out table for testing purposes
conversionTable[256] = '\0';
// add in the characters for the bases we want to convert (IUPAC)
// finally, the actual conversion loop
for(seq_i = len - 1; seq_i >= 0; seq_i--)
{
- //cout << "** i = " << seq_i << " bp = " <<
+ //std::cout << "** i = " << seq_i << " bp = " <<
table_i = (int) a_motif[seq_i];
rev_comp += conversionTable[table_i];
}
- //cout << "seq: " << a_motif << endl;
- //cout << "rc: " << rev_comp << endl;
+ //std::cout << "seq: " << a_motif << std::endl;
+ //std::cout << "rc: " << rev_comp << std::endl;
return rev_comp;
}
-string
-Sequence::motif_normalize(string a_motif)
+std::string
+Sequence::motif_normalize(std::string a_motif)
{
- string valid_motif;
+ std::string valid_motif;
int seq_i, len;
len = a_motif.length();
else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
valid_motif += 'B';
else {
- string msg = "Letter ";
+ std::string msg = "Letter ";
msg += a_motif[seq_i];
msg += " is not a valid IUPAC symbol";
throw motif_normalize_error(msg);
}
}
- //cout << "valid_motif is: " << valid_motif << endl;
+ //std::cout << "valid_motif is: " << valid_motif << std::endl;
return valid_motif;
}
-void Sequence::add_motif(string a_motif)
+void Sequence::add_motif(std::string a_motif)
{
- vector<int> motif_starts = find_motif(a_motif);
+ std::vector<int> motif_starts = find_motif(a_motif);
- for(vector<int>::iterator motif_start_i = motif_starts.begin();
+ for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
motif_start_i != motif_starts.end();
++motif_start_i)
{
motif_list.clear();
}
-const list<motif>& Sequence::motifs() const
+const std::list<motif>& Sequence::motifs() const
{
return motif_list;
}
-vector<int>
-Sequence::find_motif(string a_motif)
+std::vector<int>
+Sequence::find_motif(std::string a_motif)
{
- vector<int> motif_match_starts;
- string a_motif_rc;
+ std::vector<int> motif_match_starts;
+ std::string a_motif_rc;
motif_match_starts.clear();
- //cout << "motif is: " << a_motif << endl;
+ //std::cout << "motif is: " << a_motif << std::endl;
a_motif = motif_normalize(a_motif);
- //cout << "motif is: " << a_motif << endl;
+ //std::cout << "motif is: " << a_motif << std::endl;
if (a_motif != "")
{
- //cout << "Sequence: none blank motif\n";
+ //std::cout << "Sequence: none blank motif\n";
motif_scan(a_motif, &motif_match_starts);
a_motif_rc = rc_motif(a_motif);
}
void
-Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
+Sequence::motif_scan(std::string a_motif, std::vector<int> * motif_match_starts)
{
char * seq_c;
- string::size_type seq_i;
+ std::string::size_type seq_i;
int motif_i, motif_len;
- // faster to loop thru the sequence as a old c string (ie char array)
+ // faster to loop thru the sequence as a old c std::string (ie char array)
seq_c = (char*)sequence.c_str();
- //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
+ //std::cout << "Sequence: motif, seq len = " << sequence.length() << std::endl;
motif_len = a_motif.length();
- //cout << "motif_length: " << motif_len << endl;
- //cout << "RAAARRRRR\n";
+ //std::cout << "motif_length: " << motif_len << std::endl;
+ //std::cout << "RAAARRRRR\n";
motif_i = 0;
- //cout << "motif: " << a_motif << endl;
+ //std::cout << "motif: " << a_motif << std::endl;
- //cout << "Sequence: motif, length= " << length << endl;
+ //std::cout << "Sequence: motif, length= " << length << std::endl;
seq_i = 0;
while (seq_i < sequence.length())
{
- //cout << seq_c[seq_i];
- //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
+ //std::cout << seq_c[seq_i];
+ //std::cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
if (a_motif[motif_i] =='N')
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- //cout << "!!";
+ //std::cout << "!!";
annot new_motif;
motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
seq_i++;
}
- //cout << endl;
+ //std::cout << std::endl;
}
void Sequence::add_string_annotation(std::string a_seq,
std::string name)
{
- vector<int> seq_starts = find_motif(a_seq);
+ std::vector<int> seq_starts = find_motif(a_seq);
- for(vector<int>::iterator seq_start_i = seq_starts.begin();
+ for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
seq_start_i != seq_starts.end();
++seq_start_i)
{