namespace spirit = boost::spirit;
#include "alg/sequence.hpp"
+#include "io.hpp"
#include "mussa_exceptions.hpp"
#include <string>
: seq(new SeqSpan(o->seq)),
header(o->header),
species(o->species),
- annotation_list(o->annotation_list),
+ annotation_list(new SeqSpanRefList),
motif_list(o->motif_list)
{
+ // copy over the annotations in the other sequence ref,
+ // attaching them to our current sequence ref
+ for(SeqSpanRefList::const_iterator annot_i = o->annotation_list->begin();
+ annot_i != o->annotation_list->end();
+ ++annot_i)
+ {
+ size_type annot_begin= (*annot_i)->start();
+ size_type annot_count = (*annot_i)->size();
+
+ SeqSpanRef new_annot(seq->subseq(annot_begin, annot_count));
+ new_annot->setAnnotations((*annot_i)->annotations());
+ annotation_list->push_back(new_annot);
+ }
}
Sequence::Sequence(const SeqSpanRef& seq_ref)
return *this;
}
-static void multiplatform_getline(std::istream& in, std::string& line)
-{
- line.clear();
- char c;
- in.get(c);
- while(in.good() and !(c == '\012' or c == '\015') ) {
- line.push_back(c);
- in.get(c);
- }
- // if we have cr-lf eat it
- c = in.peek();
- if (c=='\012' or c == '\015') {
- in.get();
- }
-}
-
void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
{
load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
throw mussa_load_error("Error loading annotation file " + file_path.string());
}
+ try {
+ load_annot(data_stream, start_index, end_index);
+ } catch(annotation_load_error e) {
+ std::ostringstream msg;
+ msg << file_path.native_file_string()
+ << " "
+ << e.what();
+ throw annotation_load_error(msg.str());
+ }
+ data_stream.close();
+}
+
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
// so i should probably be passing the parse function some iterators
// but the annotations files are (currently) small, so i think i can
// get away with loading the whole file into memory
data_stream.get(c);
data.push_back(c);
}
- data_stream.close();
- try {
- parse_annot(data, start_index, end_index);
- } catch(annotation_load_error e) {
- std::ostringstream msg;
- msg << file_path.native_file_string()
- << " "
- << e.what();
- throw annotation_load_error(msg.str());
- }
+ parse_annot(data, start_index, end_index);
}
/* If this works, yikes, this is some brain hurting code.
void operator()(std::string::const_iterator,
std::string::const_iterator) const
{
+ std::string::iterator seq_i = seq.begin();
+ std::string::iterator seq_end = seq.end();
+
+ // This if block is a hack: for some reason spirit was
+ // duplicating the last character if the file didn't end
+ // with a newline.
+ // It checks for the trailing newline and, if it is missing,
+ // removes the last character (which should be the duplicated character).
+ // See test_sequence.cpp:sequence_no_trailing_newline for the test case;
+ // also see ticket:265 for more information.
+ if (seq.size() > 0) {
+ std::string::value_type c = seq[seq.size()-1];
+ if (not (c == '\015' or c == '\012')) {
+ // doesn't end with a new line character
+ seq_end--;
+ }
+ }
+ // end hack
+
// filter out newlines from our sequence
std::string new_seq;
- for(std::string::const_iterator seq_i = seq.begin();
- seq_i != seq.end();
- ++seq_i)
+ for(; seq_i != seq_end; ++seq_i)
{
if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
}
//std::cout << "adding seq: " << name << " " << new_seq << std::endl;
-
Sequence s(new_seq);
s.set_fasta_header(name);
seq_list.push_back(s);
} else {
annot_end = count;
}
-
- SeqSpanRef new_annot(seq->subseq(annot_begin, annot_end));
+ SeqSpanRef new_annot(new_seq.seq->subseq(annot_begin, annot_end));
new_annot->setAnnotations((*annot_i)->annotations());
new_seq.annotation_list->push_back(new_annot);
}
SeqSpanRefList::iterator annots_i;
AnnotationsRef metadata;
- // not sure why, or if i'm doing something wrong, but can't seem to pass
- // file pointers down to this method from the mussa control class
- // so each call to save a sequence appends to the file started by mussa_class
- //save_file.open(save_file_path.c_str(), std::ios::app);
-
save_file << "<Sequence>" << std::endl;
save_file << *this << std::endl;
save_file << "</Sequence>" << std::endl;
//save_file.close();
}
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+// fs::fstream load_file;
+// std::string file_data_line;
+// int seq_counter;
+// //annot an_annot;
+// int annot_begin;
+// int annot_end;
+// std::string annot_name;
+// std::string annot_type;
+//
+// std::string::size_type space_split_i;
+// std::string annot_value;
+//
+// annotation_list.reset(new SeqSpanRefList);
+//
+// load_file.open(load_file_path, std::ios::in);
+//
+// seq_counter = 0;
+// // search for the seq_num-th sequence
+// while ( (!load_file.eof()) && (seq_counter < seq_num) )
+// {
+// getline(load_file,file_data_line);
+// if (file_data_line == "<Sequence>")
+// seq_counter++;
+// }
+// getline(load_file, file_data_line);
+// // looks like the sequence is written as a single line
+// set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+// getline(load_file, file_data_line);
+// getline(load_file, file_data_line);
+// if (file_data_line == "<Annotations>")
+// {
+// getline(load_file, file_data_line);
+// species = file_data_line;
+// while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+// {
+// getline(load_file,file_data_line);
+// if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+// {
+// // need to get 4 values...almost same code 4 times...
+// // get annot start index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_begin = atoi (annot_value.c_str());
+// file_data_line = file_data_line.substr(space_split_i+1);
+// // get annot end index
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// annot_end = atoi (annot_value.c_str());
+//
+// if (space_split_i == std::string::npos) // no entry for type or name
+// {
+// std::cout << "seq, annots - no type or name\n";
+// annot_name = "";
+// annot_type = "";
+// }
+// else // else get annot type
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// //an_annot.type = annot_value;
+// annot_type = annot_value;
+// if (space_split_i == std::string::npos) // no entry for name
+// {
+// std::cout << "seq, annots - no name\n";
+// annot_name = "";
+// }
+// else // get annot name
+// {
+// file_data_line = file_data_line.substr(space_split_i+1);
+// space_split_i = file_data_line.find(" ");
+// annot_value = file_data_line.substr(0,space_split_i);
+// // NOTE(review): this looks wrong -- the name token is stored in annot_type, and annot_name is not assigned in this branch
+// annot_type = annot_value;
+// }
+// }
+// add_annotation(annot_name, annot_type, annot_begin, annot_end);
+// }
+// //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+// // << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+// }
+// }
+// load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
{
- fs::fstream load_file;
+ boost::shared_ptr<Sequence> seq(new Sequence);
std::string file_data_line;
int seq_counter;
//annot an_annot;
std::string::size_type space_split_i;
std::string annot_value;
- annotation_list.reset(new SeqSpanRefList);
-
- load_file.open(load_file_path, std::ios::in);
+ //seq->annotation_list.reset(new SeqSpanRefList);
seq_counter = 0;
- // search for the seq_num-th sequence
+ // search for the next sequence
+ int seq_num = 1;
while ( (!load_file.eof()) && (seq_counter < seq_num) )
{
getline(load_file,file_data_line);
if (file_data_line == "<Sequence>")
seq_counter++;
}
+
+ // Could not find next sequence
+ if (load_file.eof())
+ {
+ seq.reset();
+ return seq;
+ }
+
getline(load_file, file_data_line);
// looks like the sequence is written as a single line
- set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+ seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
{
getline(load_file, file_data_line);
- species = file_data_line;
+ seq->set_species(file_data_line);
while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
{
getline(load_file,file_data_line);
annot_type = annot_value;
}
}
- add_annotation(annot_name, annot_type, annot_begin, annot_end);
+ seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
}
//std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
// << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
}
}
- load_file.close();
+ //load_file.close();
+ return seq;
}