#include <boost/filesystem/fstream.hpp>
namespace fs = boost::filesystem;
+#include <boost/algorithm/string.hpp>
+
#include <iostream>
#include <sstream>
return soft_thres;
}
+
void
Mussa::set_analysis_mode(enum analysis_modes new_ana_mode)
{
// Mussa::load_mupa_stream -- parse a MUPA parameter stream.
//
// NOTE: this span is a patch hunk; '-' lines are the old implementation and
// '+' lines the new one.  The description below covers the '+' version.
//
// para_file      stream to read parameter lines from.
// file_path_base directory that SEQUENCE and ANNOTATION values are joined
//                onto (via operator/) to form file paths.
//
// Behaviour visible in this function:
//  * clear() is called before any parsing.
//  * Each line is trimmed; blank lines and lines starting with '#' are skipped.
//  * Every remaining line must split into exactly two whitespace-separated
//    tokens (name, value), otherwise mussa_load_error is thrown with the
//    line number and text.  The name is upper-cased before matching.
//  * A SEQUENCE line starts a sequence block (state INSEQUENCE) in which
//    FASTA_INDEX, ANNOTATION, SEQ_START and SEQ_END are accepted.  The
//    pending sequence is handed to load_sequence() when any other token,
//    another SEQUENCE line, or the end of the stream is reached.
//  * An unrecognised name is logged to clog and reported by throwing
//    mussa_load_error.
//  * After the loop, soft_thres is set to threshold and set_dirty(true)
//    is called.
void
Mussa::load_mupa_stream(std::istream& para_file, fs::path& file_path_base)
{
- bool error_occured = false;
- string file_data_line;
- string param, value;
+ std::string line;
+ std::vector< std::string > tokens;
+ int line_count = 0;
+
+ enum parsing_state_enum { START, INSEQUENCE };
+ parsing_state_enum parsing_state = START;
+
+ // parameters of the sequence block currently being parsed
+ fs::path seq_file;
fs::path annot_file;
int split_index, fasta_index;
int sub_seq_start, sub_seq_end;
bool seq_params, did_seq;
- string err_msg;
- bool parsing_path;
- string::size_type new_index, dir_index;
+ std::string err_msg;
// initialize values
clear();
- // setup loop by getting file's first line
- getline(para_file, file_data_line);
- split_index = file_data_line.find(" ");
- param = file_data_line.substr(0,split_index);
- value = file_data_line.substr(split_index+1);
- while (para_file)
+ while (para_file.good())
{
- did_seq = false;
- if (param == "ANA_NAME")
- analysis_name = value;
- else if (param == "APPEND_WIN")
+ ++line_count;
+ // read the next line of the parameter stream
+ multiplatform_getline(para_file, line);
+ // strip leading/trailing whitespace
+ boost::trim(line);
+ // ignore commented out or blank lines
+ if ( line.size() == 0 or line[0] == '#' ) {
+ continue;
+ }
+
+ // split the line on whitespace
+ boost::split(tokens, line, boost::is_space()); // NOTE(review): no token_compress_on, so repeated whitespace yields empty tokens and fails the check below
+ // do we have a name/value pair?
+ if (tokens.size() != 2) {
+ std::stringstream errmsg;
+ errmsg << "Error parsing MUPA file line: "
+ << line_count << std::endl
+ << line;
+ throw mussa_load_error(errmsg.str());
+ }
+
+ boost::to_upper(tokens[0]);
+ // Parameters only accepted while inside a SEQUENCE block
+ if (parsing_state == INSEQUENCE) {
+ // in the following if blocks, if we do
+ // successfully match a token we should continue
+ // on to the next token
+ // but if we don't match a token we want to
+ // fall through to the top level parsing
+
+ if (tokens[0] == "FASTA_INDEX") {
+ fasta_index = atoi(tokens[1].c_str());
+ continue;
+ } else if (tokens[0] == "ANNOTATION") {
+ annot_file = file_path_base / tokens[1];
+ continue;
+ } else if (tokens[0] == "SEQ_START") {
+ sub_seq_start = atoi(tokens[1].c_str());
+ continue;
+ } else if (tokens[0] == "SEQ_END") {
+ sub_seq_end = atoi(tokens[1].c_str());
+ continue;
+ } else {
+ // any other token means we're done with this
+ // sequence so we should load it
+ // (and let the "unknown" token fall through into the
+ // top level token parser)
+ load_sequence(seq_file, annot_file, fasta_index, sub_seq_start,
+ sub_seq_end);
+ parsing_state = START;
+ }
+ }
+ // if we didn't consume a token from the previous if block
+ // try
+ // top level token parsing
+ if (tokens[0] == "ANA_NAME") {
+ analysis_name = tokens[1];
+ } else if (tokens[0] == "APPEND_WIN") {
win_append = true;
- else if (param == "APPEND_THRES")
+ } else if (tokens[0] == "APPEND_THRES") {
thres_append = true;
- else if (param == "SEQUENCE_NUM")
+ } else if (tokens[0] == "SEQUENCE_NUM") {
; // ignore sequence_num now
- else if (param == "WINDOW")
- window = atoi(value.c_str());
- else if (param == "THRESHOLD")
- threshold = atoi(value.c_str());
- else if (param == "SEQUENCE")
- {
- fs::path seq_file = file_path_base / value;
- //cout << "seq_file_name " << seq_files.back() << endl;
+ } else if (tokens[0] == "WINDOW") {
+ window = atoi(tokens[1].c_str());
+ } else if (tokens[0] == "THRESHOLD") {
+ threshold = atoi(tokens[1].c_str());
+ } else if (tokens[0] == "SEQUENCE") {
+ if (parsing_state == INSEQUENCE) {
+ cout << "seq_file_name call2" << seq_file << endl; // NOTE(review): looks like leftover debug output -- confirm
+ load_sequence(seq_file, annot_file, fasta_index, sub_seq_start,
+ sub_seq_end);
+ parsing_state = START;
+ }
+ // reset sequence parameters
+ seq_file = file_path_base / tokens[1];
fasta_index = 1;
annot_file = "";
sub_seq_start = 0;
sub_seq_end = 0;
seq_params = true; // NOTE(review): not read by any '+' line in this hunk
-
- while (para_file && seq_params)
- {
- multiplatform_getline(para_file,file_data_line);
- split_index = file_data_line.find(" ");
- param = file_data_line.substr(0,split_index);
- value = file_data_line.substr(split_index+1);
-
- if (param == "FASTA_INDEX")
- fasta_index = atoi(value.c_str());
- else if (param == "ANNOTATION")
- annot_file = file_path_base / value;
- else if (param == "SEQ_START")
- sub_seq_start = atoi(value.c_str());
- else if (param == "SEQ_END")
- {
- sub_seq_end = atoi(value.c_str());
- }
- //ignore empty lines or that start with '#'
- else if ((param == "") || (param == "#")) {
- // pass
- } else {
- seq_params = false;
- }
- }
- load_sequence(seq_file, annot_file, fasta_index, sub_seq_start,
- sub_seq_end);
- did_seq = true;
- }
- //ignore empty lines or that start with '#'
- else if ( (param.size() == 0) || (param[0] == '#'))
- {} // pass
- else
- {
+ parsing_state = INSEQUENCE;
+ } else {
clog << "Illegal/misplaced mussa parameter in file\n";
- clog << param << "\n";
- error_occured = true;
- }
-
- if (!did_seq)
- {
- multiplatform_getline(para_file,file_data_line);
- split_index = file_data_line.find(" ");
- param = file_data_line.substr(0,split_index);
- value = file_data_line.substr(split_index+1);
- did_seq = false;
+ clog << tokens[0] << "\n";
+ std::stringstream errmsg;
+ errmsg << "Invalid mussa paaramater '"
+ << tokens[0]
+ << "' on line: "
+ << line_count << std::endl
+ << line;
+ throw mussa_load_error(errmsg.str());
+ throw mussa_load_error("Error parsing MUPA file"); // NOTE(review): unreachable -- the throw on the previous line always fires
}
}
- if (error_occured) {
- throw mussa_load_error("Error parsing MUPA file");
+ // if we hit the end of the file and there's a sequence
+ // pending, go ahead and load it
+ if (parsing_state == INSEQUENCE) {
+ load_sequence(seq_file, annot_file, fasta_index, sub_seq_start,
+ sub_seq_end);
}
+
soft_thres = threshold;
- // no file was loaded, signal error
set_dirty(true);
}