From 944d43be5db80cddba58cecb12bedcbe451bb3f7 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 3 Aug 2007 01:38:30 +0000 Subject: [PATCH] make the mupa parser more robust ticket:268 improved mupa parser --- alg/mussa.cpp | 178 +++++++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 74 deletions(-) diff --git a/alg/mussa.cpp b/alg/mussa.cpp index 0912cb4..cda06ff 100644 --- a/alg/mussa.cpp +++ b/alg/mussa.cpp @@ -17,6 +17,8 @@ #include namespace fs = boost::filesystem; +#include + #include #include @@ -209,6 +211,7 @@ int Mussa::get_soft_threshold() const return soft_thres; } + void Mussa::set_analysis_mode(enum analysis_modes new_ana_mode) { @@ -401,104 +404,131 @@ Mussa::load_mupa_file(fs::path para_file_path) void Mussa::load_mupa_stream(std::istream& para_file, fs::path& file_path_base) { - bool error_occured = false; - string file_data_line; - string param, value; + std::string line; + std::vector< std::string > tokens; + int line_count = 0; + + enum parsing_state_enum { START, INSEQUENCE }; + parsing_state_enum parsing_state = START; + + // sequence file parameters + fs::path seq_file; fs::path annot_file; int split_index, fasta_index; int sub_seq_start, sub_seq_end; bool seq_params, did_seq; - string err_msg; - bool parsing_path; - string::size_type new_index, dir_index; + std::string err_msg; // initialize values clear(); - // setup loop by getting file's first line - getline(para_file, file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - while (para_file) + while (para_file.good()) { - did_seq = false; - if (param == "ANA_NAME") - analysis_name = value; - else if (param == "APPEND_WIN") + ++line_count; + // setup loop by getting file's first line + multiplatform_getline(para_file, line); + // strip leading/trailing whitespace + boost::trim(line); + // ignore commented out or blank lines + if ( line.size() == 0 or line[0] 
== '#' ) { + continue; + } + + // split the line on whitespace + boost::split(tokens, line, boost::is_space()); + // do we have a name/value pair? + if (tokens.size() != 2) { + std::stringstream errmsg; + errmsg << "Error parsing MUPA file line: " + << line_count << std::endl + << line; + throw mussa_load_error(errmsg.str()); + } + + boost::to_upper(tokens[0]); + // Parameters only useful after a sequence block + if (parsing_state == INSEQUENCE) { + // in the following if blocks, if we do + // successfully match a token we should continue + // on to the next token + // but if we don't match a token we want to + // fall through to the top level parsing + + if (tokens[0] == "FASTA_INDEX") { + fasta_index = atoi(tokens[1].c_str()); + continue; + } else if (tokens[0] == "ANNOTATION") { + annot_file = file_path_base / tokens[1]; + continue; + } else if (tokens[0] == "SEQ_START") { + sub_seq_start = atoi(tokens[1].c_str()); + continue; + } else if (tokens[0] == "SEQ_END") { + sub_seq_end = atoi(tokens[1].c_str()); + continue; + } else { + // any other token means we're done with this + // sequence so we should load it + // (and let the "unknown" token fall through into the + // top level token parser) + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); + parsing_state = START; + } + } + // if we didn't consume a token from the previous if block + // try + // top level token parsing + if (tokens[0] == "ANA_NAME") { + analysis_name = tokens[1]; + } else if (tokens[0] == "APPEND_WIN") { win_append = true; - else if (param == "APPEND_THRES") + } else if (tokens[0] == "APPEND_THRES") { thres_append = true; - else if (param == "SEQUENCE_NUM") + } else if (tokens[0] == "SEQUENCE_NUM") { ; // ignore sequence_num now - else if (param == "WINDOW") - window = atoi(value.c_str()); - else if (param == "THRESHOLD") - threshold = atoi(value.c_str()); - else if (param == "SEQUENCE") - { - fs::path seq_file = file_path_base / value; - //cout << 
"seq_file_name " << seq_files.back() << endl; + } else if (tokens[0] == "WINDOW") { + window = atoi(tokens[1].c_str()); + } else if (tokens[0] == "THRESHOLD") { + threshold = atoi(tokens[1].c_str()); + } else if (tokens[0] == "SEQUENCE") { + if (parsing_state == INSEQUENCE) { + cout << "seq_file_name call2" << seq_file << endl; + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); + parsing_state = START; + } + // reset sequence parameters + seq_file = file_path_base / tokens[1]; fasta_index = 1; annot_file = ""; sub_seq_start = 0; sub_seq_end = 0; seq_params = true; - - while (para_file && seq_params) - { - multiplatform_getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - - if (param == "FASTA_INDEX") - fasta_index = atoi(value.c_str()); - else if (param == "ANNOTATION") - annot_file = file_path_base / value; - else if (param == "SEQ_START") - sub_seq_start = atoi(value.c_str()); - else if (param == "SEQ_END") - { - sub_seq_end = atoi(value.c_str()); - } - //ignore empty lines or that start with '#' - else if ((param == "") || (param == "#")) { - // pass - } else { - seq_params = false; - } - } - load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, - sub_seq_end); - did_seq = true; - } - //ignore empty lines or that start with '#' - else if ( (param.size() == 0) || (param[0] == '#')) - {} // pass - else - { + parsing_state = INSEQUENCE; + } else { clog << "Illegal/misplaced mussa parameter in file\n"; - clog << param << "\n"; - error_occured = true; - } - - if (!did_seq) - { - multiplatform_getline(para_file,file_data_line); - split_index = file_data_line.find(" "); - param = file_data_line.substr(0,split_index); - value = file_data_line.substr(split_index+1); - did_seq = false; + clog << tokens[0] << "\n"; + std::stringstream errmsg; + errmsg << "Invalid mussa paaramater '" + << tokens[0] + << "' on 
line: " + << line_count << std::endl + << line; + throw mussa_load_error(errmsg.str()); + throw mussa_load_error("Error parsing MUPA file"); } } - if (error_occured) { - throw mussa_load_error("Error parsing MUPA file"); + // if we hit the end of the file and there's a sequence + // pending, go ahead and load it + if (parsing_state == INSEQUENCE) { + load_sequence(seq_file, annot_file, fasta_index, sub_seq_start, + sub_seq_end); } + soft_thres = threshold; - // no file was loaded, signal error set_dirty(true); } -- 2.30.2