From f75652f23387cc56ed7e436dd994da2441f40c08 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 9 Jun 2006 21:35:07 +0000 Subject: [PATCH] More robust eol handling for Sequence::load_fasta The load_fasta code used getline() which only uses "native" end of line conventions. Unfortunately for me OS X has "native" and "backwards compatible" eol conventions (which aren't compatible). So this patch will accept CR, CR-LF, LF, and because I was lazy LF-CR. I also unit tested that which required breaking load_fasta into a function that handles loading from a file name and one that works from an iostream. --- alg/sequence.cpp | 125 +++++++++++++++++++++++-------------- alg/sequence.hpp | 13 +++- alg/test/test_sequence.cpp | 28 +++++++++ mussa_exceptions.hpp | 25 ++++++++ 4 files changed, 140 insertions(+), 51 deletions(-) diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 3ff96fc..90b4805 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -112,6 +112,23 @@ std::ostream& operator<<(std::ostream& out, const Sequence& seq) return out; } +static void multiplatform_getline(std::istream& in, std::string& line) +{ + line.clear(); + char c; + in.get(c); + while(in.good() and !(c == '\012' or c == '\015') ) { + line.push_back(c); + in.get(c); + } + // if we have cr-lf eat it + c = in.peek(); + if (c=='\012' or c == '\015') { + in.get(); + } + +} + //! load a fasta file into a sequence /*! * \param file_path the location of the fasta file in the filesystem @@ -120,11 +137,39 @@ std::ostream& operator<<(std::ostream& out, const Sequence& seq) * \param end_index ending position in the fasta sequence, 0 for end * \return error message, empty string if no error. (gag!) */ +void Sequence::load_fasta(fs::path file_path, int seq_num, + int start_index, int end_index) +{ + fs::fstream data_file; + data_file.open(file_path, std::ios::in); + + if (!data_file.good()) + { + throw mussa_load_error("Sequence File: "+file_path.string()+" not found"); + } else { + try { + load_fasta(data_file, seq_num, start_index, end_index); + } catch(sequence_empty_error e) { + // there doesn't appear to be any sequence + // catch and rethrow to include the filename + std::stringstream msg; + msg << "The selected sequence in " + << file_path.native_file_string() + << " appears to be empty"; + throw sequence_empty_error(msg.str()); + } catch(sequence_empty_file_error e) { + std::stringstream errormsg; + errormsg << file_path.native_file_string() + << " did not have any fasta sequences" << std::endl; + throw sequence_empty_file_error(errormsg.str()); + } + } +} + void -Sequence::load_fasta(fs::path file_path, int seq_num, +Sequence::load_fasta(std::iostream& data_file, int seq_num, int start_index, int end_index) { - fs::fstream data_file; std::string file_data_line; int header_counter = 0; bool read_seq = true; @@ -132,61 +177,45 @@ Sequence::load_fasta(fs::path file_path, int seq_num, std::string sequence_raw; std::string seq_tmp; // holds sequence during basic filtering - data_file.open(file_path, std::ios::in); - if (seq_num == 0) { throw mussa_load_error("fasta sequence number is 1 based (can't be 0)"); } - if (!data_file) - { - throw mussa_load_error("Sequence File: " + file_path.string() + " not found"); - } - // if file opened okay, read it - else + + // search for the header of the fasta sequence we want + while ( (!data_file.eof()) && (header_counter < seq_num) ) { - // search for the header of the fasta sequence we want - while ( (!data_file.eof()) && (header_counter < seq_num) ) - { - getline(data_file,file_data_line); - if (file_data_line.substr(0,1) == ">") - header_counter++; - } + multiplatform_getline(data_file, file_data_line); + if (file_data_line.substr(0,1) == ">") + header_counter++; + } - if (header_counter > 0) { - header = file_data_line.substr(1); + if (header_counter > 0) { + header = file_data_line.substr(1); - sequence_raw = ""; + sequence_raw = ""; - while ( !data_file.eof() && read_seq ) { - getline(data_file,file_data_line); - if (file_data_line.substr(0,1) == ">") - read_seq = false; - else sequence_raw += file_data_line; - } + while ( !data_file.eof() && read_seq ) { + multiplatform_getline(data_file,file_data_line); + if (file_data_line.substr(0,1) == ">") + read_seq = false; + else sequence_raw += file_data_line; + } - // Lastly, if subselection of the sequence was specified we keep cut out - // and only keep that part - // end_index = 0 means no end was specified, so cut to the end - if (end_index == 0) - end_index = sequence_raw.size(); - - // sequence filtering for upcasing agctn and convert non AGCTN to N - if (end_index-start_index <= 0) { - // there doesn't appear to be any sequence - std::stringstream msg; - msg << "The selected sequence in " - << file_path.native_file_string() - << " appears to be empty"; - throw mussa_load_error(msg.str()); - } - set_filtered_sequence(sequence_raw, start_index, end_index-start_index); - } else { - std::stringstream errormsg; - errormsg << file_path.native_file_string() - << " did not have any fasta sequences" << std::endl; - throw mussa_load_error(errormsg.str()); + // Lastly, if subselection of the sequence was specified we keep cut out + // and only keep that part + // end_index = 0 means no end was specified, so cut to the end + if (end_index == 0) + end_index = sequence_raw.size(); + + // sequence filtering for upcasing agctn and convert non AGCTN to N + if (end_index-start_index <= 0) { + std::string msg("The selected sequence appears to be empty"); + throw sequence_empty_error(msg); } - data_file.close(); + set_filtered_sequence(sequence_raw, start_index, end_index-start_index); + } else { + std::string errormsg("There were no fasta sequences"); + throw sequence_empty_file_error(errormsg); } } diff --git a/alg/sequence.hpp b/alg/sequence.hpp index f67ca83..4aa6713 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -84,16 +84,23 @@ class Sequence std::string::size_type count=0); //! load sequence AGCT from fasta file - /*! \throws mussa_load_error - */ + //! \throw mussa_load_error + //! \throw sequence_empty_error + //! \throw sequence_empty_file_error void load_fasta(const boost::filesystem::path file_path, int seq_num=1, int start_index=0, int end_index=0); + //! load sequence from stream + //! \throw mussa_load_error + //! \throw sequence_empty_error + //! \throw sequence_empty_file_error + void load_fasta(std::iostream& file, int seq_num=1, + int start_index=0, int end_index=0); //! load sequence annotations //! \throws mussa_load_error void load_annot(const boost::filesystem::path file_path, int start_index, int end_index); //! load sequence annotations //! \throws mussa_load_error - //void load_annot(std::istream& data_stream, int start_index, int end_index); + void load_annot(std::fstream& data_stream, int start_index, int end_index); void parse_annot(std::string data, int start_index, int end_index); const std::list& annotations() const; const std::list& motifs() const; diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index eaf6e5c..2516483 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -21,6 +21,34 @@ BOOST_AUTO_TEST_CASE( sequence_load_exception ) BOOST_CHECK_THROW( s.load_annot("alkejralk", 0, 0), mussa_load_error); } +BOOST_AUTO_TEST_CASE( sequence_eol_conventions ) +{ + string header(">Header"); + string line1("AAAAGGGGCCCCTTTTT"); + string line2("AAAAGGGGCCCCTTTTT"); + int seq_len = line1.size() + line2.size(); + + stringstream cr; + cr << header << "\015" << line1 << "\015" << line2 << "\015"; + Sequence seq_cr; + seq_cr.load_fasta(cr); + + stringstream crlf; + crlf << header << "\015\012" << line1 << "\015\012" << line2 << "\015\012"; + Sequence seq_crlf; + seq_crlf.load_fasta(crlf); + + stringstream lf; + lf << header << "\012" << line1 << "\012" << line2 << "\012"; + Sequence seq_lf; + seq_lf.load_fasta(lf); + + BOOST_CHECK_EQUAL(seq_cr.size(), seq_len); + BOOST_CHECK_EQUAL(seq_crlf.size(), seq_len); + BOOST_CHECK_EQUAL(seq_cr.size(), seq_len); +} + + //! Do simple operations work correctly? BOOST_AUTO_TEST_CASE( sequence_filter ) { diff --git a/mussa_exceptions.hpp b/mussa_exceptions.hpp index f9ee710..aa920b1 100644 --- a/mussa_exceptions.hpp +++ b/mussa_exceptions.hpp @@ -25,6 +25,31 @@ public: mussa_error(msg) {}; }; +//! Error loading sequence +class sequence_load_error : public mussa_load_error +{ +public: + explicit sequence_load_error(const std::string& msg) : + mussa_load_error(msg) {}; +}; + +// Empty sequence +class sequence_empty_error : public sequence_load_error +{ +public: + explicit sequence_empty_error(const std::string& msg) : + sequence_load_error(msg) {}; +}; + +// Empty fasta file +class sequence_empty_file_error : public sequence_load_error +{ +public: + explicit sequence_empty_file_error(const std::string& msg) : + sequence_load_error(msg) {}; +}; + + //! failure running analysis class mussa_analysis_error : public mussa_error { -- 2.30.2