From 53e101526b30a3f5b6bff8dd37681a00a4746d98 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Thu, 8 Jun 2006 22:23:09 +0000 Subject: [PATCH] improve fasta parsing This extends the parser to use both upper & lower case iupac symbols for our annotation file. (And also I'm trying to simplify the grammar). Additionally this patch also throws an error message when the result of reading a fasta sequence is empty. --- alg/sequence.cpp | 39 ++++++++++++++++++++++++++------------ alg/sequence.hpp | 1 + alg/test/test_sequence.cpp | 2 +- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/alg/sequence.cpp b/alg/sequence.cpp index 903d6c7..3ff96fc 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -27,6 +27,7 @@ namespace fs = boost::filesystem; #include #include #include +#include namespace spirit = boost::spirit; #include "alg/sequence.hpp" @@ -36,6 +37,15 @@ namespace spirit = boost::spirit; #include #include +// some standard dna alphabets +// \012 = nl +// \015 = cr +// this should make our sequence parsing end-of-line convention +// independent +static const char* dna_alphabet = "AaCcGgTtNn\012\015"; +static const char* rna_alphabet = "AaCcGgNnUu\012\015"; +static const char* iupac_alphabet = "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015"; + annot::annot() : start(0), end(0), @@ -57,9 +67,9 @@ motif::motif(int start, std::string motif) sequence(motif) { } - + Sequence::Sequence() - : sequence(""), + : sequence(""), header(""), species("") { @@ -67,7 +77,9 @@ Sequence::Sequence() motif_list.clear(); } -Sequence::Sequence(std::string seq) +Sequence::Sequence(std::string seq) + : header(""), + species("") { set_filtered_sequence(seq); } @@ -159,6 +171,14 @@ Sequence::load_fasta(fs::path file_path, int seq_num, end_index = sequence_raw.size(); // sequence filtering for upcasing agctn and convert non AGCTN to N + if (end_index-start_index <= 0) { + // there doesn't appear to be any sequence + std::stringstream msg; + msg << "The selected sequence in " + << file_path.native_file_string() + << " appears to be empty"; + throw mussa_load_error(msg.str()); + } set_filtered_sequence(sequence_raw, start_index, end_index-start_index); } else { std::stringstream errormsg; @@ -303,7 +323,7 @@ struct push_back_seq { seq_i != seq.end(); ++seq_i) { - if (*seq_i != '\n') new_seq += *seq_i; + if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i; } //std::cout << "adding seq: " << name << " " << new_seq << std::endl; @@ -345,14 +365,9 @@ Sequence::parse_annot(std::string data, int start_index, int end_index) )[push_back_annot(annots, start, end, type, name)] | (spirit::ch_p('>') >> - (*(~spirit::chlit('\n')))[spirit::assign_a(name)] >> - +spirit::space_p >> - (+(spirit::ch_p('A')| - spirit::ch_p('G')| - spirit::ch_p('C')| - spirit::ch_p('T')| - spirit::ch_p('N')| - spirit::ch_p('\n')))[spirit::assign_a(seq)] + (*(spirit::print_p))[spirit::assign_a(name)] >> + spirit::eol_p >> + (+(spirit::chset<>(iupac_alphabet)))[spirit::assign_a(seq)] )[push_back_seq(query_seqs, name, seq)] ) >> *spirit::space_p diff --git a/alg/sequence.hpp b/alg/sequence.hpp index be1f70f..f67ca83 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -139,4 +139,5 @@ class Sequence void save(boost::filesystem::fstream &save_file); void load_museq(boost::filesystem::path load_file_path, int seq_num); }; + #endif diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 0036924..eaf6e5c 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE( annotation_load ) "60 50 backward\n" ">ident3 asdf\n" "GCT\n" - "GCTN\n" + "gCTn\n" "75\t90\tname2\ttype2\n" ; string s(100, 'A'); -- 2.30.2