From a852aac4f69fd5eb7d5a1c6e4118a5e7477f4d36 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Wed, 17 May 2006 08:10:00 +0000 Subject: [PATCH] Try to read FASTA blocks in the annotation file. This also revealed some problems with the Spirit parser not really splitting on whitespace; I should look into how to do per-character parsing. --- alg/sequence.cpp | 60 +++++++++++++++++++++++++++++++------- alg/sequence.hpp | 1 + alg/test/test_sequence.cpp | 31 ++++++++++++-------- 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/alg/sequence.cpp b/alg/sequence.cpp index d5c77c8..f0cb6e5 100644 --- a/alg/sequence.cpp +++ b/alg/sequence.cpp @@ -220,6 +220,7 @@ Sequence::load_annot(fs::path file_path, int start_index, int end_index) { throw mussa_load_error("Sequence File: " + file_path.string() + " not found"); } + // so i should probably be passing the parse function some iterators // but the annotations files are (currently) small, so i think i can // get away with loading the whole file into memory @@ -274,32 +275,67 @@ struct push_back_annot { void operator()(std::string::const_iterator, std::string::const_iterator) const { + std::cout << "adding annot: " << begin << " " << end << " " << name << " " << type << std::endl; annot_list.push_back(annot(begin, end, name, type)); }; }; +struct push_back_seq { + std::list& seq_list; + std::string& name; + std::string& seq; + + push_back_seq(std::list& seq_list_, + std::string& name_, + std::string& seq_) + : seq_list(seq_list_), + name(name_), + seq(seq_) + { + } + + void operator()(std::string::const_iterator, + std::string::const_iterator) const + { + std::cout << "adding seq: " << name << " " << seq << std::endl; + Sequence s(seq); + s.set_header(name); + seq_list.push_back(s); + }; +}; void Sequence::parse_annot(std::string data, int start_index, int end_index) { - std::string species_name; int start=0; int end=0; std::string name; std::string type; - + std::string seq; + std::list query_seqs; bool 
status = spirit::parse(data.begin(), data.end(), //begin grammar ( - (+(spirit::alpha_p))[spirit::assign_a(species_name)] >> - *((spirit::uint_p[spirit::assign_a(start)] >> - spirit::uint_p[spirit::assign_a(end)] >> - (*(spirit::alpha_p))[spirit::assign_a(name)] >> - (*(spirit::alpha_p))[spirit::assign_a(type)] - // to understand, read the comment above - // struct push_back_annot - )[push_back_annot(annots, start, end, name, type)]) + (+(spirit::alpha_p))[spirit::assign_a(species)] >> + *( + // parse an absolute location name + (spirit::uint_p[spirit::assign_a(start)] >> + spirit::uint_p[spirit::assign_a(end)] >> + (*(spirit::alpha_p))[spirit::assign_a(name)]/* >> + (*(spirit::alpha_p))[spirit::assign_a(type)]*/ + // to understand how this group gets set + // read the comment above struct push_back_annot + )[push_back_annot(annots, start, end, type, name)] + | + (spirit::ch_p('>') >> + (*(spirit::alpha_p))[spirit::assign_a(name)] >> + (+(spirit::ch_p('A')| + spirit::ch_p('G')| + spirit::ch_p('C')| + spirit::ch_p('T'))[spirit::assign_a(seq)]) + )[push_back_seq(query_seqs, name, seq)] + ) ), //end grammar spirit::space_p).full; @@ -491,6 +527,10 @@ Sequence::rev_comp() const return rev_comp; } +void Sequence::set_header(std::string &header_) +{ + header = header_; +} const std::string& Sequence::get_header() const diff --git a/alg/sequence.hpp b/alg/sequence.hpp index c3fb53f..3b09312 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -118,6 +118,7 @@ class Sequence std::string::size_type size() const; void clear(); + void set_header(std::string& header); const std::string& get_header() const; //! add a motif to our list of motifs //! 
\throws motif_normalize_error if there's something wrong with a_motif diff --git a/alg/test/test_sequence.cpp b/alg/test/test_sequence.cpp index 1e478fa..c2dea4e 100644 --- a/alg/test/test_sequence.cpp +++ b/alg/test/test_sequence.cpp @@ -67,24 +67,31 @@ BOOST_AUTO_TEST_CASE( sequence_load ) BOOST_AUTO_TEST_CASE( annotation_load ) { string annot_data = "human\n" - "0 10 name type\n" - "10 20 name\n" - "20 30\n" - "15 20 backward\n"; + "0 10 name\n" // type\n" + "10 20 myf\n" + "20 30 myod\n" + "50\t55 anothername\n" + "60 50 backward\n" + ">ident\n" + "GCT\n" + "GCT\n" + ; string s('A',100); s += "GCTGCT"; Sequence seq(s); //istringstream annot_stream(annot_data); seq.parse_annot(annot_data, 0, 0); - typedef std::list annot_list_t; - annot_list_t annots = seq.annotations(); - for(annot_list_t::iterator annot_i = annots.begin(); - annot_i != annots.end(); - ++annot_i) - { - std::cout << "start " << annot_i->start << endl; - } + std::list annots_list = seq.annotations(); + std::vector annots(annots_list.begin(), annots_list.end()); + BOOST_REQUIRE_EQUAL( annots.size(), 5); + BOOST_CHECK_EQUAL( annots[0].start, 0 ); + BOOST_CHECK_EQUAL( annots[0].end, 10 ); + //BOOST_CHECK_EQUAL( annots[0].type, "type"); + BOOST_CHECK_EQUAL( annots[0].name, "name"); + //BOOST_CHECK_EQUAL( annots[1].name, "myf7"); + + //BOOST_CHECK_EQUAL( annots } // ticket:83 when you try to load a sequence from a file that doesn't -- 2.30.2