glseqbrowser.cpp
glsequence.cpp
mussa.cpp
+ motif_parser.cpp
nway_entropy.cpp
nway_other.cpp
nway_paths.cpp
// some standard dna alphabets
// Include nl (\012), and cr (\015) to make sequence parsing eol
// convention independent.
-const Alphabet Alphabet::reduced_dna_alphabet("AaCcGgTtNn\012\015");
-const Alphabet Alphabet::reduced_rna_alphabet("AaCcGgUuNn\012\015");
-const Alphabet Alphabet::reduced_nucleic_alphabet("AaCcGgTtUuNn\012\015");
+const char *Alphabet::reduced_dna_cstr = "AaCcGgTtNn\012\015";
+const char *Alphabet::reduced_rna_cstr = "AaCcGgUuNn\012\015";
+const char *Alphabet::reduced_nucleic_cstr = "AaCcGgTtUuNn\012\015";
//! this is the general iupac alphabet for nucleotides
-const Alphabet Alphabet::nucleic_alphabet(
- "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015"
-);
+const char *Alphabet::nucleic_cstr =
+ "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn-~.?\012\015";
//! the protein alphabet
-const Alphabet Alphabet::protein_alphabet(
- "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015"
-);
+const char *Alphabet::protein_cstr =
+ "AaCcDdEeFfGgHhIiKkLlMmNnPpQqRrSsTtVvWwYy\012\015";
+
+//! Shared Alphabet built from reduced_dna_cstr.
+//! The instance is created the first time this function is called and is
+//! never deleted by this function.
+const Alphabet& Alphabet::reduced_dna_alphabet() {
+  static const Alphabet *const instance = new Alphabet(reduced_dna_cstr);
+  return *instance;
+}
+//! Shared Alphabet built from reduced_rna_cstr.
+//! The instance is created the first time this function is called and is
+//! never deleted by this function.
+const Alphabet& Alphabet::reduced_rna_alphabet() {
+  static const Alphabet *const instance = new Alphabet(reduced_rna_cstr);
+  return *instance;
+}
+//! Shared Alphabet built from reduced_nucleic_cstr.
+//! The instance is created the first time this function is called and is
+//! never deleted by this function.
+const Alphabet& Alphabet::reduced_nucleic_alphabet() {
+  static const Alphabet *const instance = new Alphabet(reduced_nucleic_cstr);
+  return *instance;
+}
+//! Shared Alphabet built from nucleic_cstr.
+//! The instance is created the first time this function is called and is
+//! never deleted by this function.
+const Alphabet& Alphabet::nucleic_alphabet() {
+  static const Alphabet *const instance = new Alphabet(nucleic_cstr);
+  return *instance;
+}
+//! Shared Alphabet built from protein_cstr.
+//! The instance is created the first time this function is called and is
+//! never deleted by this function.
+const Alphabet& Alphabet::protein_alphabet() {
+  static const Alphabet *const instance = new Alphabet(protein_cstr);
+  return *instance;
+}
Alphabet::Alphabet(const char *a) :
alphabet(a)
alphabet_set.insert(alphabet.begin(), alphabet.end());
}
-const char *Alphabet::c_str() const
-{
- alphabet.c_str();
-}
-
bool Alphabet::exists(const char c) const
{
return (alphabet_set.find(c) != alphabet_set.end());
public:
typedef std::string::const_iterator const_iterator;
- //! return reference to the characters in our alphabet
- const char *c_str() const;
//! case-insensitive test to check a character for existence in our alphabet
bool exists(const char) const;
// note, if you want to define an alphabet for a sequence, you probably want
// to update the enumeration in Sequence, and Sequence::get_sequence
//! The standard DNA alphabet, with unique, and unknown characters
- static const Alphabet reduced_dna_alphabet;
+ static const char *reduced_dna_cstr;
+ static const Alphabet &reduced_dna_alphabet();
//! The standard RNA alphabet, with unique, and unknown characters
- static const Alphabet reduced_rna_alphabet;
+ static const char *reduced_rna_cstr;
+ static const Alphabet &reduced_rna_alphabet();
//! The standard DNA/RNA alphabet, with unique, and unknown characters
- static const Alphabet reduced_nucleic_alphabet;
+ static const char *reduced_nucleic_cstr;
+ static const Alphabet &reduced_nucleic_alphabet();
//! this is the general IUPAC alphabet for nucleotides
- static const Alphabet nucleic_alphabet;
+ static const char *nucleic_cstr;
+ static const Alphabet &nucleic_alphabet();
//! the protein alphabet
- static const Alphabet protein_alphabet;
+ static const char *protein_cstr;
+ static const Alphabet &protein_alphabet();
private:
//! what are allowable symbols in our alphabet
--- /dev/null
+#include "mussa_exceptions.hpp"
+#include "alg/alphabet.hpp"
+#include "alg/motif_parser.hpp"
+
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/actor/push_back_actor.hpp>
+#include <boost/spirit/iterator/file_iterator.hpp>
+#include <boost/spirit/utility/chset.hpp>
+namespace spirit = boost::spirit;
+
+#include <sstream>
+#include <stdexcept>
+
+//! Remember which ParsedMotifs receives the parsed color channels.
+motif_parser::push_channel::push_channel(motif_parser::ParsedMotifs *parsed_)
+  : parsed(parsed_)
+{
+}
+
+//! Semantic action for one parsed real number: append it to the list of
+//! color channels collected for the motif currently being parsed.
+void motif_parser::push_channel::operator()(float f) const
+{
+  std::vector<float>& pending_channels = parsed->channels;
+  pending_channels.push_back(f);
+}
+
+//! Remember which ParsedMotifs receives the matched sequence text.
+motif_parser::push_sequence::push_sequence(ParsedMotifs *parsed_)
+  : parsed(parsed_)
+{
+}
+
+//! Semantic action for a matched run of alphabet characters: append the
+//! characters in [start, end) to the pending motif sequence.
+template<typename Iterator>
+void motif_parser::push_sequence::operator()(Iterator start, Iterator end) const
+{
+  // std::string's range append does the same work as copying through a
+  // back_inserter, without needing <algorithm> or <iterator>.
+  parsed->sequence.append(start, end);
+}
+
+//! Remember which ParsedMotifs receives the matched motif name.
+motif_parser::push_name::push_name(ParsedMotifs *parsed_)
+  : parsed(parsed_)
+{
+}
+
+//! Semantic action for a matched motif name: append the characters in
+//! [start, end) to the pending motif name.
+template<typename Iterator>
+void motif_parser::push_name::operator()(Iterator start, Iterator end) const
+{
+  // std::string's range append does the same work as copying through a
+  // back_inserter, without needing <algorithm> or <iterator>.
+  parsed->name.append(start, end);
+}
+
+//! Remember which ParsedMotifs holds the pending motif fields.
+motif_parser::push_motif::push_motif(ParsedMotifs *parsed_)
+  : parsed(parsed_)
+{
+}
+
+//! Semantic action attached to a whole motif entry.
+//!
+//! Builds a Sequence from the pending sequence text and name, turns the
+//! pending channels into a Color (3 channels: red, green, blue with alpha
+//! 1.0; 4 channels: the fourth is alpha), hands the color to the color
+//! mapper, inserts the sequence into the motif set, and finally clears the
+//! pending fields so the next motif starts empty.
+//!
+//! The matched iterator range is not used; it is only present because of the
+//! signature spirit expects for this kind of action.
+//!
+//! \throws std::runtime_error if the number of pending channels is not 3 or 4
+template<typename Iterator>
+void motif_parser::push_motif::operator()(Iterator start, Iterator end) const
+{
+  // The sequence is built before the channels are validated, exactly as
+  // before, so any failure in Sequence still takes precedence.
+  Sequence seq(parsed->sequence, Sequence::nucleic_alphabet);
+  seq.set_fasta_header(parsed->name);
+
+  const bool has_alpha = (parsed->channels.size() == 4);
+  if (!has_alpha && parsed->channels.size() != 3) {
+    throw std::runtime_error("wrong number of channels");
+  }
+  const float red = parsed->channels[0];
+  const float green = parsed->channels[1];
+  const float blue = parsed->channels[2];
+  // alpha is optional in the input and defaults to fully opaque
+  const float alpha = has_alpha ? parsed->channels[3] : 1.0f;
+
+  Color c(red, green, blue, alpha);
+  parsed->color_mapper->appendInstanceColor("motif", seq.c_str(), c);
+  parsed->motifs.insert(seq);
+
+  // reset the scratch fields; a following motif may omit its name
+  parsed->sequence.clear();
+  parsed->name.clear();
+  parsed->channels.clear();
+}
+
+//! Bind the parser to the motif set that receives parsed motifs and to the
+//! color mapper that receives their colors.
+//! \param motifs_ held by reference; must outlive this object
+//! \param color_mapper_ shared pointer copied into this object
+motif_parser::ParsedMotifs::ParsedMotifs(
+    Mussa::motif_set& motifs_,
+    boost::shared_ptr<AnnotationColors> color_mapper_)
+  : motifs(motifs_), color_mapper(color_mapper_)
+{
+}
+
+//! Parse zero or more motif entries from \a data.
+//!
+//! Each entry is: a run of characters from Alphabet::nucleic_cstr,
+//! whitespace, an optional name (a letter followed by printable non-space
+//! characters, or a double-quoted string) followed by whitespace, then three
+//! real numbers and an optional fourth, each followed by whitespace.
+//! Note that the grammar requires whitespace after the last number too.
+//!
+//! Every completed entry is handed to push_motif, which stores it in the
+//! motif set and color mapper this object was constructed with.
+//!
+//! \throws motif_load_error if the whole of \a data could not be consumed;
+//!         the referenced motif set is cleared first, including anything it
+//!         held before this call.
+void motif_parser::ParsedMotifs::parse(const std::string &data)
+{
+  const char *alphabet = Alphabet::nucleic_cstr;
+
+  // The semantic actions below append to these scratch fields before it is
+  // known whether the entry is complete, and a failed parse throws without
+  // clearing them. Start clean so that a second call on the same object
+  // cannot prepend leftovers to its first motif.
+  sequence.clear();
+  name.clear();
+  channels.clear();
+
+  // parse our string
+  spirit::parse_info<std::string::const_iterator> result;
+  result = spirit::parse(data.begin(), data.end(),
+    *(
+      (
+        (
+          (+spirit::chset<>(alphabet))[motif_parser::push_sequence(this)] >>
+          +spirit::space_p
+        ) >>
+        !(
+          (
+            // names can either be letter followed by non-space characters
+            (spirit::alpha_p >> *spirit::graph_p)[motif_parser::push_name(this)]
+            |
+            // or a quoted string
+            (
+              spirit::ch_p('"') >>
+              (+(~spirit::ch_p('"')))[motif_parser::push_name(this)] >>
+              spirit::ch_p('"')
+            )
+          ) >> +spirit::space_p
+        ) >>
+        spirit::real_p[motif_parser::push_channel(this)] >> +spirit::space_p >>
+        spirit::real_p[motif_parser::push_channel(this)] >> +spirit::space_p >>
+        spirit::real_p[motif_parser::push_channel(this)] >> +spirit::space_p >>
+        !(spirit::real_p[motif_parser::push_channel(this)] >> +spirit::space_p)
+      )[push_motif(this)]
+    ));
+  if (not result.full) {
+    std::stringstream msg;
+    // result.length is the number of characters the grammar consumed
+    msg << "Error at character " << result.length;
+    // erase our potentially broken motif list
+    motifs.clear();
+    throw motif_load_error(msg.str());
+  }
+}
\ No newline at end of file
--- /dev/null
+#ifndef MOTIF_PARSER_HPP_
+#define MOTIF_PARSER_HPP_
+
+#include <string>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+
+#include "alg/mussa.hpp"
+#include "alg/annotation_colors.hpp"
+
+namespace motif_parser {
+  //! Parses motif descriptions from text.
+  //!
+  //! Each parsed motif is inserted into the Mussa::motif_set passed to the
+  //! constructor and its color is handed to the AnnotationColors object.
+  //! The push_* functors below are the semantic actions of the grammar; they
+  //! are friends so they can fill in the private scratch fields.
+  class ParsedMotifs {
+  public:
+    //! \param motifs_ set that receives the parsed motifs (held by reference)
+    //! \param color_mapper_ receives one color per parsed motif
+    ParsedMotifs(Mussa::motif_set& motifs_,
+                 boost::shared_ptr<AnnotationColors> color_mapper_);
+
+    //! Parse every motif entry in data; throws motif_load_error if the
+    //! text cannot be consumed completely.
+    void parse(const std::string &data);
+  private:
+    friend struct push_name;
+    friend struct push_sequence;
+    friend struct push_channel;
+    friend struct push_motif;
+
+    // Scratch state for the motif currently being parsed; push_motif clears
+    // all three once the motif has been stored.
+    //! sequence text of the pending motif
+    std::string sequence;
+    //! optional name of the pending motif
+    std::string name;
+    //! color channels of the pending motif, in the order they were parsed
+    std::vector<float> channels;
+
+    //! destination for completed motifs
+    Mussa::motif_set& motifs;
+    //! destination for the color of each completed motif
+    boost::shared_ptr<AnnotationColors> color_mapper;
+  };
+
+  //! Semantic action: append the matched text to the pending motif name.
+  struct push_name {
+    push_name(ParsedMotifs *p);
+    template<typename Iterator>
+    void operator()(Iterator, Iterator) const;
+  private:
+    ParsedMotifs *parsed;
+  };
+  //! Semantic action: append the matched text to the pending motif sequence.
+  struct push_sequence {
+    push_sequence(ParsedMotifs *p);
+    template<typename Iterator>
+    void operator()(Iterator, Iterator) const;
+  private:
+    ParsedMotifs *parsed;
+  };
+  //! Semantic action: append one parsed number to the pending color channels.
+  struct push_channel {
+    push_channel(ParsedMotifs *p);
+    void operator()(float) const;
+  private:
+    ParsedMotifs *parsed;
+  };
+  //! Semantic action: turn the pending fields into a stored, colored motif.
+  struct push_motif {
+    push_motif(ParsedMotifs *p);
+    template<typename Iterator>
+    void operator()(Iterator, Iterator) const;
+  private:
+    ParsedMotifs *parsed;
+  };
+} // namespace motif_parser
+
+#endif /*MOTIF_PARSER_HPP_*/
#include <boost/filesystem/fstream.hpp>
namespace fs = boost::filesystem;
-#include <boost/spirit/core.hpp>
-#include <boost/spirit/actor/push_back_actor.hpp>
-#include <boost/spirit/iterator/file_iterator.hpp>
-#include <boost/spirit/utility/chset.hpp>
-namespace spirit = boost::spirit;
-
#include <iostream>
#include <sstream>
#include "mussa_exceptions.hpp"
-#include "alg/mussa.hpp"
#include "alg/flp.hpp"
+#include "alg/mussa.hpp"
+#include "alg/motif_parser.hpp"
using namespace std;
update_sequences_motifs();
}
-// Helper functor to append created motifs to our Mussa analysis
-struct push_back_motif {
- Mussa::motif_set& motifs;
- boost::shared_ptr<AnnotationColors> color_mapper;
- std::string& seq_string;
- std::string& name;
- float& red;
- float& green;
- float& blue;
- float& alpha;
- int& parsed;
-
- push_back_motif(Mussa::motif_set& motifs_,
- boost::shared_ptr<AnnotationColors> color_mapper_,
- std::string& seq_,
- std::string& name_,
- float &red_, float &green_, float &blue_, float &alpha_,
- int &parsed_)
- : motifs(motifs_),
- color_mapper(color_mapper_),
- seq_string(seq_),
- name(name_),
- red(red_),
- green(green_),
- blue(blue_),
- alpha(alpha_),
- parsed(parsed_)
- {
- }
-
- void operator()(std::string::const_iterator,
- std::string::const_iterator) const
- {
- Sequence seq(seq_string, Sequence::nucleic_alphabet);
- // shouldn't we have a better field than "fasta header" and speices?
- seq.set_fasta_header(name);
- // we need to clear the name in case the next motif doesn't have one.
- name.clear();
- // be nice if glsequence was a subclass of sequence so we could
- // just attach colors directly to the motif.
- Color c(red, green, blue, alpha);
- color_mapper->appendInstanceColor("motif", seq.c_str(), c);
- alpha = 1.0;
- motifs.insert(seq);
- ++parsed;
- };
-};
-
void Mussa::load_motifs(fs::path filename)
{
fs::ifstream f;
f.open(filename, ifstream::in);
load_motifs(f);
}
-
-// I mostly split the ifstream out so I can use a stringstream to test it.
+
void Mussa::load_motifs(std::istream &in)
{
std::string data;
- const char *alphabet = Alphabet::nucleic_alphabet.c_str();
- string seq;
- string name;
- float red = 1.0;
- float green = 0.0;
- float blue = 0.0;
- float alpha = 1.0;
- int parsed = 1;
+ const char *alphabet = Alphabet::nucleic_cstr;
+ motif_parser::ParsedMotifs parsed_motifs(motif_sequences, color_mapper);
// slurp our data into a string
std::streamsize bytes_read = 1;
bytes_read = in.readsome(buf, bufsiz);
data.append(buf, buf+bytes_read);
}
- // parse our string
- bool ok = spirit::parse(data.begin(), data.end(),
- *(
- (
- (
- (+spirit::chset<>(alphabet))[spirit::assign_a(seq)] >>
- +spirit::space_p
- ) >>
- !(
- (
- // names can either be letter followed by non-space characters
- (spirit::alpha_p >> *spirit::graph_p)[spirit::assign_a(name)]
- |
- // or a quoted string
- (
- spirit::ch_p('"') >>
- (+(~spirit::ch_p('"')))[spirit::assign_a(name)] >>
- spirit::ch_p('"')
- )
- ) >> +spirit::space_p
- ) >>
- spirit::real_p[spirit::assign_a(red)] >> +spirit::space_p >>
- spirit::real_p[spirit::assign_a(green)] >> +spirit::space_p >>
- spirit::real_p[spirit::assign_a(blue)] >> +spirit::space_p >>
- !(spirit::real_p[spirit::assign_a(alpha)] >> +spirit::space_p)
- )[push_back_motif(motif_sequences, color_mapper, seq, name, red, green, blue, alpha, parsed)]
- )).full;
- if (not ok) {
- stringstream msg;
- msg << "Error parsing motif #" << parsed;
- // erase our potentially broken motif list
- motif_sequences.clear();
- throw motif_load_error(msg.str());
- }
+ parsed_motifs.parse(data);
update_sequences_motifs();
}
((spirit::ch_p('>')|spirit::str_p(">")) >>
(*(spirit::print_p))[spirit::assign_a(name)] >>
spirit::eol_p >>
- (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
+ (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seq)]
)[push_back_seq(query_seqs, name, seq, parsed)]
) >>
*spirit::space_p
{
switch (alpha) {
case reduced_dna_alphabet:
- return Alphabet::reduced_dna_alphabet;
+ return Alphabet::reduced_dna_alphabet();
case reduced_rna_alphabet:
- return Alphabet::reduced_rna_alphabet;
+ return Alphabet::reduced_rna_alphabet();
case reduced_nucleic_alphabet:
- return Alphabet::reduced_nucleic_alphabet;
+ return Alphabet::reduced_nucleic_alphabet();
case nucleic_alphabet:
- return Alphabet::nucleic_alphabet;
+ return Alphabet::nucleic_alphabet();
case protein_alphabet:
- return Alphabet::protein_alphabet;
+ return Alphabet::protein_alphabet();
default:
throw std::runtime_error("unrecognized alphabet type");
break;
BOOST_AUTO_TEST_CASE( alphabet_simple )
{
- const Alphabet &a = Alphabet::reduced_dna_alphabet;
+ Alphabet a(Alphabet::reduced_dna_alphabet());
// exists is case insensitive
BOOST_CHECK_EQUAL( a.exists('a'), true);
BOOST_CHECK_EQUAL( a.exists('A'), true);
BOOST_CHECK_EQUAL( a.exists('Q'), false);
BOOST_CHECK_EQUAL( a.exists('q'), false);
- BOOST_CHECK_EQUAL( a.c_str(), "AaCcGgTtNn\012\015"); // copied from alphabet.cpp
+ // copied from alphabet.cpp
+ BOOST_CHECK_EQUAL( Alphabet::reduced_dna_cstr, "AaCcGgTtNn\012\015");
}