X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=alg%2Fsequence.hpp;h=fc8a0cfe7b2eea9d2690f5adfdc9af5b0b175544;hb=f3c0553a22b0e4ddc39ee45f51725352d92e97f1;hp=b335dcd1dfac1401ce473ef22258334b705d0cd5;hpb=578d0743979ac62b6e55f588b004d98be6cb4c73;p=mussa.git diff --git a/alg/sequence.hpp b/alg/sequence.hpp index b335dcd..fc8a0cf 100644 --- a/alg/sequence.hpp +++ b/alg/sequence.hpp @@ -28,26 +28,37 @@ #include #include +#include #include +#include "alphabet.hpp" +#include "seq.hpp" +#include "seq_span.hpp" + // Sequence data class -//! Attach annotation information to a sequence track -struct annot +/* The way that motifs are found currently doesn't really + * indicate that the match was a reverse compliment + */ +struct motif { - annot(); - annot(int begin, int end, std::string type, std::string name); - ~annot(); - + motif(); + //motif(int begin, int end, std::string type, std::string name); + //! this constructor is for when we're adding motifs to our annotations + motif(int begin, std::string motif); + ~motif(); + int begin; int end; std::string type; std::string name; + std::string sequence; + + friend bool operator==(const motif& left, const motif& right); - friend bool operator==(const annot& left, const annot& right); -private: // boost::serialization support +private: friend class boost::serialization::access; template void serialize(Archive& ar, const unsigned int /*version*/) { @@ -55,114 +66,102 @@ private: ar & BOOST_SERIALIZATION_NVP(end); ar & BOOST_SERIALIZATION_NVP(type); ar & BOOST_SERIALIZATION_NVP(name); - } -}; -BOOST_CLASS_EXPORT(annot); - - -/* The way that motifs are found currently doesn't really - * indicate that the match was a reverse compliment - */ -struct motif : public annot -{ - std::string sequence; - - motif() : annot(), sequence("") {}; - //! this constructor is for when we're adding motifs to our annotations - motif(int begin, std::string motif); - ~motif(); - - // boost::serialization support -private: - friend class boost::serialization::access; - template - void serialize(Archive& ar, const unsigned int /*version*/) { - ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(annot); ar & BOOST_SERIALIZATION_NVP(sequence); } }; BOOST_CLASS_EXPORT(motif); -//! the only purpose of this class is that the shared_ptr template -//! functions need the serialization support to be in-class. -class seq_string : public std::string -{ -private: - friend class boost::serialization::access; - template - void serialize(Archive& ar, const unsigned int /*version*/) { - //ar & BOOST_SERIALIZATION_BASE_OBJECT_NVP(std::string); - ar & boost::serialization::make_nvp("bases", - boost::serialization::base_object(*this) - ); - } -}; +class Sequence; +typedef boost::shared_ptr SequenceRef; //! sequence track for mussa. class Sequence { public: - typedef std::string::difference_type difference_type; - typedef std::string::iterator iterator; - typedef std::string::const_iterator const_iterator; - typedef std::string::reference reference; - typedef std::string::const_reference const_reference; - typedef std::string::size_type size_type; - static const size_type npos = std::string::npos; - - // some standard dna alphabets - // Include nl (\012), and cr (\015) to make sequence parsing eol - // convention independent. - - static const std::string dna_alphabet; - static const std::string rna_alphabet; - //! this is the general iupac alphabet for nucleotides - static const std::string nucleic_iupac_alphabet; - //! the protein alphabet - static const std::string protein_alphabet; - - Sequence(); - ~Sequence(); - Sequence(const char* seq); - Sequence(const std::string& seq); + typedef SeqString::value_type value_type; + typedef SeqString::difference_type difference_type; + typedef SeqString::iterator iterator; + typedef SeqString::reverse_iterator reverse_iterator; + typedef SeqString::const_iterator const_iterator; + typedef SeqString::const_reverse_iterator const_reverse_iterator; + typedef SeqString::reference reference; + typedef SeqString::const_reference const_reference; + typedef SeqString::size_type size_type; + static const size_type npos = SeqString::npos; + + typedef std::list MotifList; + typedef boost::shared_ptr MotifListRef; + + Sequence(AlphabetRef a = reduced_dna_alphabet); + Sequence(const char* seq, + AlphabetRef a = reduced_dna_alphabet, + SeqSpan::strand_type strand = SeqSpan::PlusStrand); + Sequence(const std::string& seq, + AlphabetRef a = reduced_dna_alphabet, + SeqSpan::strand_type strand = SeqSpan::PlusStrand); + //! make a new sequence, with the same SeqSpan Sequence(const Sequence& seq); + //! make a new sequence, with the same SeqSpan + Sequence(const Sequence *); + //! Make a new sequence using a copy of SeqSpan + Sequence(const SequenceRef); + Sequence(const SeqSpanRef&); + ~Sequence(); //! assignment to constant sequences Sequence &operator=(const Sequence&); friend std::ostream& operator<<(std::ostream&, const Sequence&); friend bool operator<(const Sequence&, const Sequence&); friend bool operator==(const Sequence&, const Sequence&); + friend bool operator!=(const Sequence&, const Sequence&); const_reference operator[](size_type) const; //! set sequence to a (sub)string containing nothing but AGCTN - void set_filtered_sequence(const std::string& seq, - std::string::size_type start=0, - std::string::size_type count=0); + void set_filtered_sequence(const std::string& seq, + AlphabetRef a=reduced_dna_alphabet, + size_type start=0, + size_type count=npos, + SeqSpan::strand_type strand=SeqSpan::PlusStrand); //! retrive element at specific position - const_reference at(size_type n) const; + const_reference at(size_type i) const { return seq->at(i); } //! clear the sequence and its annotations void clear(); - //! return c pointer to the sequence data - const char *c_str() const; + //! return a non-null terminated c pointer to the sequence data + const char *data() const { return seq->data(); } //! forward iterator - const_iterator begin() const; + const_iterator begin() const { return seq->begin(); } //! last iterator - const_iterator end() const; + const_iterator end() const { return seq->end(); } //! is our sequence empty? - bool empty() const; + bool empty() const { return (seq) ? seq->empty() : true ; } + //! find first + size_type find_first_not_of(const std::string& q, size_type index=0) { return seq->find_first_not_of(q, index); } //! how many base pairs are there in our sequence - size_type size() const; + size_type size() const { return (seq) ? seq->size() : 0; } //! alias for size (used by string) - size_type length() const; + size_type length() const { return size(); } + //! reverse iterator + const_reverse_iterator rbegin() const { return seq->rbegin(); } + //! reverse end iterator + const_reverse_iterator rend() const { return seq->rend(); } + //! is our sequence empty? + //! start position relative to "base" sequence + size_type start() const { return seq->parentStart(); } + //! one past the last position relative to "base" sequence + size_type stop() const { return seq->parentStop(); } //! return a subsequence, copying over any appropriate annotation - Sequence subseq(int start=0, int count = std::string::npos) const; - //! return a reverse compliment - std::string rev_comp() const; + Sequence subseq(size_type start=0, + size_type count = npos, + SeqSpan::strand_type strand = SeqSpan::SameStrand) const; + //! reverse a character + std::string create_reverse_map() const; + //! return a reverse compliment (this needs to be improved?) + Sequence rev_comp() const; //! set sequence (filtered) - void set_sequence(const std::string &); + void set_sequence(const std::string &, AlphabetRef); //! get sequence std::string get_sequence() const; //! set species name @@ -175,65 +174,82 @@ public: std::string get_fasta_header() const; //! get name (will return the first non-empty, of fasta_header, species) std::string get_name() const; + //! return a reference to whichever alphabet we're currently representing + const Alphabet& get_alphabet() const; + //! load sequence from fasta file using the sequences current alphabet + void load_fasta(const boost::filesystem::path file_path, int seq_num=1, + int start_index=0, int end_index=0); //! load sequence AGCT from fasta file //! \throw mussa_load_error //! \throw sequence_empty_error //! \throw sequence_empty_file_error - void load_fasta(const boost::filesystem::path file_path, int seq_num=1, - int start_index=0, int end_index=0); + void load_fasta(const boost::filesystem::path file_path, + AlphabetRef a, + int seq_num=1, + int start_index=0, int end_index=0); + void load_fasta(std::istream& file, + int seq_num=1, int start_index=0, int end_index=0); //! load sequence from stream //! \throw mussa_load_error //! \throw sequence_empty_error //! \throw sequence_empty_file_error - void load_fasta(std::iostream& file, int seq_num=1, - int start_index=0, int end_index=0); + void load_fasta(std::istream& file, + AlphabetRef a, + int seq_num=1, + int start_index=0, int end_index=0); //! load sequence annotations //! \throws mussa_load_error void load_annot(const boost::filesystem::path file_path, int start_index, int end_index); //! load sequence annotations //! \throws mussa_load_error - void load_annot(std::fstream& data_stream, int start_index, int end_index); - bool parse_annot(std::string data, int start_index=0, int end_index=0); + void load_annot(std::istream& data_stream, int start_index, int end_index); + //! parse annotation file + /*! \throws annotation_load_error + */ + void parse_annot(std::string data, int start_index=0, int end_index=0); //! add an annotation to our list of annotations - void add_annotation(const annot& a); - const std::list& annotations() const; - const std::list& motifs() const; + void add_annotation(const SeqSpanRef a); + //! add an annotation using tristan's mussa file paramenters + void add_annotation(std::string name, std::string type, size_type start, size_type stop); + //! create an initialized annotation with the "standard" types. + SeqSpanRef make_annotation(std::string name, std::string type, size_type start, size_type stop) const; + const SeqSpanRefList& annotations() const; + + const MotifList& motifs() const; //! add a motif to our list of motifs - //! \throws motif_normalize_error if there's something wrong with a_motif void add_motif(const Sequence& a_motif); //! clear our list of found motifs void clear_motifs(); //! search a sequence for a_motif //! \throws motif_normalize_error if there's something wrong with a_motif - std::vector find_motif(const std::string& a_motif) const; - //! search a sequence for a_motif - //! \throws motif_normalize_error if there's something wrong with a_motif - std::vector find_motif(const Sequence& a_motif) const; - //! convert IUPAC symbols to upperase - //! \throws motif_normalize_error if there is an invalid symbol - static std::string motif_normalize(const std::string& a_motif); - + std::vector find_motif(const Sequence& a_motif) const; //! annotate the current sequence with other sequences void find_sequences(std::list::iterator start, std::list::iterator end); + SeqSpanRef seqspan() { return seq; } void save(boost::filesystem::fstream &save_file); - void load_museq(boost::filesystem::path load_file_path, int seq_num); + //void load_museq(boost::filesystem::path load_file_path, int seq_num); + static SequenceRef load_museq(boost::filesystem::fstream& load_file); -private: - boost::shared_ptr seq; +protected: + SeqSpanRef seq; + //! fasta header std::string header; //! species name std::string species; - //! store our oldstyle annotations - std::list annots; + //! store annotation regions + SeqSpanRefListRef annotation_list; //! a seperate list for motifs since we're currently not saving them - std::list motif_list; + MotifListRef motif_list; + + //! copy over all our annotation children + void copy_children(Sequence &, size_type start, size_type count) const; - void motif_scan(std::string a_motif, std::vector * motif_match_starts) const; + void motif_scan(const Sequence& a_motif, std::vector * motif_match_starts) const; std::string rc_motif(std::string a_motif) const; //! look for a string sequence type and and it to an annotation list void add_string_annotation(std::string a_seq, std::string name); @@ -245,10 +261,9 @@ private: ar & BOOST_SERIALIZATION_NVP(seq); ar & BOOST_SERIALIZATION_NVP(header); ar & BOOST_SERIALIZATION_NVP(species); - ar & BOOST_SERIALIZATION_NVP(annots); + ar & BOOST_SERIALIZATION_NVP(annotation_list); ar & BOOST_SERIALIZATION_NVP(motif_list); } }; BOOST_CLASS_EXPORT(Sequence); - #endif