+// This file is part of the Mussa source distribution.
+// http://mussa.caltech.edu/
+// Contact author: Tristan De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA. It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
+// This file is part of the Mussa source distribution.
+// http://mussa.caltech.edu/
+// Contact author: Tristan De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA. It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
+// ----------------------------------------
+// ---------- sequence.cc -----------
+// ----------------------------------------
+
+#include "alg/sequence.hpp"
+#include "mussa_exceptions.hpp"
+
+#include <string>
+#include <iostream>
+
+using namespace std;
+
+Sequence::Sequence()
+{
+}
+
+Sequence::Sequence(string seq)
+{
+ set_filtered_sequence(seq);
+}
+
+Sequence &Sequence::operator=(const Sequence& s)
+{
+ if (this != &s) {
+ sequence = s.sequence;
+ header = s.header;
+ species = s.species;
+ annots = s.annots;
+ }
+ return *this;
+}
+
+Sequence &Sequence::operator=(const std::string& s)
+{
+ set_filtered_sequence(s);
+ return *this;
+}
+
+//! load a fasta file into a sequence
+/*!
+ * \param file_path the location of the fasta file in the filesystem
+ * \param seq_num which sequence in the file to load
+ * \param start_index starting position in the fasta sequence, 0 for beginning
+ * \param end_index ending position in the fasta sequence, 0 for end
+ * \return error message, empty string if no error. (gag!)
+ */
+void
+Sequence::load_fasta(string file_path, int seq_num,
+ int start_index, int end_index)
+{
+ fstream data_file;
+ string file_data_line;
+ int header_counter = 0;
+ bool read_seq = true;
+ string rev_comp;
+ string sequence_raw;
+ string seq_tmp; // holds sequence during basic filtering
+
+ data_file.open(file_path.c_str(), ios::in);
+
+ if (!data_file)
+ {
+ throw mussa_load_error("Sequence File: " + file_path + " not found");
+ }
+ // if file opened okay, read it
+ else
+ {
+ // search for the header of the fasta sequence we want
+ while ( (!data_file.eof()) && (header_counter < seq_num) )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line.substr(0,1) == ">")
+ header_counter++;
+ }
+
+ header = file_data_line.substr(1);
+
+ sequence_raw = "";
+
+ while ( !data_file.eof() && read_seq )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line.substr(0,1) == ">")
+ read_seq = false;
+ else sequence_raw += file_data_line;
+ }
+
+ data_file.close();
+
+ // Lastly, if subselection of the sequence was specified we keep cut out
+ // and only keep that part
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = sequence_raw.size();
+
+ // sequence filtering for upcasing agctn and convert non AGCTN to N
+ set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
+ }
+}
+
+void Sequence::set_filtered_sequence(const string &old_seq,
+ string::size_type start,
+ string::size_type count)
+{
+ char conversionTable[257];
+
+ if ( count == 0)
+ count = old_seq.size() - start;
+ sequence.clear();
+ sequence.reserve(count);
+
+ // Make a conversion table
+
+ // everything we don't specify below will become 'N'
+ for(int table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = 'N';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // we want these to map to themselves - ie not to change
+ conversionTable[(int)'A'] = 'A';
+ conversionTable[(int)'T'] = 'T';
+ conversionTable[(int)'G'] = 'G';
+ conversionTable[(int)'C'] = 'C';
+ // this is to upcase
+ conversionTable[(int)'a'] = 'A';
+ conversionTable[(int)'t'] = 'T';
+ conversionTable[(int)'g'] = 'G';
+ conversionTable[(int)'c'] = 'C';
+
+ // finally, the actual conversion loop
+ for(string::size_type seq_index = 0; seq_index < count; seq_index++)
+ {
+ sequence += conversionTable[ (int)old_seq[seq_index+start]];
+ }
+}
+
+ // this doesn't work properly under gcc 3.x ... it can't recognize toupper
+ //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
+
+
+void
+Sequence::load_annot(string file_path, int start_index, int end_index)
+{
+ fstream data_file;
+ string file_data_line;
+ annot an_annot;
+ string::size_type space_split_i;
+ string annot_value;
+ list<annot>::iterator list_i;
+ string err_msg;
+
+
+ annots.clear();
+ data_file.open(file_path.c_str(), ios::in);
+
+ if (!data_file)
+ {
+ throw mussa_load_error("Sequence File: " + file_path + " not found");
+ }
+ // if file opened okay, read it
+ else
+ {
+ getline(data_file,file_data_line);
+ species = file_data_line;
+
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = sequence.length();
+
+ //cout << "START: " << start_index << " END: " << end_index << endl;
+
+ while ( !data_file.eof() )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line != "")
+ {
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+
+ //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ // << endl;
+
+ // get annot name
+ space_split_i = file_data_line.find(" ");
+ if (space_split_i == string::npos) // no entries for name & type
+ {
+ cout << "seq, annots - no name or type\n";
+ an_annot.name = "";
+ an_annot.type = "";
+ }
+ else
+ {
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.name = annot_value;
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot type
+ space_split_i = file_data_line.find(" ");
+ if (space_split_i == string::npos) // no entry for type
+ an_annot.type = "";
+ else
+ {
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ }
+ }
+
+
+ // add annot to list if it falls within the range of sequence specified
+ if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
+ {
+ an_annot.start -= start_index;
+ an_annot.end -= start_index;
+ annots.push_back(an_annot);
+ }
+ else
+ cout << "FAILED!!!!!!\n";
+ }
+ }
+
+ data_file.close();
+ /*
+ // debugging check
+ for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
+ {
+ cout << (*list_i).start << "," << (*list_i).end << "\t";
+ cout << (*list_i).name << "\t" << (*list_i).type << endl;
+ }
+ */
+ }
+}
+
+bool Sequence::empty() const
+{
+ return (size() == 0);
+}
+
+const std::list<annot> Sequence::annotations() const
+{
+ return annots;
+}
+
+string::size_type Sequence::length() const
+{
+ return size();
+}
+
+string::size_type Sequence::size() const
+{
+ return sequence.size();
+}
+
+Sequence::iterator Sequence::begin()
+{
+ return sequence.begin();
+}
+
+Sequence::const_iterator Sequence::begin() const
+{
+ return sequence.begin();
+}
+
+Sequence::iterator Sequence::end()
+{
+ return sequence.end();
+}
+
+Sequence::const_iterator Sequence::end() const
+{
+ return sequence.end();
+}
+
+
+const string&
+Sequence::get_seq() const
+{
+ return sequence;
+}
+
+
+string
+Sequence::subseq(int start, int end) const
+{
+ return sequence.substr(start, end);
+}
+
+
+const char *
+Sequence::c_seq() const
+{
+ return sequence.c_str();
+}
+
+string
+Sequence::rev_comp() const
+{
+ string rev_comp;
+ char conversionTable[257];
+ int seq_i, table_i, len;
+
+ len = sequence.length();
+ rev_comp.reserve(len);
+ // make a conversion table
+ // init all parts of conversion table to '~' character
+ // '~' I doubt will ever appear in a sequence file (jeez, I hope)
+ // and may the fleas of 1000 camels infest the genitals of any biologist (and
+ // seven generations of their progeny) who decides to make it mean
+ // something special!!!
+ // PS - double the curse for any smartass non-biologist who tries it as well
+ for(table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = '~';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // add in the characters for the bases we want to convert
+ conversionTable[(int)'A'] = 'T';
+ conversionTable[(int)'T'] = 'A';
+ conversionTable[(int)'G'] = 'C';
+ conversionTable[(int)'C'] = 'G';
+ conversionTable[(int)'N'] = 'N';
+
+ // finally, the actual conversion loop
+ for(seq_i = len - 1; seq_i >= 0; seq_i--)
+ {
+ table_i = (int) sequence[seq_i];
+ rev_comp += conversionTable[table_i];
+ }
+
+ return rev_comp;
+}
+
+
+const string&
+Sequence::get_header() const
+{
+ return header;
+}
+/*
+//FIXME: i don't think this code is callable
+string
+Sequence::sp_name() const
+{
+ return species;
+}
+*/
+
+void
+Sequence::set_seq(const string& a_seq)
+{
+ set_filtered_sequence(a_seq);
+}
+
+
+/*
+string
+Sequence::species()
+{
+ return species;
+}
+*/
+
+void
+Sequence::clear()
+{
+ sequence = "";
+ header = "";
+ species = "";
+ annots.clear();
+}
+
+void
+Sequence::save(fstream &save_file)
+ //string save_file_path)
+{
+ //fstream save_file;
+ list<annot>::iterator annots_i;
+
+ // not sure why, or if i'm doing something wrong, but can't seem to pass
+ // file pointers down to this method from the mussa control class
+ // so each call to save a sequence appends to the file started by mussa_class
+ //save_file.open(save_file_path.c_str(), ios::app);
+
+ save_file << "<Sequence>" << endl;
+ save_file << sequence << endl;
+ save_file << "</Sequence>" << endl;
+
+ save_file << "<Annotations>" << endl;
+ save_file << species << endl;
+ for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
+ {
+ save_file << annots_i->start << " " << annots_i->end << " " ;
+ save_file << annots_i->name << " " << annots_i->type << endl;
+ }
+ save_file << "</Annotations>" << endl;
+ //save_file.close();
+}
+
+void
+Sequence::load_museq(string load_file_path, int seq_num)
+{
+ fstream load_file;
+ string file_data_line;
+ int seq_counter;
+ annot an_annot;
+ string::size_type space_split_i;
+ string annot_value;
+
+ annots.clear();
+ load_file.open(load_file_path.c_str(), ios::in);
+
+ seq_counter = 0;
+ // search for the seq_num-th sequence
+ while ( (!load_file.eof()) && (seq_counter < seq_num) )
+ {
+ getline(load_file,file_data_line);
+ if (file_data_line == "<Sequence>")
+ seq_counter++;
+ }
+ getline(load_file, file_data_line);
+ sequence = file_data_line;
+ getline(load_file, file_data_line);
+ getline(load_file, file_data_line);
+ if (file_data_line == "<Annotations>")
+ {
+ getline(load_file, file_data_line);
+ species = file_data_line;
+ while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+ {
+ getline(load_file,file_data_line);
+ if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+ {
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+
+ if (space_split_i == string::npos) // no entry for type or name
+ {
+ cout << "seq, annots - no type or name\n";
+ an_annot.type = "";
+ an_annot.name = "";
+ }
+ else // else get annot type
+ {
+ file_data_line = file_data_line.substr(space_split_i+1);
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ if (space_split_i == string::npos) // no entry for name
+ {
+ cout << "seq, annots - no name\n";
+ an_annot.name = "";
+ }
+ else // get annot name
+ {
+ file_data_line = file_data_line.substr(space_split_i+1);
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ }
+ }
+ annots.push_back(an_annot); // don't forget to actually add the annot
+ }
+ //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ // << "-->" << an_annot.type << "::" << an_annot.name << endl;
+ }
+ }
+ load_file.close();
+}
+
+
+string
+Sequence::rc_motif(string a_motif)
+{
+ string rev_comp;
+ char conversionTable[257];
+ int seq_i, table_i, len;
+
+ len = a_motif.length();
+ rev_comp.reserve(len);
+
+ for(table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = '~';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // add in the characters for the bases we want to convert (IUPAC)
+ conversionTable[(int)'A'] = 'T';
+ conversionTable[(int)'T'] = 'A';
+ conversionTable[(int)'G'] = 'C';
+ conversionTable[(int)'C'] = 'G';
+ conversionTable[(int)'N'] = 'N';
+ conversionTable[(int)'M'] = 'K';
+ conversionTable[(int)'R'] = 'Y';
+ conversionTable[(int)'W'] = 'W';
+ conversionTable[(int)'S'] = 'S';
+ conversionTable[(int)'Y'] = 'R';
+ conversionTable[(int)'K'] = 'M';
+ conversionTable[(int)'V'] = 'B';
+ conversionTable[(int)'H'] = 'D';
+ conversionTable[(int)'D'] = 'H';
+ conversionTable[(int)'B'] = 'V';
+
+ // finally, the actual conversion loop
+ for(seq_i = len - 1; seq_i >= 0; seq_i--)
+ {
+ //cout << "** i = " << seq_i << " bp = " <<
+ table_i = (int) a_motif[seq_i];
+ rev_comp += conversionTable[table_i];
+ }
+
+ //cout << "seq: " << a_motif << endl;
+ //cout << "rc: " << rev_comp << endl;
+
+ return rev_comp;
+}
+
+string
+Sequence::motif_validate(string a_motif)
+{
+ string valid_motif;
+ int seq_i, len;
+
+ len = a_motif.length();
+ valid_motif.reserve(len);
+
+ // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
+ // current nonIUPAC symbols are omitted, which is not reported atm
+ for(seq_i = 0; seq_i < len; seq_i++)
+ {
+ if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
+ valid_motif += 'A';
+ else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
+ valid_motif += 'T';
+ else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
+ valid_motif += 'G';
+ else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
+ valid_motif += 'C';
+ else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
+ valid_motif += 'N';
+ else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
+ valid_motif += 'M';
+ else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
+ valid_motif += 'R';
+ else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
+ valid_motif += 'W';
+ else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
+ valid_motif += 'S';
+ else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
+ valid_motif += 'Y';
+ else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
+ valid_motif += 'G';
+ else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
+ valid_motif += 'V';
+ else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
+ valid_motif += 'H';
+ else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
+ valid_motif += 'D';
+ else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
+ valid_motif += 'B';
+ }
+
+ //cout << "valid_motif is: " << valid_motif << endl;
+
+ return valid_motif;
+}
+
+
+vector<int>
+Sequence::find_motif(string a_motif)
+{
+ vector<int> motif_match_starts;
+ string a_motif_rc;
+
+
+ motif_match_starts.clear();
+
+ //cout << "motif is: " << a_motif << endl;
+ a_motif = motif_validate(a_motif);
+ //cout << "motif is: " << a_motif << endl;
+
+
+ if (a_motif != "")
+ {
+ //cout << "Sequence: none blank motif\n";
+ motif_scan(a_motif, &motif_match_starts);
+
+ a_motif_rc = rc_motif(a_motif);
+ // make sure not to do search again if it is a palindrome
+ if (a_motif_rc != a_motif)
+ motif_scan(a_motif_rc, &motif_match_starts);
+ }
+ return motif_match_starts;
+}
+
+void
+Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
+{
+ char * seq_c;
+ string::size_type seq_i;
+ int motif_i, motif_len;
+
+ // faster to loop thru the sequence as a old c string (ie char array)
+ seq_c = (char*)sequence.c_str();
+ //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
+ motif_len = a_motif.length();
+
+ //cout << "motif_length: " << motif_len << endl;
+ //cout << "RAAARRRRR\n";
+
+ motif_i = 0;
+
+ //cout << "motif: " << a_motif << endl;
+
+ //cout << "Sequence: motif, length= " << length << endl;
+ seq_i = 0;
+ while (seq_i < sequence.length())
+ {
+ //cout << seq_c[seq_i];
+ //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
+ // this is pretty much a straight translation of Nora's python code
+ // to match iupac letter codes
+ if (a_motif[motif_i] =='N')
+ motif_i++;
+ else if (a_motif[motif_i] == seq_c[seq_i])
+ motif_i++;
+ else if ((a_motif[motif_i] =='M') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='R') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='W') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='S') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='Y') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='K') &&
+ ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='V') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
+ (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[seq_i] =='H') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='D') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='B') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+
+ else
+ {
+ seq_i -= motif_i;
+ motif_i = 0;
+ }
+
+ // end Nora stuff, now we see if a match is found this pass
+ if (motif_i == motif_len)
+ {
+ //cout << "!!";
+ motif_match_starts->push_back(seq_i - motif_len + 1);
+ motif_i = 0;
+ }
+
+ seq_i++;
+ }
+ cout << endl;
+}
+