+// This file is part of the Mussa source distribution.
+// http://mussa.caltech.edu/
+// Contact author: Tristan De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA. It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
+// This file is part of the Mussa source distribution.
+// http://mussa.caltech.edu/
+// Contact author: Tristan De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA. It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
// ----------------------------------------
// ---------- sequence.cc -----------
// ----------------------------------------
}
-void
+string
Sequence::load_fasta(string file_path, int seq_num,
int start_index, int end_index)
{
int seq_i, table_i, len;
string sequence_raw;
char * seq_tmp; // holds sequence during basic filtering
-
+ string err_msg;
data_file.open(file_path.c_str(), ios::in);
-
- // search for the header of the fasta sequence we want
- while ( (!data_file.eof()) && (header_counter < seq_num) )
+ if (!data_file)
{
- getline(data_file,file_data_line);
- if (file_data_line.substr(0,1) == ">")
- header_counter++;
+ err_msg = "Sequence File: " + file_path + " not found";
+ return err_msg;
}
+ // if file opened okay, read it
+ else
+ {
+ // search for the header of the fasta sequence we want
+ while ( (!data_file.eof()) && (header_counter < seq_num) )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line.substr(0,1) == ">")
+ header_counter++;
+ }
- header = file_data_line.substr(1);
-
- sequence_raw = "";
+ header = file_data_line.substr(1);
- while ( !data_file.eof() && read_seq )
- {
- getline(data_file,file_data_line);
- if (file_data_line.substr(0,1) == ">")
- read_seq = false;
- else sequence_raw += file_data_line;
- }
+ sequence_raw = "";
- data_file.close();
+ while ( !data_file.eof() && read_seq )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line.substr(0,1) == ">")
+ read_seq = false;
+ else sequence_raw += file_data_line;
+ }
- // sequence filtering for upcasing agctn and convert non AGCTN to N
+ data_file.close();
- len = sequence_raw.length();
- seq_tmp = (char*)sequence_raw.c_str();
+ // sequence filtering for upcasing agctn and convert non AGCTN to N
- sequence = "";
+ len = sequence_raw.length();
+ seq_tmp = (char*)sequence_raw.c_str();
- // Make a conversion table
+ sequence = "";
- // everything we don't specify below will become 'N'
- for(table_i=0; table_i < 256; table_i++)
- {
- conversionTable[table_i] = 'N';
- }
- // add end of string character for printing out table for testing purposes
- conversionTable[256] = '\0';
+ // Make a conversion table
- // we want these to map to themselves - ie not to change
- conversionTable['A'] = 'A';
- conversionTable['T'] = 'T';
- conversionTable['G'] = 'G';
- conversionTable['C'] = 'C';
- // this is to upcase
- conversionTable['a'] = 'A';
- conversionTable['t'] = 'T';
- conversionTable['g'] = 'G';
- conversionTable['c'] = 'C';
+ // everything we don't specify below will become 'N'
+ for(table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = 'N';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // we want these to map to themselves - ie not to change
+ conversionTable['A'] = 'A';
+ conversionTable['T'] = 'T';
+ conversionTable['G'] = 'G';
+ conversionTable['C'] = 'C';
+ // this is to upcase
+ conversionTable['a'] = 'A';
+ conversionTable['t'] = 'T';
+ conversionTable['g'] = 'G';
+ conversionTable['c'] = 'C';
+
+ // finally, the actual conversion loop
+ for(seq_i = 0; seq_i < len; seq_i++)
+ {
+ table_i = (int) seq_tmp[seq_i];
+ sequence += conversionTable[table_i];
+ }
- // finally, the actual conversion loop
- for(seq_i = 0; seq_i < len; seq_i++)
- {
- table_i = (int) seq_tmp[seq_i];
- sequence += conversionTable[table_i];
- }
+ // Lastly, if subselection of the sequence was specified we keep cut out
+ // and only keep that part
- // Lastly, if subselection of the sequence was specified we keep cut out
- // and only keep that part
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = len;
- // end_index = 0 means no end was specified, so cut to the end
- if (end_index == 0)
- end_index = len;
+ // start_index = 0 if no start was specified
+ sequence = sequence.substr(start_index, end_index - start_index);
+ length = sequence.length();
- // start_index = 0 if no start was specified
- sequence = sequence.substr(start_index, end_index - start_index);
+ return "";
+ }
}
//transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
-void
-Sequence::load_annot(string file_path)
+string
+Sequence::load_annot(string file_path, int start_index, int end_index)
{
fstream data_file;
string file_data_line;
annot an_annot;
int space_split_i;
string annot_value;
+ list<annot>::iterator list_i;
+ string err_msg;
+
annots.clear();
data_file.open(file_path.c_str(), ios::in);
- getline(data_file,file_data_line);
- species = file_data_line;
-
- while ( !data_file.eof() )
+ if (!data_file)
+ {
+ err_msg = "Sequence File: " + file_path + " not found";
+ return err_msg;
+ }
+ // if file opened okay, read it
+ else
{
getline(data_file,file_data_line);
- if (file_data_line != "")
+ species = file_data_line;
+
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = length;
+
+ //cout << "START: " << start_index << " END: " << end_index << endl;
+
+ while ( !data_file.eof() )
{
- // need to get 4 values...almost same code 4 times...
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.start = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot end index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.end = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.name = annot_value;
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
- annots.push_back(an_annot);
- }
- }
+ getline(data_file,file_data_line);
+ if (file_data_line != "")
+ {
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
- data_file.close();
+ cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ << endl;
- list<annot>::iterator list_i;
- for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
- {
- cout << (*list_i).start << "," << (*list_i).end << "\t";
- cout << (*list_i).name << "\t" << (*list_i).type << endl;
+ // get annot name
+ space_split_i = file_data_line.find(" ");
+ if (space_split_i == string::npos) // no entries for name & type
+ {
+ cout << "seq, annots - no name or type\n";
+ an_annot.name = "";
+ an_annot.type = "";
+ }
+ else
+ {
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.name = annot_value;
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot type
+ space_split_i = file_data_line.find(" ");
+ if (space_split_i == string::npos) // no entry for type
+ an_annot.type = "";
+ else
+ {
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ }
+ }
+
+
+ // add annot to list if it falls within the range of sequence specified
+ if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
+ {
+ an_annot.start -= start_index;
+ an_annot.end -= start_index;
+ annots.push_back(an_annot);
+ }
+ else
+ cout << "FAILED!!!!!!\n";
+ }
+ }
+
+ data_file.close();
+ /*
+ // debugging check
+ for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
+ {
+ cout << (*list_i).start << "," << (*list_i).end << "\t";
+ cout << (*list_i).name << "\t" << (*list_i).type << endl;
+ }
+ */
+ return "";
}
}
return sequence;
}
+
+string
+Sequence::subseq(int start, int end)
+{
+ return sequence.substr(start, end);
+}
+
+
const char *
Sequence::c_seq()
{
return header;
}
+string
+Sequence::sp_name()
+{
+ return species;
+}
+
void
Sequence::set_seq(string a_seq)
{
sequence = a_seq;
+ length = sequence.length();
}
save_file << "</Sequence>" << endl;
save_file << "<Annotations>" << endl;
+ save_file << species << endl;
for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
{
save_file << annots_i->start << " " << annots_i->end << " " ;
seq_counter++;
}
getline(load_file, file_data_line);
- //cout << "*fee\n";
sequence = file_data_line;
- //cout << "*fie\n";
+ length = sequence.length();
getline(load_file, file_data_line);
getline(load_file, file_data_line);
if (file_data_line == "<Annotations>")
{
+ getline(load_file, file_data_line);
+ species = file_data_line;
while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
{
- //cout << "*foe\n";
getline(load_file,file_data_line);
if ((file_data_line != "") && (file_data_line != "</Annotations>"))
{
- //cout << "*fum\n";
// need to get 4 values...almost same code 4 times...
// get annot start index
space_split_i = file_data_line.find(" ");
space_split_i = file_data_line.find(" ");
annot_value = file_data_line.substr(0,space_split_i);
an_annot.end = atoi (annot_value.c_str());
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.name = annot_value;
- file_data_line = file_data_line.substr(space_split_i+1);
- // get annot start index
- space_split_i = file_data_line.find(" ");
- annot_value = file_data_line.substr(0,space_split_i);
- an_annot.type = annot_value;
- annots.push_back(an_annot);
+
+ if (space_split_i == string::npos) // no entry for type or name
+ {
+ cout << "seq, annots - no type or name\n";
+ an_annot.type = "";
+ an_annot.name = "";
+ }
+ else // else get annot type
+ {
+ file_data_line = file_data_line.substr(space_split_i+1);
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ if (space_split_i == string::npos) // no entry for name
+ {
+ cout << "seq, annots - no name\n";
+ an_annot.name = "";
+ }
+ else // get annot name
+ {
+ file_data_line = file_data_line.substr(space_split_i+1);
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ }
+ }
+ annots.push_back(an_annot); // don't forget to actually add the annot
}
+ cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+ << "-->" << an_annot.type << "::" << an_annot.name << endl;
}
}
load_file.close();
}
-list<int>
+string
+Sequence::rc_motif(string a_motif)
+{
+ string rev_comp;
+ char conversionTable[257];
+ int seq_i, table_i, len;
+
+ len = a_motif.length();
+ rev_comp.reserve(len);
+
+ for(table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = '~';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // add in the characters for the bases we want to convert (IUPAC)
+ conversionTable['A'] = 'T';
+ conversionTable['T'] = 'A';
+ conversionTable['G'] = 'C';
+ conversionTable['C'] = 'G';
+ conversionTable['N'] = 'N';
+ conversionTable['M'] = 'K';
+ conversionTable['R'] = 'Y';
+ conversionTable['W'] = 'W';
+ conversionTable['S'] = 'S';
+ conversionTable['Y'] = 'R';
+ conversionTable['K'] = 'M';
+ conversionTable['V'] = 'B';
+ conversionTable['H'] = 'D';
+ conversionTable['D'] = 'H';
+ conversionTable['B'] = 'V';
+
+ // finally, the actual conversion loop
+ for(seq_i = len - 1; seq_i >= 0; seq_i--)
+ {
+ //cout << "** i = " << seq_i << " bp = " <<
+ table_i = (int) a_motif[seq_i];
+ rev_comp += conversionTable[table_i];
+ }
+
+ cout << "seq: " << a_motif << endl;
+ cout << "rc: " << rev_comp << endl;
+
+ return rev_comp;
+}
+
+string
+Sequence::motif_validate(string a_motif)
+{
+ string valid_motif;
+ int seq_i, len;
+
+
+ len = a_motif.length();
+ valid_motif.reserve(len);
+
+ // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
+ // current nonIUPAC symbols are omitted, which is not reported atm
+ for(seq_i = 0; seq_i < len; seq_i++)
+ {
+ if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
+ valid_motif += 'A';
+ else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
+ valid_motif += 'T';
+ else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
+ valid_motif += 'G';
+ else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
+ valid_motif += 'C';
+ else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
+ valid_motif += 'N';
+ else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
+ valid_motif += 'M';
+ else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
+ valid_motif += 'R';
+ else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
+ valid_motif += 'W';
+ else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
+ valid_motif += 'S';
+ else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
+ valid_motif += 'Y';
+ else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
+ valid_motif += 'G';
+ else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
+ valid_motif += 'V';
+ else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
+ valid_motif += 'H';
+ else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
+ valid_motif += 'D';
+ else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
+ valid_motif += 'B';
+ }
+
+ cout << "valid_motif is: " << valid_motif << endl;
+
+ return valid_motif;
+}
+
+
+vector<int>
Sequence::find_motif(string a_motif)
+{
+ vector<int> motif_match_starts;
+ string a_motif_rc;
+
+
+ motif_match_starts.clear();
+
+ cout << "motif is: " << a_motif << endl;
+ a_motif = motif_validate(a_motif);
+ //cout << "motif is: " << a_motif << endl;
+
+
+ if (a_motif != "")
+ {
+ cout << "Sequence: none blank motif\n";
+ motif_scan(a_motif, &motif_match_starts);
+
+ a_motif_rc = rc_motif(a_motif);
+ // make sure not to do search again if it is a palindrome
+ if (a_motif_rc != a_motif)
+ motif_scan(a_motif_rc, &motif_match_starts);
+ }
+ return motif_match_starts;
+}
+
+void
+Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
{
char * seq_c;
int seq_i, motif_i, motif_len;
- list<int> motif_match_starts;
- motif_match_starts.clear();
// faster to loop thru the sequence as a old c string (ie char array)
seq_c = (char*)sequence.c_str();
+ //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
motif_len = a_motif.length();
-
- for (seq_i = 0; seq_i < length; seq_i++)
- {
+ //cout << "motif_length: " << motif_len << endl;
+ //cout << "RAAARRRRR\n";
+
+ motif_i = 0;
+ //cout << "motif: " << a_motif << endl;
+
+ //cout << "Sequence: motif, length= " << length << endl;
+ seq_i = 0;
+ while (seq_i < length)
+ {
+ cout << seq_c[seq_i];
+ //if ((seq_i > 10885) && (seq_i < 10917))
+ //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
// this is pretty much a straight translation of Nora's python code
// to match iupac letter codes
- if (a_motif[motif_i] == seq_c[seq_i])
+ if (a_motif[motif_i] =='N')
motif_i++;
- else if (seq_c[seq_i] =='N')
+ else if (a_motif[motif_i] == seq_c[seq_i])
motif_i++;
else if ((a_motif[motif_i] =='M') &&
((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
(seq_c[seq_i]=='T')))
motif_i++;
+
else
+ {
+ seq_i -= motif_i;
motif_i = 0;
+ }
// end Nora stuff, now we see if a match is found this pass
if (motif_i == motif_len)
{
- motif_match_starts.push_back(seq_i - motif_len + 1);
+ //cout << "!!";
+ motif_match_starts->push_back(seq_i - motif_len + 1);
motif_i = 0;
}
- }
- return motif_match_starts;
+ seq_i++;
+ }
+ cout << endl;
}
+
+/*
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.name = annot_value;
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+*/