void
-Sequence::load_fasta(string file_path, int seq_num)
+Sequence::load_fasta(string file_path, int seq_num,
+ int start_index, int end_index)
{
fstream data_file;
string file_data_line;
int header_counter = 0;
bool read_seq = true;
+ string rev_comp;
+ char conversionTable[257];
+ int seq_i, table_i, len;
+ string sequence_raw;
+ char * seq_tmp; // holds sequence during basic filtering
data_file.open(file_path.c_str(), ios::in);
header = file_data_line.substr(1);
+ sequence_raw = "";
+
while ( !data_file.eof() && read_seq )
{
getline(data_file,file_data_line);
if (file_data_line.substr(0,1) == ">")
read_seq = false;
- else sequence += file_data_line;
+ else sequence_raw += file_data_line;
}
data_file.close();
- // need more sequence filtering for non AGCTN and convert to N
- transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
+ // sequence filtering for upcasing agctn and convert non AGCTN to N
+
+ len = sequence_raw.length();
+ seq_tmp = (char*)sequence_raw.c_str();
+
+ sequence = "";
+
+ // Make a conversion table
+
+ // everything we don't specify below will become 'N'
+ for(table_i=0; table_i < 256; table_i++)
+ {
+ conversionTable[table_i] = 'N';
+ }
+ // add end of string character for printing out table for testing purposes
+ conversionTable[256] = '\0';
+
+ // we want these to map to themselves - ie not to change
+ conversionTable['A'] = 'A';
+ conversionTable['T'] = 'T';
+ conversionTable['G'] = 'G';
+ conversionTable['C'] = 'C';
+ // this is to upcase
+ conversionTable['a'] = 'A';
+ conversionTable['t'] = 'T';
+ conversionTable['g'] = 'G';
+ conversionTable['c'] = 'C';
+
+ // finally, the actual conversion loop
+ for(seq_i = 0; seq_i < len; seq_i++)
+ {
+ table_i = (int) seq_tmp[seq_i];
+ sequence += conversionTable[table_i];
+ }
+
+
+ // Lastly, if subselection of the sequence was specified we keep cut out
+ // and only keep that part
+
+ // end_index = 0 means no end was specified, so cut to the end
+ if (end_index == 0)
+ end_index = len;
+
+ // start_index = 0 if no start was specified
+ sequence = sequence.substr(start_index, end_index - start_index);
}
+ // this doesn't work properly under gcc 3.x ... it can't recognize toupper
+ //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
+
+
void
Sequence::load_annot(string file_path)
{
fstream data_file;
string file_data_line;
annot an_annot;
+ int space_split_i;
+ string annot_value;
+ annots.clear();
data_file.open(file_path.c_str(), ios::in);
+ getline(data_file,file_data_line);
+ species = file_data_line;
+
+ while ( !data_file.eof() )
+ {
+ getline(data_file,file_data_line);
+ if (file_data_line != "")
+ {
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.name = annot_value;
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ annots.push_back(an_annot);
+ }
+ }
+
data_file.close();
+ list<annot>::iterator list_i;
+ for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
+ {
+ cout << (*list_i).start << "," << (*list_i).end << "\t";
+ cout << (*list_i).name << "\t" << (*list_i).type << endl;
+ }
}
species_num = -4;
annots.clear();
}
+
+void
+Sequence::save(fstream &save_file)
+ //string save_file_path)
+{
+ //fstream save_file;
+ list<annot>::iterator annots_i;
+ int i;
+
+ // not sure why, or if i'm doing something wrong, but can't seem to pass
+ // file pointers down to this method from the mussa control class
+ // so each call to save a sequence appends to the file started by mussa_class
+ //save_file.open(save_file_path.c_str(), ios::app);
+
+ save_file << "<Sequence>" << endl;
+ save_file << sequence << endl;
+ save_file << "</Sequence>" << endl;
+
+ save_file << "<Annotations>" << endl;
+ for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
+ {
+ save_file << annots_i->start << " " << annots_i->end << " " ;
+ save_file << annots_i->name << " " << annots_i->type << endl;
+ }
+ save_file << "</Annotations>" << endl;
+ //save_file.close();
+}
+
+void
+Sequence::load_museq(string load_file_path, int seq_num)
+{
+ fstream load_file;
+ string file_data_line;
+ int seq_counter;
+ annot an_annot;
+ int space_split_i;
+ string annot_value;
+
+ annots.clear();
+ load_file.open(load_file_path.c_str(), ios::in);
+
+ seq_counter = 0;
+ // search for the seq_num-th sequence
+ while ( (!load_file.eof()) && (seq_counter < seq_num) )
+ {
+ getline(load_file,file_data_line);
+ if (file_data_line == "<Sequence>")
+ seq_counter++;
+ }
+ getline(load_file, file_data_line);
+ //cout << "*fee\n";
+ sequence = file_data_line;
+ //cout << "*fie\n";
+ getline(load_file, file_data_line);
+ getline(load_file, file_data_line);
+ if (file_data_line == "<Annotations>")
+ {
+ while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
+ {
+ //cout << "*foe\n";
+ getline(load_file,file_data_line);
+ if ((file_data_line != "") && (file_data_line != "</Annotations>"))
+ {
+ //cout << "*fum\n";
+ // need to get 4 values...almost same code 4 times...
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.start = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot end index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.end = atoi (annot_value.c_str());
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.name = annot_value;
+ file_data_line = file_data_line.substr(space_split_i+1);
+ // get annot start index
+ space_split_i = file_data_line.find(" ");
+ annot_value = file_data_line.substr(0,space_split_i);
+ an_annot.type = annot_value;
+ annots.push_back(an_annot);
+ }
+ }
+ }
+ load_file.close();
+}
+
+
+list<int>
+Sequence::find_motif(string a_motif)
+{
+ char * seq_c;
+ int seq_i, motif_i, motif_len;
+ list<int> motif_match_starts;
+
+ motif_match_starts.clear();
+ // faster to loop thru the sequence as a old c string (ie char array)
+ seq_c = (char*)sequence.c_str();
+ motif_len = a_motif.length();
+
+
+ for (seq_i = 0; seq_i < length; seq_i++)
+ {
+
+ // this is pretty much a straight translation of Nora's python code
+ // to match iupac letter codes
+ if (a_motif[motif_i] == seq_c[seq_i])
+ motif_i++;
+ else if (seq_c[seq_i] =='N')
+ motif_i++;
+ else if ((a_motif[motif_i] =='M') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='R') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='W') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='S') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='Y') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='K') &&
+ ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='V') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
+ (seq_c[seq_i]=='G')))
+ motif_i++;
+ else if ((a_motif[seq_i] =='H') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='D') &&
+ ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+ else if ((a_motif[motif_i] =='B') &&
+ ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
+ (seq_c[seq_i]=='T')))
+ motif_i++;
+ else
+ motif_i = 0;
+
+ // end Nora stuff, now we see if a match is found this pass
+ if (motif_i == motif_len)
+ {
+ motif_match_starts.push_back(seq_i - motif_len + 1);
+ motif_i = 0;
+ }
+ }
+
+ return motif_match_starts;
+}