1 // ----------------------------------------
2 // ---------- sequence.cc -----------
3 // ----------------------------------------
14 Sequence::load_fasta(string file_path, int seq_num,
15 int start_index, int end_index)
18 string file_data_line;
19 int header_counter = 0;
22 char conversionTable[257];
23 int seq_i, table_i, len;
25 char * seq_tmp; // holds sequence during basic filtering
28 data_file.open(file_path.c_str(), ios::in);
31 // search for the header of the fasta sequence we want
32 while ( (!data_file.eof()) && (header_counter < seq_num) )
34 getline(data_file,file_data_line);
35 if (file_data_line.substr(0,1) == ">")
39 header = file_data_line.substr(1);
43 while ( !data_file.eof() && read_seq )
45 getline(data_file,file_data_line);
46 if (file_data_line.substr(0,1) == ">")
48 else sequence_raw += file_data_line;
53 // sequence filtering for upcasing agctn and convert non AGCTN to N
55 len = sequence_raw.length();
56 seq_tmp = (char*)sequence_raw.c_str();
60 // Make a conversion table
62 // everything we don't specify below will become 'N'
63 for(table_i=0; table_i < 256; table_i++)
65 conversionTable[table_i] = 'N';
67 // add end of string character for printing out table for testing purposes
68 conversionTable[256] = '\0';
70 // we want these to map to themselves - ie not to change
71 conversionTable['A'] = 'A';
72 conversionTable['T'] = 'T';
73 conversionTable['G'] = 'G';
74 conversionTable['C'] = 'C';
76 conversionTable['a'] = 'A';
77 conversionTable['t'] = 'T';
78 conversionTable['g'] = 'G';
79 conversionTable['c'] = 'C';
81 // finally, the actual conversion loop
82 for(seq_i = 0; seq_i < len; seq_i++)
84 table_i = (int) seq_tmp[seq_i];
85 sequence += conversionTable[table_i];
89 // Lastly, if subselection of the sequence was specified we keep cut out
90 // and only keep that part
92 // end_index = 0 means no end was specified, so cut to the end
96 // start_index = 0 if no start was specified
97 sequence = sequence.substr(start_index, end_index - start_index);
98 length = sequence.length();
102 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
103 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
107 Sequence::load_annot(string file_path, int start_index, int end_index)
110 string file_data_line;
114 list<annot>::iterator list_i;
117 data_file.open(file_path.c_str(), ios::in);
119 getline(data_file,file_data_line);
120 species = file_data_line;
122 // end_index = 0 means no end was specified, so cut to the end
126 cout << "START: " << start_index << " END: " << end_index << endl;
128 while ( !data_file.eof() )
130 getline(data_file,file_data_line);
131 if (file_data_line != "")
133 // need to get 4 values...almost same code 4 times...
134 // get annot start index
135 space_split_i = file_data_line.find(" ");
136 annot_value = file_data_line.substr(0,space_split_i);
137 an_annot.start = atoi (annot_value.c_str());
138 file_data_line = file_data_line.substr(space_split_i+1);
139 // get annot end index
140 space_split_i = file_data_line.find(" ");
141 annot_value = file_data_line.substr(0,space_split_i);
142 an_annot.end = atoi (annot_value.c_str());
143 file_data_line = file_data_line.substr(space_split_i+1);
144 // get annot start index
145 space_split_i = file_data_line.find(" ");
146 annot_value = file_data_line.substr(0,space_split_i);
147 an_annot.name = annot_value;
148 file_data_line = file_data_line.substr(space_split_i+1);
149 // get annot start index
150 space_split_i = file_data_line.find(" ");
151 annot_value = file_data_line.substr(0,space_split_i);
152 an_annot.type = annot_value;
154 cout << "ANA_START: " << an_annot.start <<
155 " ANA_END: " << an_annot.end << endl;
157 // add annot to list if it falls within the range of sequence specified
158 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
160 an_annot.start -= start_index;
161 an_annot.end -= start_index;
162 annots.push_back(an_annot);
165 cout << "FAILED!!!!!!\n";
172 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
174 cout << (*list_i).start << "," << (*list_i).end << "\t";
175 cout << (*list_i).name << "\t" << (*list_i).type << endl;
183 return sequence.length();
196 return sequence.c_str();
203 char conversionTable[257];
204 int seq_i, table_i, len;
206 len = sequence.length();
207 rev_comp.reserve(len);
208 // make a conversion table
209 // init all parts of conversion table to '~' character
210 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
211 // and may the fleas of 1000 camels infest the genitals of any biologist (and
212 // seven generations of their progeny) who decides to make it mean
213 // something special!!!
214 // PS - double the curse for any smartass non-biologist who tries it as well
215 for(table_i=0; table_i < 256; table_i++)
217 conversionTable[table_i] = '~';
219 // add end of string character for printing out table for testing purposes
220 conversionTable[256] = '\0';
222 // add in the characters for the bases we want to convert
223 conversionTable['A'] = 'T';
224 conversionTable['T'] = 'A';
225 conversionTable['G'] = 'C';
226 conversionTable['C'] = 'G';
227 conversionTable['N'] = 'N';
229 // finally, the actual conversion loop
230 for(seq_i = len - 1; seq_i >= 0; seq_i--)
232 table_i = (int) sequence[seq_i];
233 rev_comp += conversionTable[table_i];
254 Sequence::set_seq(string a_seq)
280 Sequence::save(fstream &save_file)
281 //string save_file_path)
284 list<annot>::iterator annots_i;
287 // not sure why, or if i'm doing something wrong, but can't seem to pass
288 // file pointers down to this method from the mussa control class
289 // so each call to save a sequence appends to the file started by mussa_class
290 //save_file.open(save_file_path.c_str(), ios::app);
292 save_file << "<Sequence>" << endl;
293 save_file << sequence << endl;
294 save_file << "</Sequence>" << endl;
296 save_file << "<Annotations>" << endl;
297 save_file << species << endl;
298 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
300 save_file << annots_i->start << " " << annots_i->end << " " ;
301 save_file << annots_i->name << " " << annots_i->type << endl;
303 save_file << "</Annotations>" << endl;
308 Sequence::load_museq(string load_file_path, int seq_num)
311 string file_data_line;
318 load_file.open(load_file_path.c_str(), ios::in);
321 // search for the seq_num-th sequence
322 while ( (!load_file.eof()) && (seq_counter < seq_num) )
324 getline(load_file,file_data_line);
325 if (file_data_line == "<Sequence>")
328 getline(load_file, file_data_line);
329 sequence = file_data_line;
330 length = sequence.length();
331 getline(load_file, file_data_line);
332 getline(load_file, file_data_line);
333 if (file_data_line == "<Annotations>")
335 getline(load_file, file_data_line);
336 species = file_data_line;
337 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
339 getline(load_file,file_data_line);
340 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
342 // need to get 4 values...almost same code 4 times...
343 // get annot start index
344 space_split_i = file_data_line.find(" ");
345 annot_value = file_data_line.substr(0,space_split_i);
346 an_annot.start = atoi (annot_value.c_str());
347 file_data_line = file_data_line.substr(space_split_i+1);
348 // get annot end index
349 space_split_i = file_data_line.find(" ");
350 annot_value = file_data_line.substr(0,space_split_i);
351 an_annot.end = atoi (annot_value.c_str());
352 file_data_line = file_data_line.substr(space_split_i+1);
353 // get annot start index
354 space_split_i = file_data_line.find(" ");
355 annot_value = file_data_line.substr(0,space_split_i);
356 an_annot.name = annot_value;
357 file_data_line = file_data_line.substr(space_split_i+1);
358 // get annot start index
359 space_split_i = file_data_line.find(" ");
360 annot_value = file_data_line.substr(0,space_split_i);
361 an_annot.type = annot_value;
362 annots.push_back(an_annot);
371 Sequence::rc_motif(string a_motif)
374 char conversionTable[257];
375 int seq_i, table_i, len;
377 len = a_motif.length();
378 rev_comp.reserve(len);
380 for(table_i=0; table_i < 256; table_i++)
382 conversionTable[table_i] = '~';
384 // add end of string character for printing out table for testing purposes
385 conversionTable[256] = '\0';
387 // add in the characters for the bases we want to convert (IUPAC)
388 conversionTable['A'] = 'T';
389 conversionTable['T'] = 'A';
390 conversionTable['G'] = 'C';
391 conversionTable['C'] = 'G';
392 conversionTable['N'] = 'N';
393 conversionTable['M'] = 'K';
394 conversionTable['R'] = 'Y';
395 conversionTable['W'] = 'W';
396 conversionTable['S'] = 'S';
397 conversionTable['Y'] = 'R';
398 conversionTable['K'] = 'M';
399 conversionTable['V'] = 'B';
400 conversionTable['H'] = 'D';
401 conversionTable['D'] = 'H';
402 conversionTable['B'] = 'V';
404 // finally, the actual conversion loop
405 for(seq_i = len - 1; seq_i >= 0; seq_i--)
407 //cout << "** i = " << seq_i << " bp = " <<
408 table_i = (int) a_motif[seq_i];
409 rev_comp += conversionTable[table_i];
412 cout << "seq: " << a_motif << endl;
413 cout << "rc: " << rev_comp << endl;
420 Sequence::find_motif(string a_motif)
422 vector<int> motif_match_starts;
426 motif_match_starts.clear();
427 motif_scan(a_motif, &motif_match_starts);
429 a_motif_rc = rc_motif(a_motif);
430 // make sure not to do search again if it is a palindrome
431 if (a_motif_rc != a_motif)
432 motif_scan(a_motif_rc, &motif_match_starts);
434 return motif_match_starts;
438 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
441 int seq_i, motif_i, motif_len;
443 // faster to loop thru the sequence as a old c string (ie char array)
444 seq_c = (char*)sequence.c_str();
445 motif_len = a_motif.length();
447 //cout << "motif_length: " << motif_len << endl;
448 //cout << "RAAARRRRR\n";
452 //cout << "length: " << length << endl;
454 while (seq_i < length)
456 // this is pretty much a straight translation of Nora's python code
457 // to match iupac letter codes
458 if (a_motif[motif_i] =='N')
460 else if (a_motif[motif_i] == seq_c[seq_i])
462 else if ((a_motif[motif_i] =='M') &&
463 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
465 else if ((a_motif[motif_i] =='R') &&
466 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
468 else if ((a_motif[motif_i] =='W') &&
469 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
471 else if ((a_motif[motif_i] =='S') &&
472 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
474 else if ((a_motif[motif_i] =='Y') &&
475 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
477 else if ((a_motif[motif_i] =='K') &&
478 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
480 else if ((a_motif[motif_i] =='V') &&
481 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
482 (seq_c[seq_i]=='G')))
484 else if ((a_motif[seq_i] =='H') &&
485 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
486 (seq_c[seq_i]=='T')))
488 else if ((a_motif[motif_i] =='D') &&
489 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
490 (seq_c[seq_i]=='T')))
492 else if ((a_motif[motif_i] =='B') &&
493 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
494 (seq_c[seq_i]=='T')))
502 // end Nora stuff, now we see if a match is found this pass
503 if (motif_i == motif_len)
505 //cout << "MATCH!!!\n";
506 motif_match_starts->push_back(seq_i - motif_len + 1);