1 // ----------------------------------------
2 // ---------- sequence.cc -----------
3 // ----------------------------------------
14 Sequence::load_fasta(string file_path, int seq_num,
15 int start_index, int end_index)
18 string file_data_line;
19 int header_counter = 0;
22 char conversionTable[257];
23 int seq_i, table_i, len;
25 char * seq_tmp; // holds sequence during basic filtering
28 data_file.open(file_path.c_str(), ios::in);
32 err_msg = "Sequence File: " + file_path + " not found";
35 // if file opened okay, read it
38 // search for the header of the fasta sequence we want
39 while ( (!data_file.eof()) && (header_counter < seq_num) )
41 getline(data_file,file_data_line);
42 if (file_data_line.substr(0,1) == ">")
46 header = file_data_line.substr(1);
50 while ( !data_file.eof() && read_seq )
52 getline(data_file,file_data_line);
53 if (file_data_line.substr(0,1) == ">")
55 else sequence_raw += file_data_line;
60 // sequence filtering for upcasing agctn and convert non AGCTN to N
62 len = sequence_raw.length();
63 seq_tmp = (char*)sequence_raw.c_str();
67 // Make a conversion table
69 // everything we don't specify below will become 'N'
70 for(table_i=0; table_i < 256; table_i++)
72 conversionTable[table_i] = 'N';
74 // add end of string character for printing out table for testing purposes
75 conversionTable[256] = '\0';
77 // we want these to map to themselves - ie not to change
78 conversionTable['A'] = 'A';
79 conversionTable['T'] = 'T';
80 conversionTable['G'] = 'G';
81 conversionTable['C'] = 'C';
83 conversionTable['a'] = 'A';
84 conversionTable['t'] = 'T';
85 conversionTable['g'] = 'G';
86 conversionTable['c'] = 'C';
88 // finally, the actual conversion loop
89 for(seq_i = 0; seq_i < len; seq_i++)
91 table_i = (int) seq_tmp[seq_i];
92 sequence += conversionTable[table_i];
96 // Lastly, if subselection of the sequence was specified we keep cut out
97 // and only keep that part
99 // end_index = 0 means no end was specified, so cut to the end
103 // start_index = 0 if no start was specified
104 sequence = sequence.substr(start_index, end_index - start_index);
105 length = sequence.length();
112 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
113 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
117 Sequence::load_annot(string file_path, int start_index, int end_index)
120 string file_data_line;
124 list<annot>::iterator list_i;
129 data_file.open(file_path.c_str(), ios::in);
133 err_msg = "Sequence File: " + file_path + " not found";
136 // if file opened okay, read it
139 getline(data_file,file_data_line);
140 species = file_data_line;
142 // end_index = 0 means no end was specified, so cut to the end
146 //cout << "START: " << start_index << " END: " << end_index << endl;
148 while ( !data_file.eof() )
150 getline(data_file,file_data_line);
151 if (file_data_line != "")
153 // need to get 4 values...almost same code 4 times...
154 // get annot start index
155 space_split_i = file_data_line.find(" ");
156 annot_value = file_data_line.substr(0,space_split_i);
157 an_annot.start = atoi (annot_value.c_str());
158 file_data_line = file_data_line.substr(space_split_i+1);
159 // get annot end index
160 space_split_i = file_data_line.find(" ");
161 annot_value = file_data_line.substr(0,space_split_i);
162 an_annot.end = atoi (annot_value.c_str());
163 file_data_line = file_data_line.substr(space_split_i+1);
165 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
169 space_split_i = file_data_line.find(" ");
170 if (space_split_i == string::npos) // no entries for name & type
172 cout << "seq, annots - no name or type\n";
178 annot_value = file_data_line.substr(0,space_split_i);
179 an_annot.name = annot_value;
180 file_data_line = file_data_line.substr(space_split_i+1);
182 space_split_i = file_data_line.find(" ");
183 if (space_split_i == string::npos) // no entry for type
187 annot_value = file_data_line.substr(0,space_split_i);
188 an_annot.type = annot_value;
193 // add annot to list if it falls within the range of sequence specified
194 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
196 an_annot.start -= start_index;
197 an_annot.end -= start_index;
198 annots.push_back(an_annot);
201 cout << "FAILED!!!!!!\n";
208 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
210 cout << (*list_i).start << "," << (*list_i).end << "\t";
211 cout << (*list_i).name << "\t" << (*list_i).type << endl;
222 return sequence.length();
234 Sequence::subseq(int start, int end)
236 return sequence.substr(start, end);
243 return sequence.c_str();
250 char conversionTable[257];
251 int seq_i, table_i, len;
253 len = sequence.length();
254 rev_comp.reserve(len);
255 // make a conversion table
256 // init all parts of conversion table to '~' character
257 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
258 // and may the fleas of 1000 camels infest the genitals of any biologist (and
259 // seven generations of their progeny) who decides to make it mean
260 // something special!!!
261 // PS - double the curse for any smartass non-biologist who tries it as well
262 for(table_i=0; table_i < 256; table_i++)
264 conversionTable[table_i] = '~';
266 // add end of string character for printing out table for testing purposes
267 conversionTable[256] = '\0';
269 // add in the characters for the bases we want to convert
270 conversionTable['A'] = 'T';
271 conversionTable['T'] = 'A';
272 conversionTable['G'] = 'C';
273 conversionTable['C'] = 'G';
274 conversionTable['N'] = 'N';
276 // finally, the actual conversion loop
277 for(seq_i = len - 1; seq_i >= 0; seq_i--)
279 table_i = (int) sequence[seq_i];
280 rev_comp += conversionTable[table_i];
301 Sequence::set_seq(string a_seq)
327 Sequence::save(fstream &save_file)
328 //string save_file_path)
331 list<annot>::iterator annots_i;
334 // not sure why, or if i'm doing something wrong, but can't seem to pass
335 // file pointers down to this method from the mussa control class
336 // so each call to save a sequence appends to the file started by mussa_class
337 //save_file.open(save_file_path.c_str(), ios::app);
339 save_file << "<Sequence>" << endl;
340 save_file << sequence << endl;
341 save_file << "</Sequence>" << endl;
343 save_file << "<Annotations>" << endl;
344 save_file << species << endl;
345 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
347 save_file << annots_i->start << " " << annots_i->end << " " ;
348 save_file << annots_i->name << " " << annots_i->type << endl;
350 save_file << "</Annotations>" << endl;
355 Sequence::load_museq(string load_file_path, int seq_num)
358 string file_data_line;
365 load_file.open(load_file_path.c_str(), ios::in);
368 // search for the seq_num-th sequence
369 while ( (!load_file.eof()) && (seq_counter < seq_num) )
371 getline(load_file,file_data_line);
372 if (file_data_line == "<Sequence>")
375 getline(load_file, file_data_line);
376 sequence = file_data_line;
377 length = sequence.length();
378 getline(load_file, file_data_line);
379 getline(load_file, file_data_line);
380 if (file_data_line == "<Annotations>")
382 getline(load_file, file_data_line);
383 species = file_data_line;
384 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
386 getline(load_file,file_data_line);
387 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
389 // need to get 4 values...almost same code 4 times...
390 // get annot start index
391 space_split_i = file_data_line.find(" ");
392 annot_value = file_data_line.substr(0,space_split_i);
393 an_annot.start = atoi (annot_value.c_str());
394 file_data_line = file_data_line.substr(space_split_i+1);
395 // get annot end index
396 space_split_i = file_data_line.find(" ");
397 annot_value = file_data_line.substr(0,space_split_i);
398 an_annot.end = atoi (annot_value.c_str());
400 if (space_split_i == string::npos) // no entry for type or name
402 cout << "seq, annots - no type or name\n";
406 else // else get annot type
408 file_data_line = file_data_line.substr(space_split_i+1);
409 space_split_i = file_data_line.find(" ");
410 annot_value = file_data_line.substr(0,space_split_i);
411 an_annot.type = annot_value;
412 if (space_split_i == string::npos) // no entry for name
414 cout << "seq, annots - no name\n";
417 else // get annot name
419 file_data_line = file_data_line.substr(space_split_i+1);
420 space_split_i = file_data_line.find(" ");
421 annot_value = file_data_line.substr(0,space_split_i);
422 an_annot.type = annot_value;
425 annots.push_back(an_annot); // don't forget to actually add the annot
427 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
428 << "-->" << an_annot.type << "::" << an_annot.name << endl;
436 Sequence::rc_motif(string a_motif)
439 char conversionTable[257];
440 int seq_i, table_i, len;
442 len = a_motif.length();
443 rev_comp.reserve(len);
445 for(table_i=0; table_i < 256; table_i++)
447 conversionTable[table_i] = '~';
449 // add end of string character for printing out table for testing purposes
450 conversionTable[256] = '\0';
452 // add in the characters for the bases we want to convert (IUPAC)
453 conversionTable['A'] = 'T';
454 conversionTable['T'] = 'A';
455 conversionTable['G'] = 'C';
456 conversionTable['C'] = 'G';
457 conversionTable['N'] = 'N';
458 conversionTable['M'] = 'K';
459 conversionTable['R'] = 'Y';
460 conversionTable['W'] = 'W';
461 conversionTable['S'] = 'S';
462 conversionTable['Y'] = 'R';
463 conversionTable['K'] = 'M';
464 conversionTable['V'] = 'B';
465 conversionTable['H'] = 'D';
466 conversionTable['D'] = 'H';
467 conversionTable['B'] = 'V';
469 // finally, the actual conversion loop
470 for(seq_i = len - 1; seq_i >= 0; seq_i--)
472 //cout << "** i = " << seq_i << " bp = " <<
473 table_i = (int) a_motif[seq_i];
474 rev_comp += conversionTable[table_i];
477 cout << "seq: " << a_motif << endl;
478 cout << "rc: " << rev_comp << endl;
484 Sequence::motif_validate(string a_motif)
490 len = a_motif.length();
491 valid_motif.reserve(len);
493 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
494 // current nonIUPAC symbols are omitted, which is not reported atm
495 for(seq_i = 0; seq_i < len; seq_i++)
497 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
499 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
501 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
503 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
505 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
507 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
509 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
511 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
513 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
515 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
517 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
519 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
521 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
523 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
525 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
529 cout << "valid_motif is: " << valid_motif << endl;
536 Sequence::find_motif(string a_motif)
538 vector<int> motif_match_starts;
542 motif_match_starts.clear();
544 cout << "motif is: " << a_motif << endl;
545 a_motif = motif_validate(a_motif);
546 cout << "motif is: " << a_motif << endl;
548 motif_scan(a_motif, &motif_match_starts);
550 a_motif_rc = rc_motif(a_motif);
551 // make sure not to do search again if it is a palindrome
552 if (a_motif_rc != a_motif)
553 motif_scan(a_motif_rc, &motif_match_starts);
555 return motif_match_starts;
559 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
562 int seq_i, motif_i, motif_len;
564 // faster to loop thru the sequence as a old c string (ie char array)
565 seq_c = (char*)sequence.c_str();
566 motif_len = a_motif.length();
568 //cout << "motif_length: " << motif_len << endl;
569 //cout << "RAAARRRRR\n";
573 cout << "motif: " << a_motif << endl;
575 //cout << "length: " << length << endl;
577 while (seq_i < length)
579 //if ((seq_i > 10885) && (seq_i < 10917))
580 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
581 // this is pretty much a straight translation of Nora's python code
582 // to match iupac letter codes
583 if (a_motif[motif_i] =='N')
585 else if (a_motif[motif_i] == seq_c[seq_i])
587 else if ((a_motif[motif_i] =='M') &&
588 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
590 else if ((a_motif[motif_i] =='R') &&
591 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
593 else if ((a_motif[motif_i] =='W') &&
594 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
596 else if ((a_motif[motif_i] =='S') &&
597 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
599 else if ((a_motif[motif_i] =='Y') &&
600 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
602 else if ((a_motif[motif_i] =='K') &&
603 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
605 else if ((a_motif[motif_i] =='V') &&
606 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
607 (seq_c[seq_i]=='G')))
609 else if ((a_motif[seq_i] =='H') &&
610 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
611 (seq_c[seq_i]=='T')))
613 else if ((a_motif[motif_i] =='D') &&
614 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
615 (seq_c[seq_i]=='T')))
617 else if ((a_motif[motif_i] =='B') &&
618 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
619 (seq_c[seq_i]=='T')))
628 // end Nora stuff, now we see if a match is found this pass
629 if (motif_i == motif_len)
632 motif_match_starts->push_back(seq_i - motif_len + 1);
642 // get annot start index
643 space_split_i = file_data_line.find(" ");
644 annot_value = file_data_line.substr(0,space_split_i);
645 an_annot.name = annot_value;
646 file_data_line = file_data_line.substr(space_split_i+1);
647 // get annot start index
648 space_split_i = file_data_line.find(" ");
649 annot_value = file_data_line.substr(0,space_split_i);
650 an_annot.type = annot_value;