1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
25 #include "sequence.hh"
34 Sequence::load_fasta(string file_path, int seq_num,
35 int start_index, int end_index)
38 string file_data_line;
39 int header_counter = 0;
42 char conversionTable[257];
43 int seq_i, table_i, len;
45 char * seq_tmp; // holds sequence during basic filtering
48 data_file.open(file_path.c_str(), ios::in);
52 err_msg = "Sequence File: " + file_path + " not found";
55 // if file opened okay, read it
58 // search for the header of the fasta sequence we want
59 while ( (!data_file.eof()) && (header_counter < seq_num) )
61 getline(data_file,file_data_line);
62 if (file_data_line.substr(0,1) == ">")
66 header = file_data_line.substr(1);
70 while ( !data_file.eof() && read_seq )
72 getline(data_file,file_data_line);
73 if (file_data_line.substr(0,1) == ">")
75 else sequence_raw += file_data_line;
80 // sequence filtering for upcasing agctn and convert non AGCTN to N
82 len = sequence_raw.length();
83 seq_tmp = (char*)sequence_raw.c_str();
87 // Make a conversion table
89 // everything we don't specify below will become 'N'
90 for(table_i=0; table_i < 256; table_i++)
92 conversionTable[table_i] = 'N';
94 // add end of string character for printing out table for testing purposes
95 conversionTable[256] = '\0';
97 // we want these to map to themselves - ie not to change
98 conversionTable['A'] = 'A';
99 conversionTable['T'] = 'T';
100 conversionTable['G'] = 'G';
101 conversionTable['C'] = 'C';
103 conversionTable['a'] = 'A';
104 conversionTable['t'] = 'T';
105 conversionTable['g'] = 'G';
106 conversionTable['c'] = 'C';
108 // finally, the actual conversion loop
109 for(seq_i = 0; seq_i < len; seq_i++)
111 table_i = (int) seq_tmp[seq_i];
112 sequence += conversionTable[table_i];
116 // Lastly, if subselection of the sequence was specified we keep cut out
117 // and only keep that part
119 // end_index = 0 means no end was specified, so cut to the end
123 // start_index = 0 if no start was specified
124 sequence = sequence.substr(start_index, end_index - start_index);
125 length = sequence.length();
132 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
133 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
137 Sequence::load_annot(string file_path, int start_index, int end_index)
140 string file_data_line;
144 list<annot>::iterator list_i;
149 data_file.open(file_path.c_str(), ios::in);
153 err_msg = "Sequence File: " + file_path + " not found";
156 // if file opened okay, read it
159 getline(data_file,file_data_line);
160 species = file_data_line;
162 // end_index = 0 means no end was specified, so cut to the end
166 //cout << "START: " << start_index << " END: " << end_index << endl;
168 while ( !data_file.eof() )
170 getline(data_file,file_data_line);
171 if (file_data_line != "")
173 // need to get 4 values...almost same code 4 times...
174 // get annot start index
175 space_split_i = file_data_line.find(" ");
176 annot_value = file_data_line.substr(0,space_split_i);
177 an_annot.start = atoi (annot_value.c_str());
178 file_data_line = file_data_line.substr(space_split_i+1);
179 // get annot end index
180 space_split_i = file_data_line.find(" ");
181 annot_value = file_data_line.substr(0,space_split_i);
182 an_annot.end = atoi (annot_value.c_str());
183 file_data_line = file_data_line.substr(space_split_i+1);
185 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
189 space_split_i = file_data_line.find(" ");
190 if (space_split_i == string::npos) // no entries for name & type
192 cout << "seq, annots - no name or type\n";
198 annot_value = file_data_line.substr(0,space_split_i);
199 an_annot.name = annot_value;
200 file_data_line = file_data_line.substr(space_split_i+1);
202 space_split_i = file_data_line.find(" ");
203 if (space_split_i == string::npos) // no entry for type
207 annot_value = file_data_line.substr(0,space_split_i);
208 an_annot.type = annot_value;
213 // add annot to list if it falls within the range of sequence specified
214 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
216 an_annot.start -= start_index;
217 an_annot.end -= start_index;
218 annots.push_back(an_annot);
221 cout << "FAILED!!!!!!\n";
228 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
230 cout << (*list_i).start << "," << (*list_i).end << "\t";
231 cout << (*list_i).name << "\t" << (*list_i).type << endl;
242 return sequence.length();
254 Sequence::subseq(int start, int end)
256 return sequence.substr(start, end);
263 return sequence.c_str();
270 char conversionTable[257];
271 int seq_i, table_i, len;
273 len = sequence.length();
274 rev_comp.reserve(len);
275 // make a conversion table
276 // init all parts of conversion table to '~' character
277 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
278 // and may the fleas of 1000 camels infest the genitals of any biologist (and
279 // seven generations of their progeny) who decides to make it mean
280 // something special!!!
281 // PS - double the curse for any smartass non-biologist who tries it as well
282 for(table_i=0; table_i < 256; table_i++)
284 conversionTable[table_i] = '~';
286 // add end of string character for printing out table for testing purposes
287 conversionTable[256] = '\0';
289 // add in the characters for the bases we want to convert
290 conversionTable['A'] = 'T';
291 conversionTable['T'] = 'A';
292 conversionTable['G'] = 'C';
293 conversionTable['C'] = 'G';
294 conversionTable['N'] = 'N';
296 // finally, the actual conversion loop
297 for(seq_i = len - 1; seq_i >= 0; seq_i--)
299 table_i = (int) sequence[seq_i];
300 rev_comp += conversionTable[table_i];
321 Sequence::set_seq(string a_seq)
324 length = sequence.length();
348 Sequence::save(fstream &save_file)
349 //string save_file_path)
352 list<annot>::iterator annots_i;
355 // not sure why, or if i'm doing something wrong, but can't seem to pass
356 // file pointers down to this method from the mussa control class
357 // so each call to save a sequence appends to the file started by mussa_class
358 //save_file.open(save_file_path.c_str(), ios::app);
360 save_file << "<Sequence>" << endl;
361 save_file << sequence << endl;
362 save_file << "</Sequence>" << endl;
364 save_file << "<Annotations>" << endl;
365 save_file << species << endl;
366 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
368 save_file << annots_i->start << " " << annots_i->end << " " ;
369 save_file << annots_i->name << " " << annots_i->type << endl;
371 save_file << "</Annotations>" << endl;
376 Sequence::load_museq(string load_file_path, int seq_num)
379 string file_data_line;
386 load_file.open(load_file_path.c_str(), ios::in);
389 // search for the seq_num-th sequence
390 while ( (!load_file.eof()) && (seq_counter < seq_num) )
392 getline(load_file,file_data_line);
393 if (file_data_line == "<Sequence>")
396 getline(load_file, file_data_line);
397 sequence = file_data_line;
398 length = sequence.length();
399 getline(load_file, file_data_line);
400 getline(load_file, file_data_line);
401 if (file_data_line == "<Annotations>")
403 getline(load_file, file_data_line);
404 species = file_data_line;
405 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
407 getline(load_file,file_data_line);
408 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
410 // need to get 4 values...almost same code 4 times...
411 // get annot start index
412 space_split_i = file_data_line.find(" ");
413 annot_value = file_data_line.substr(0,space_split_i);
414 an_annot.start = atoi (annot_value.c_str());
415 file_data_line = file_data_line.substr(space_split_i+1);
416 // get annot end index
417 space_split_i = file_data_line.find(" ");
418 annot_value = file_data_line.substr(0,space_split_i);
419 an_annot.end = atoi (annot_value.c_str());
421 if (space_split_i == string::npos) // no entry for type or name
423 cout << "seq, annots - no type or name\n";
427 else // else get annot type
429 file_data_line = file_data_line.substr(space_split_i+1);
430 space_split_i = file_data_line.find(" ");
431 annot_value = file_data_line.substr(0,space_split_i);
432 an_annot.type = annot_value;
433 if (space_split_i == string::npos) // no entry for name
435 cout << "seq, annots - no name\n";
438 else // get annot name
440 file_data_line = file_data_line.substr(space_split_i+1);
441 space_split_i = file_data_line.find(" ");
442 annot_value = file_data_line.substr(0,space_split_i);
443 an_annot.type = annot_value;
446 annots.push_back(an_annot); // don't forget to actually add the annot
448 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
449 << "-->" << an_annot.type << "::" << an_annot.name << endl;
457 Sequence::rc_motif(string a_motif)
460 char conversionTable[257];
461 int seq_i, table_i, len;
463 len = a_motif.length();
464 rev_comp.reserve(len);
466 for(table_i=0; table_i < 256; table_i++)
468 conversionTable[table_i] = '~';
470 // add end of string character for printing out table for testing purposes
471 conversionTable[256] = '\0';
473 // add in the characters for the bases we want to convert (IUPAC)
474 conversionTable['A'] = 'T';
475 conversionTable['T'] = 'A';
476 conversionTable['G'] = 'C';
477 conversionTable['C'] = 'G';
478 conversionTable['N'] = 'N';
479 conversionTable['M'] = 'K';
480 conversionTable['R'] = 'Y';
481 conversionTable['W'] = 'W';
482 conversionTable['S'] = 'S';
483 conversionTable['Y'] = 'R';
484 conversionTable['K'] = 'M';
485 conversionTable['V'] = 'B';
486 conversionTable['H'] = 'D';
487 conversionTable['D'] = 'H';
488 conversionTable['B'] = 'V';
490 // finally, the actual conversion loop
491 for(seq_i = len - 1; seq_i >= 0; seq_i--)
493 //cout << "** i = " << seq_i << " bp = " <<
494 table_i = (int) a_motif[seq_i];
495 rev_comp += conversionTable[table_i];
498 cout << "seq: " << a_motif << endl;
499 cout << "rc: " << rev_comp << endl;
505 Sequence::motif_validate(string a_motif)
511 len = a_motif.length();
512 valid_motif.reserve(len);
514 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
515 // current nonIUPAC symbols are omitted, which is not reported atm
516 for(seq_i = 0; seq_i < len; seq_i++)
518 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
520 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
522 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
524 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
526 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
528 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
530 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
532 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
534 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
536 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
538 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
540 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
542 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
544 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
546 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
550 cout << "valid_motif is: " << valid_motif << endl;
557 Sequence::find_motif(string a_motif)
559 vector<int> motif_match_starts;
563 motif_match_starts.clear();
565 cout << "motif is: " << a_motif << endl;
566 a_motif = motif_validate(a_motif);
567 //cout << "motif is: " << a_motif << endl;
572 cout << "Sequence: none blank motif\n";
573 motif_scan(a_motif, &motif_match_starts);
575 a_motif_rc = rc_motif(a_motif);
576 // make sure not to do search again if it is a palindrome
577 if (a_motif_rc != a_motif)
578 motif_scan(a_motif_rc, &motif_match_starts);
580 return motif_match_starts;
584 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
587 int seq_i, motif_i, motif_len;
589 // faster to loop thru the sequence as a old c string (ie char array)
590 seq_c = (char*)sequence.c_str();
591 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
592 motif_len = a_motif.length();
594 //cout << "motif_length: " << motif_len << endl;
595 //cout << "RAAARRRRR\n";
599 //cout << "motif: " << a_motif << endl;
601 //cout << "Sequence: motif, length= " << length << endl;
603 while (seq_i < length)
605 cout << seq_c[seq_i];
606 //if ((seq_i > 10885) && (seq_i < 10917))
607 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
608 // this is pretty much a straight translation of Nora's python code
609 // to match iupac letter codes
610 if (a_motif[motif_i] =='N')
612 else if (a_motif[motif_i] == seq_c[seq_i])
614 else if ((a_motif[motif_i] =='M') &&
615 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
617 else if ((a_motif[motif_i] =='R') &&
618 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
620 else if ((a_motif[motif_i] =='W') &&
621 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
623 else if ((a_motif[motif_i] =='S') &&
624 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
626 else if ((a_motif[motif_i] =='Y') &&
627 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
629 else if ((a_motif[motif_i] =='K') &&
630 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
632 else if ((a_motif[motif_i] =='V') &&
633 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
634 (seq_c[seq_i]=='G')))
636 else if ((a_motif[seq_i] =='H') &&
637 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
638 (seq_c[seq_i]=='T')))
640 else if ((a_motif[motif_i] =='D') &&
641 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
642 (seq_c[seq_i]=='T')))
644 else if ((a_motif[motif_i] =='B') &&
645 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
646 (seq_c[seq_i]=='T')))
655 // end Nora stuff, now we see if a match is found this pass
656 if (motif_i == motif_len)
659 motif_match_starts->push_back(seq_i - motif_len + 1);
669 // get annot start index
670 space_split_i = file_data_line.find(" ");
671 annot_value = file_data_line.substr(0,space_split_i);
672 an_annot.name = annot_value;
673 file_data_line = file_data_line.substr(space_split_i+1);
674 // get annot start index
675 space_split_i = file_data_line.find(" ");
676 annot_value = file_data_line.substr(0,space_split_i);
677 an_annot.type = annot_value;