1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
25 #include "sequence.hh"
36 Sequence::load_fasta(string file_path, int seq_num,
37 int start_index, int end_index)
40 string file_data_line;
41 int header_counter = 0;
44 char conversionTable[257];
45 int seq_i, table_i, len;
47 char * seq_tmp; // holds sequence during basic filtering
50 data_file.open(file_path.c_str(), ios::in);
54 err_msg = "Sequence File: " + file_path + " not found";
57 // if file opened okay, read it
60 // search for the header of the fasta sequence we want
61 while ( (!data_file.eof()) && (header_counter < seq_num) )
63 getline(data_file,file_data_line);
64 if (file_data_line.substr(0,1) == ">")
68 header = file_data_line.substr(1);
72 while ( !data_file.eof() && read_seq )
74 getline(data_file,file_data_line);
75 if (file_data_line.substr(0,1) == ">")
77 else sequence_raw += file_data_line;
82 // sequence filtering for upcasing agctn and convert non AGCTN to N
84 len = sequence_raw.length();
85 seq_tmp = (char*)sequence_raw.c_str();
89 // Make a conversion table
91 // everything we don't specify below will become 'N'
92 for(table_i=0; table_i < 256; table_i++)
94 conversionTable[table_i] = 'N';
96 // add end of string character for printing out table for testing purposes
97 conversionTable[256] = '\0';
99 // we want these to map to themselves - ie not to change
100 conversionTable['A'] = 'A';
101 conversionTable['T'] = 'T';
102 conversionTable['G'] = 'G';
103 conversionTable['C'] = 'C';
105 conversionTable['a'] = 'A';
106 conversionTable['t'] = 'T';
107 conversionTable['g'] = 'G';
108 conversionTable['c'] = 'C';
110 // finally, the actual conversion loop
111 for(seq_i = 0; seq_i < len; seq_i++)
113 table_i = (int) seq_tmp[seq_i];
114 sequence += conversionTable[table_i];
118 // Lastly, if subselection of the sequence was specified we keep cut out
119 // and only keep that part
121 // end_index = 0 means no end was specified, so cut to the end
125 // start_index = 0 if no start was specified
126 sequence = sequence.substr(start_index, end_index - start_index);
133 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
134 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
138 Sequence::load_annot(string file_path, int start_index, int end_index)
141 string file_data_line;
145 list<annot>::iterator list_i;
150 data_file.open(file_path.c_str(), ios::in);
154 err_msg = "Sequence File: " + file_path + " not found";
157 // if file opened okay, read it
160 getline(data_file,file_data_line);
161 species = file_data_line;
163 // end_index = 0 means no end was specified, so cut to the end
165 end_index = sequence.length();
167 //cout << "START: " << start_index << " END: " << end_index << endl;
169 while ( !data_file.eof() )
171 getline(data_file,file_data_line);
172 if (file_data_line != "")
174 // need to get 4 values...almost same code 4 times...
175 // get annot start index
176 space_split_i = file_data_line.find(" ");
177 annot_value = file_data_line.substr(0,space_split_i);
178 an_annot.start = atoi (annot_value.c_str());
179 file_data_line = file_data_line.substr(space_split_i+1);
180 // get annot end index
181 space_split_i = file_data_line.find(" ");
182 annot_value = file_data_line.substr(0,space_split_i);
183 an_annot.end = atoi (annot_value.c_str());
184 file_data_line = file_data_line.substr(space_split_i+1);
186 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
190 space_split_i = file_data_line.find(" ");
191 if (space_split_i == string::npos) // no entries for name & type
193 cout << "seq, annots - no name or type\n";
199 annot_value = file_data_line.substr(0,space_split_i);
200 an_annot.name = annot_value;
201 file_data_line = file_data_line.substr(space_split_i+1);
203 space_split_i = file_data_line.find(" ");
204 if (space_split_i == string::npos) // no entry for type
208 annot_value = file_data_line.substr(0,space_split_i);
209 an_annot.type = annot_value;
214 // add annot to list if it falls within the range of sequence specified
215 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
217 an_annot.start -= start_index;
218 an_annot.end -= start_index;
219 annots.push_back(an_annot);
222 cout << "FAILED!!!!!!\n";
229 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
231 cout << (*list_i).start << "," << (*list_i).end << "\t";
232 cout << (*list_i).name << "\t" << (*list_i).type << endl;
243 return sequence.length();
255 Sequence::subseq(int start, int end)
257 return sequence.substr(start, end);
264 return sequence.c_str();
271 char conversionTable[257];
272 int seq_i, table_i, len;
274 len = sequence.length();
275 rev_comp.reserve(len);
276 // make a conversion table
277 // init all parts of conversion table to '~' character
278 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
279 // and may the fleas of 1000 camels infest the genitals of any biologist (and
280 // seven generations of their progeny) who decides to make it mean
281 // something special!!!
282 // PS - double the curse for any smartass non-biologist who tries it as well
283 for(table_i=0; table_i < 256; table_i++)
285 conversionTable[table_i] = '~';
287 // add end of string character for printing out table for testing purposes
288 conversionTable[256] = '\0';
290 // add in the characters for the bases we want to convert
291 conversionTable['A'] = 'T';
292 conversionTable['T'] = 'A';
293 conversionTable['G'] = 'C';
294 conversionTable['C'] = 'G';
295 conversionTable['N'] = 'N';
297 // finally, the actual conversion loop
298 for(seq_i = len - 1; seq_i >= 0; seq_i--)
300 table_i = (int) sequence[seq_i];
301 rev_comp += conversionTable[table_i];
322 Sequence::set_seq(string a_seq)
347 Sequence::save(fstream &save_file)
348 //string save_file_path)
351 list<annot>::iterator annots_i;
354 // not sure why, or if i'm doing something wrong, but can't seem to pass
355 // file pointers down to this method from the mussa control class
356 // so each call to save a sequence appends to the file started by mussa_class
357 //save_file.open(save_file_path.c_str(), ios::app);
359 save_file << "<Sequence>" << endl;
360 save_file << sequence << endl;
361 save_file << "</Sequence>" << endl;
363 save_file << "<Annotations>" << endl;
364 save_file << species << endl;
365 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
367 save_file << annots_i->start << " " << annots_i->end << " " ;
368 save_file << annots_i->name << " " << annots_i->type << endl;
370 save_file << "</Annotations>" << endl;
375 Sequence::load_museq(string load_file_path, int seq_num)
378 string file_data_line;
385 load_file.open(load_file_path.c_str(), ios::in);
388 // search for the seq_num-th sequence
389 while ( (!load_file.eof()) && (seq_counter < seq_num) )
391 getline(load_file,file_data_line);
392 if (file_data_line == "<Sequence>")
395 getline(load_file, file_data_line);
396 sequence = file_data_line;
397 getline(load_file, file_data_line);
398 getline(load_file, file_data_line);
399 if (file_data_line == "<Annotations>")
401 getline(load_file, file_data_line);
402 species = file_data_line;
403 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
405 getline(load_file,file_data_line);
406 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
408 // need to get 4 values...almost same code 4 times...
409 // get annot start index
410 space_split_i = file_data_line.find(" ");
411 annot_value = file_data_line.substr(0,space_split_i);
412 an_annot.start = atoi (annot_value.c_str());
413 file_data_line = file_data_line.substr(space_split_i+1);
414 // get annot end index
415 space_split_i = file_data_line.find(" ");
416 annot_value = file_data_line.substr(0,space_split_i);
417 an_annot.end = atoi (annot_value.c_str());
419 if (space_split_i == string::npos) // no entry for type or name
421 cout << "seq, annots - no type or name\n";
425 else // else get annot type
427 file_data_line = file_data_line.substr(space_split_i+1);
428 space_split_i = file_data_line.find(" ");
429 annot_value = file_data_line.substr(0,space_split_i);
430 an_annot.type = annot_value;
431 if (space_split_i == string::npos) // no entry for name
433 cout << "seq, annots - no name\n";
436 else // get annot name
438 file_data_line = file_data_line.substr(space_split_i+1);
439 space_split_i = file_data_line.find(" ");
440 annot_value = file_data_line.substr(0,space_split_i);
441 an_annot.type = annot_value;
444 annots.push_back(an_annot); // don't forget to actually add the annot
446 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
447 << "-->" << an_annot.type << "::" << an_annot.name << endl;
455 Sequence::rc_motif(string a_motif)
458 char conversionTable[257];
459 int seq_i, table_i, len;
461 len = a_motif.length();
462 rev_comp.reserve(len);
464 for(table_i=0; table_i < 256; table_i++)
466 conversionTable[table_i] = '~';
468 // add end of string character for printing out table for testing purposes
469 conversionTable[256] = '\0';
471 // add in the characters for the bases we want to convert (IUPAC)
472 conversionTable['A'] = 'T';
473 conversionTable['T'] = 'A';
474 conversionTable['G'] = 'C';
475 conversionTable['C'] = 'G';
476 conversionTable['N'] = 'N';
477 conversionTable['M'] = 'K';
478 conversionTable['R'] = 'Y';
479 conversionTable['W'] = 'W';
480 conversionTable['S'] = 'S';
481 conversionTable['Y'] = 'R';
482 conversionTable['K'] = 'M';
483 conversionTable['V'] = 'B';
484 conversionTable['H'] = 'D';
485 conversionTable['D'] = 'H';
486 conversionTable['B'] = 'V';
488 // finally, the actual conversion loop
489 for(seq_i = len - 1; seq_i >= 0; seq_i--)
491 //cout << "** i = " << seq_i << " bp = " <<
492 table_i = (int) a_motif[seq_i];
493 rev_comp += conversionTable[table_i];
496 cout << "seq: " << a_motif << endl;
497 cout << "rc: " << rev_comp << endl;
503 Sequence::motif_validate(string a_motif)
508 len = a_motif.length();
509 valid_motif.reserve(len);
511 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
512 // current nonIUPAC symbols are omitted, which is not reported atm
513 for(seq_i = 0; seq_i < len; seq_i++)
515 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
517 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
519 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
521 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
523 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
525 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
527 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
529 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
531 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
533 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
535 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
537 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
539 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
541 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
543 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
547 cout << "valid_motif is: " << valid_motif << endl;
554 Sequence::find_motif(string a_motif)
556 vector<int> motif_match_starts;
560 motif_match_starts.clear();
562 cout << "motif is: " << a_motif << endl;
563 a_motif = motif_validate(a_motif);
564 //cout << "motif is: " << a_motif << endl;
569 cout << "Sequence: none blank motif\n";
570 motif_scan(a_motif, &motif_match_starts);
572 a_motif_rc = rc_motif(a_motif);
573 // make sure not to do search again if it is a palindrome
574 if (a_motif_rc != a_motif)
575 motif_scan(a_motif_rc, &motif_match_starts);
577 return motif_match_starts;
581 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
584 int seq_i, motif_i, motif_len;
586 // faster to loop thru the sequence as a old c string (ie char array)
587 seq_c = (char*)sequence.c_str();
588 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
589 motif_len = a_motif.length();
591 //cout << "motif_length: " << motif_len << endl;
592 //cout << "RAAARRRRR\n";
596 //cout << "motif: " << a_motif << endl;
598 //cout << "Sequence: motif, length= " << length << endl;
600 while (seq_i < sequence.length())
602 cout << seq_c[seq_i];
603 //if ((seq_i > 10885) && (seq_i < 10917))
604 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
605 // this is pretty much a straight translation of Nora's python code
606 // to match iupac letter codes
607 if (a_motif[motif_i] =='N')
609 else if (a_motif[motif_i] == seq_c[seq_i])
611 else if ((a_motif[motif_i] =='M') &&
612 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
614 else if ((a_motif[motif_i] =='R') &&
615 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
617 else if ((a_motif[motif_i] =='W') &&
618 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
620 else if ((a_motif[motif_i] =='S') &&
621 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
623 else if ((a_motif[motif_i] =='Y') &&
624 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
626 else if ((a_motif[motif_i] =='K') &&
627 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
629 else if ((a_motif[motif_i] =='V') &&
630 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
631 (seq_c[seq_i]=='G')))
633 else if ((a_motif[seq_i] =='H') &&
634 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
635 (seq_c[seq_i]=='T')))
637 else if ((a_motif[motif_i] =='D') &&
638 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
639 (seq_c[seq_i]=='T')))
641 else if ((a_motif[motif_i] =='B') &&
642 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
643 (seq_c[seq_i]=='T')))
652 // end Nora stuff, now we see if a match is found this pass
653 if (motif_i == motif_len)
656 motif_match_starts->push_back(seq_i - motif_len + 1);
666 // get annot start index
667 space_split_i = file_data_line.find(" ");
668 annot_value = file_data_line.substr(0,space_split_i);
669 an_annot.name = annot_value;
670 file_data_line = file_data_line.substr(space_split_i+1);
671 // get annot start index
672 space_split_i = file_data_line.find(" ");
673 annot_value = file_data_line.substr(0,space_split_i);
674 an_annot.type = annot_value;