1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
25 #include "sequence.hh"
34 //! load a fasta file into a sequence
36 * \param file_path the location of the fasta file in the filesystem
37 * \param seq_num which sequence in the file to load
38 * \param start_index starting position in the fasta sequence, 0 for beginning
39 * \param end_index ending position in the fasta sequence, 0 for end
40 * \return error message, empty string if no error. (gag!)
43 Sequence::load_fasta(string file_path, int seq_num,
44 int start_index, int end_index)
47 string file_data_line;
48 int header_counter = 0;
51 char conversionTable[257];
52 int seq_i, table_i, len;
54 char * seq_tmp; // holds sequence during basic filtering
57 data_file.open(file_path.c_str(), ios::in);
61 err_msg = "Sequence File: " + file_path + " not found";
64 // if file opened okay, read it
67 // search for the header of the fasta sequence we want
68 while ( (!data_file.eof()) && (header_counter < seq_num) )
70 getline(data_file,file_data_line);
71 if (file_data_line.substr(0,1) == ">")
75 header = file_data_line.substr(1);
79 while ( !data_file.eof() && read_seq )
81 getline(data_file,file_data_line);
82 if (file_data_line.substr(0,1) == ">")
84 else sequence_raw += file_data_line;
89 // sequence filtering for upcasing agctn and convert non AGCTN to N
91 len = sequence_raw.length();
92 seq_tmp = (char*)sequence_raw.c_str();
96 // Make a conversion table
98 // everything we don't specify below will become 'N'
99 for(table_i=0; table_i < 256; table_i++)
101 conversionTable[table_i] = 'N';
103 // add end of string character for printing out table for testing purposes
104 conversionTable[256] = '\0';
106 // we want these to map to themselves - ie not to change
107 conversionTable['A'] = 'A';
108 conversionTable['T'] = 'T';
109 conversionTable['G'] = 'G';
110 conversionTable['C'] = 'C';
112 conversionTable['a'] = 'A';
113 conversionTable['t'] = 'T';
114 conversionTable['g'] = 'G';
115 conversionTable['c'] = 'C';
117 // finally, the actual conversion loop
118 for(seq_i = 0; seq_i < len; seq_i++)
120 table_i = (int) seq_tmp[seq_i];
121 sequence += conversionTable[table_i];
125 // Lastly, if subselection of the sequence was specified we keep cut out
126 // and only keep that part
128 // end_index = 0 means no end was specified, so cut to the end
132 // start_index = 0 if no start was specified
133 sequence = sequence.substr(start_index, end_index - start_index);
140 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
141 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
145 Sequence::load_annot(string file_path, int start_index, int end_index)
148 string file_data_line;
152 list<annot>::iterator list_i;
157 data_file.open(file_path.c_str(), ios::in);
161 err_msg = "Sequence File: " + file_path + " not found";
164 // if file opened okay, read it
167 getline(data_file,file_data_line);
168 species = file_data_line;
170 // end_index = 0 means no end was specified, so cut to the end
172 end_index = sequence.length();
174 //cout << "START: " << start_index << " END: " << end_index << endl;
176 while ( !data_file.eof() )
178 getline(data_file,file_data_line);
179 if (file_data_line != "")
181 // need to get 4 values...almost same code 4 times...
182 // get annot start index
183 space_split_i = file_data_line.find(" ");
184 annot_value = file_data_line.substr(0,space_split_i);
185 an_annot.start = atoi (annot_value.c_str());
186 file_data_line = file_data_line.substr(space_split_i+1);
187 // get annot end index
188 space_split_i = file_data_line.find(" ");
189 annot_value = file_data_line.substr(0,space_split_i);
190 an_annot.end = atoi (annot_value.c_str());
191 file_data_line = file_data_line.substr(space_split_i+1);
193 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
197 space_split_i = file_data_line.find(" ");
198 if (space_split_i == string::npos) // no entries for name & type
200 cout << "seq, annots - no name or type\n";
206 annot_value = file_data_line.substr(0,space_split_i);
207 an_annot.name = annot_value;
208 file_data_line = file_data_line.substr(space_split_i+1);
210 space_split_i = file_data_line.find(" ");
211 if (space_split_i == string::npos) // no entry for type
215 annot_value = file_data_line.substr(0,space_split_i);
216 an_annot.type = annot_value;
221 // add annot to list if it falls within the range of sequence specified
222 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
224 an_annot.start -= start_index;
225 an_annot.end -= start_index;
226 annots.push_back(an_annot);
229 cout << "FAILED!!!!!!\n";
236 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
238 cout << (*list_i).start << "," << (*list_i).end << "\t";
239 cout << (*list_i).name << "\t" << (*list_i).type << endl;
250 return sequence.length();
262 Sequence::subseq(int start, int end)
264 return sequence.substr(start, end);
271 return sequence.c_str();
278 char conversionTable[257];
279 int seq_i, table_i, len;
281 len = sequence.length();
282 rev_comp.reserve(len);
283 // make a conversion table
284 // init all parts of conversion table to '~' character
285 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
286 // and may the fleas of 1000 camels infest the genitals of any biologist (and
287 // seven generations of their progeny) who decides to make it mean
288 // something special!!!
289 // PS - double the curse for any smartass non-biologist who tries it as well
290 for(table_i=0; table_i < 256; table_i++)
292 conversionTable[table_i] = '~';
294 // add end of string character for printing out table for testing purposes
295 conversionTable[256] = '\0';
297 // add in the characters for the bases we want to convert
298 conversionTable['A'] = 'T';
299 conversionTable['T'] = 'A';
300 conversionTable['G'] = 'C';
301 conversionTable['C'] = 'G';
302 conversionTable['N'] = 'N';
304 // finally, the actual conversion loop
305 for(seq_i = len - 1; seq_i >= 0; seq_i--)
307 table_i = (int) sequence[seq_i];
308 rev_comp += conversionTable[table_i];
329 Sequence::set_seq(string a_seq)
354 Sequence::save(fstream &save_file)
355 //string save_file_path)
358 list<annot>::iterator annots_i;
361 // not sure why, or if i'm doing something wrong, but can't seem to pass
362 // file pointers down to this method from the mussa control class
363 // so each call to save a sequence appends to the file started by mussa_class
364 //save_file.open(save_file_path.c_str(), ios::app);
366 save_file << "<Sequence>" << endl;
367 save_file << sequence << endl;
368 save_file << "</Sequence>" << endl;
370 save_file << "<Annotations>" << endl;
371 save_file << species << endl;
372 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
374 save_file << annots_i->start << " " << annots_i->end << " " ;
375 save_file << annots_i->name << " " << annots_i->type << endl;
377 save_file << "</Annotations>" << endl;
382 Sequence::load_museq(string load_file_path, int seq_num)
385 string file_data_line;
392 load_file.open(load_file_path.c_str(), ios::in);
395 // search for the seq_num-th sequence
396 while ( (!load_file.eof()) && (seq_counter < seq_num) )
398 getline(load_file,file_data_line);
399 if (file_data_line == "<Sequence>")
402 getline(load_file, file_data_line);
403 sequence = file_data_line;
404 getline(load_file, file_data_line);
405 getline(load_file, file_data_line);
406 if (file_data_line == "<Annotations>")
408 getline(load_file, file_data_line);
409 species = file_data_line;
410 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
412 getline(load_file,file_data_line);
413 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
415 // need to get 4 values...almost same code 4 times...
416 // get annot start index
417 space_split_i = file_data_line.find(" ");
418 annot_value = file_data_line.substr(0,space_split_i);
419 an_annot.start = atoi (annot_value.c_str());
420 file_data_line = file_data_line.substr(space_split_i+1);
421 // get annot end index
422 space_split_i = file_data_line.find(" ");
423 annot_value = file_data_line.substr(0,space_split_i);
424 an_annot.end = atoi (annot_value.c_str());
426 if (space_split_i == string::npos) // no entry for type or name
428 cout << "seq, annots - no type or name\n";
432 else // else get annot type
434 file_data_line = file_data_line.substr(space_split_i+1);
435 space_split_i = file_data_line.find(" ");
436 annot_value = file_data_line.substr(0,space_split_i);
437 an_annot.type = annot_value;
438 if (space_split_i == string::npos) // no entry for name
440 cout << "seq, annots - no name\n";
443 else // get annot name
445 file_data_line = file_data_line.substr(space_split_i+1);
446 space_split_i = file_data_line.find(" ");
447 annot_value = file_data_line.substr(0,space_split_i);
448 an_annot.type = annot_value;
451 annots.push_back(an_annot); // don't forget to actually add the annot
453 cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
454 << "-->" << an_annot.type << "::" << an_annot.name << endl;
462 Sequence::rc_motif(string a_motif)
465 char conversionTable[257];
466 int seq_i, table_i, len;
468 len = a_motif.length();
469 rev_comp.reserve(len);
471 for(table_i=0; table_i < 256; table_i++)
473 conversionTable[table_i] = '~';
475 // add end of string character for printing out table for testing purposes
476 conversionTable[256] = '\0';
478 // add in the characters for the bases we want to convert (IUPAC)
479 conversionTable['A'] = 'T';
480 conversionTable['T'] = 'A';
481 conversionTable['G'] = 'C';
482 conversionTable['C'] = 'G';
483 conversionTable['N'] = 'N';
484 conversionTable['M'] = 'K';
485 conversionTable['R'] = 'Y';
486 conversionTable['W'] = 'W';
487 conversionTable['S'] = 'S';
488 conversionTable['Y'] = 'R';
489 conversionTable['K'] = 'M';
490 conversionTable['V'] = 'B';
491 conversionTable['H'] = 'D';
492 conversionTable['D'] = 'H';
493 conversionTable['B'] = 'V';
495 // finally, the actual conversion loop
496 for(seq_i = len - 1; seq_i >= 0; seq_i--)
498 //cout << "** i = " << seq_i << " bp = " <<
499 table_i = (int) a_motif[seq_i];
500 rev_comp += conversionTable[table_i];
503 cout << "seq: " << a_motif << endl;
504 cout << "rc: " << rev_comp << endl;
510 Sequence::motif_validate(string a_motif)
515 len = a_motif.length();
516 valid_motif.reserve(len);
518 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
519 // current nonIUPAC symbols are omitted, which is not reported atm
520 for(seq_i = 0; seq_i < len; seq_i++)
522 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
524 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
526 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
528 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
530 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
532 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
534 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
536 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
538 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
540 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
542 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
544 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
546 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
548 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
550 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
554 cout << "valid_motif is: " << valid_motif << endl;
561 Sequence::find_motif(string a_motif)
563 vector<int> motif_match_starts;
567 motif_match_starts.clear();
569 cout << "motif is: " << a_motif << endl;
570 a_motif = motif_validate(a_motif);
571 //cout << "motif is: " << a_motif << endl;
576 cout << "Sequence: none blank motif\n";
577 motif_scan(a_motif, &motif_match_starts);
579 a_motif_rc = rc_motif(a_motif);
580 // make sure not to do search again if it is a palindrome
581 if (a_motif_rc != a_motif)
582 motif_scan(a_motif_rc, &motif_match_starts);
584 return motif_match_starts;
588 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
591 int seq_i, motif_i, motif_len;
593 // faster to loop thru the sequence as a old c string (ie char array)
594 seq_c = (char*)sequence.c_str();
595 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
596 motif_len = a_motif.length();
598 //cout << "motif_length: " << motif_len << endl;
599 //cout << "RAAARRRRR\n";
603 //cout << "motif: " << a_motif << endl;
605 //cout << "Sequence: motif, length= " << length << endl;
607 while (seq_i < sequence.length())
609 cout << seq_c[seq_i];
610 //if ((seq_i > 10885) && (seq_i < 10917))
611 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
612 // this is pretty much a straight translation of Nora's python code
613 // to match iupac letter codes
614 if (a_motif[motif_i] =='N')
616 else if (a_motif[motif_i] == seq_c[seq_i])
618 else if ((a_motif[motif_i] =='M') &&
619 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
621 else if ((a_motif[motif_i] =='R') &&
622 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
624 else if ((a_motif[motif_i] =='W') &&
625 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
627 else if ((a_motif[motif_i] =='S') &&
628 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
630 else if ((a_motif[motif_i] =='Y') &&
631 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
633 else if ((a_motif[motif_i] =='K') &&
634 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
636 else if ((a_motif[motif_i] =='V') &&
637 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
638 (seq_c[seq_i]=='G')))
640 else if ((a_motif[seq_i] =='H') &&
641 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
642 (seq_c[seq_i]=='T')))
644 else if ((a_motif[motif_i] =='D') &&
645 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
646 (seq_c[seq_i]=='T')))
648 else if ((a_motif[motif_i] =='B') &&
649 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
650 (seq_c[seq_i]=='T')))
659 // end Nora stuff, now we see if a match is found this pass
660 if (motif_i == motif_len)
663 motif_match_starts->push_back(seq_i - motif_len + 1);
673 // get annot start index
674 space_split_i = file_data_line.find(" ");
675 annot_value = file_data_line.substr(0,space_split_i);
676 an_annot.name = annot_value;
677 file_data_line = file_data_line.substr(space_split_i+1);
678 // get annot start index
679 space_split_i = file_data_line.find(" ");
680 annot_value = file_data_line.substr(0,space_split_i);
681 an_annot.type = annot_value;