1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
25 #include "alg/sequence.hpp"
26 #include "mussa_exceptions.hpp"
41 annot::annot(int start, int end, std::string type, std::string name)
49 motif::motif(int start, std::string motif)
50 : annot(start, start+motif.size(), "motif", motif),
64 Sequence::Sequence(string seq)
66 set_filtered_sequence(seq);
69 Sequence &Sequence::operator=(const Sequence& s)
72 sequence = s.sequence;
80 Sequence &Sequence::operator=(const std::string& s)
82 set_filtered_sequence(s);
86 //! load a fasta file into a sequence
88 * \param file_path the location of the fasta file in the filesystem
89 * \param seq_num which sequence in the file to load
90 * \param start_index starting position in the fasta sequence, 0 for beginning
91 * \param end_index ending position in the fasta sequence, 0 for end
92 * \return error message, empty string if no error. (gag!)
95 Sequence::load_fasta(string file_path, int seq_num,
96 int start_index, int end_index)
99 string file_data_line;
100 int header_counter = 0;
101 bool read_seq = true;
104 string seq_tmp; // holds sequence during basic filtering
106 data_file.open(file_path.c_str(), ios::in);
110 throw mussa_load_error("Sequence File: " + file_path + " not found");
112 // if file opened okay, read it
115 // search for the header of the fasta sequence we want
116 while ( (!data_file.eof()) && (header_counter < seq_num) )
118 getline(data_file,file_data_line);
119 if (file_data_line.substr(0,1) == ">")
123 header = file_data_line.substr(1);
127 while ( !data_file.eof() && read_seq )
129 getline(data_file,file_data_line);
130 if (file_data_line.substr(0,1) == ">")
132 else sequence_raw += file_data_line;
137 // Lastly, if subselection of the sequence was specified we keep cut out
138 // and only keep that part
139 // end_index = 0 means no end was specified, so cut to the end
141 end_index = sequence_raw.size();
143 // sequence filtering for upcasing agctn and convert non AGCTN to N
144 set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
148 void Sequence::set_filtered_sequence(const string &old_seq,
149 string::size_type start,
150 string::size_type count)
152 char conversionTable[257];
155 count = old_seq.size() - start;
157 sequence.reserve(count);
159 // Make a conversion table
161 // everything we don't specify below will become 'N'
162 for(int table_i=0; table_i < 256; table_i++)
164 conversionTable[table_i] = 'N';
166 // add end of string character for printing out table for testing purposes
167 conversionTable[256] = '\0';
169 // we want these to map to themselves - ie not to change
170 conversionTable[(int)'A'] = 'A';
171 conversionTable[(int)'T'] = 'T';
172 conversionTable[(int)'G'] = 'G';
173 conversionTable[(int)'C'] = 'C';
175 conversionTable[(int)'a'] = 'A';
176 conversionTable[(int)'t'] = 'T';
177 conversionTable[(int)'g'] = 'G';
178 conversionTable[(int)'c'] = 'C';
180 // finally, the actual conversion loop
181 for(string::size_type seq_index = 0; seq_index < count; seq_index++)
183 sequence += conversionTable[ (int)old_seq[seq_index+start]];
187 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
188 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
192 Sequence::load_annot(string file_path, int start_index, int end_index)
195 string file_data_line;
197 string::size_type space_split_i;
199 list<annot>::iterator list_i;
204 data_file.open(file_path.c_str(), ios::in);
208 throw mussa_load_error("Sequence File: " + file_path + " not found");
210 // if file opened okay, read it
213 getline(data_file,file_data_line);
214 species = file_data_line;
216 // end_index = 0 means no end was specified, so cut to the end
218 end_index = sequence.length();
220 //cout << "START: " << start_index << " END: " << end_index << endl;
222 while ( !data_file.eof() )
224 getline(data_file,file_data_line);
225 if (file_data_line != "")
227 // need to get 4 values...almost same code 4 times...
228 // get annot start index
229 space_split_i = file_data_line.find(" ");
230 annot_value = file_data_line.substr(0,space_split_i);
231 an_annot.start = atoi (annot_value.c_str());
232 file_data_line = file_data_line.substr(space_split_i+1);
233 // get annot end index
234 space_split_i = file_data_line.find(" ");
235 annot_value = file_data_line.substr(0,space_split_i);
236 an_annot.end = atoi (annot_value.c_str());
237 file_data_line = file_data_line.substr(space_split_i+1);
239 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
243 space_split_i = file_data_line.find(" ");
244 if (space_split_i == string::npos) // no entries for name & type
246 cout << "seq, annots - no name or type\n";
252 annot_value = file_data_line.substr(0,space_split_i);
253 an_annot.name = annot_value;
254 file_data_line = file_data_line.substr(space_split_i+1);
256 space_split_i = file_data_line.find(" ");
257 if (space_split_i == string::npos) // no entry for type
261 annot_value = file_data_line.substr(0,space_split_i);
262 an_annot.type = annot_value;
267 // add annot to list if it falls within the range of sequence specified
268 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
270 an_annot.start -= start_index;
271 an_annot.end -= start_index;
272 annots.push_back(an_annot);
275 cout << "FAILED!!!!!!\n";
282 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
284 cout << (*list_i).start << "," << (*list_i).end << "\t";
285 cout << (*list_i).name << "\t" << (*list_i).type << endl;
291 bool Sequence::empty() const
293 return (size() == 0);
296 const std::list<annot>& Sequence::annotations() const
301 string::size_type Sequence::length() const
306 string::size_type Sequence::size() const
308 return sequence.size();
311 Sequence::iterator Sequence::begin()
313 return sequence.begin();
316 Sequence::const_iterator Sequence::begin() const
318 return sequence.begin();
321 Sequence::iterator Sequence::end()
323 return sequence.end();
326 Sequence::const_iterator Sequence::end() const
328 return sequence.end();
333 Sequence::get_seq() const
340 Sequence::subseq(int start, int end) const
342 return sequence.substr(start, end);
347 Sequence::c_seq() const
349 return sequence.c_str();
353 Sequence::rev_comp() const
356 char conversionTable[257];
357 int seq_i, table_i, len;
359 len = sequence.length();
360 rev_comp.reserve(len);
361 // make a conversion table
362 // init all parts of conversion table to '~' character
363 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
364 // and may the fleas of 1000 camels infest the genitals of any biologist (and
365 // seven generations of their progeny) who decides to make it mean
366 // something special!!!
367 // PS - double the curse for any smartass non-biologist who tries it as well
368 for(table_i=0; table_i < 256; table_i++)
370 conversionTable[table_i] = '~';
372 // add end of string character for printing out table for testing purposes
373 conversionTable[256] = '\0';
375 // add in the characters for the bases we want to convert
376 conversionTable[(int)'A'] = 'T';
377 conversionTable[(int)'T'] = 'A';
378 conversionTable[(int)'G'] = 'C';
379 conversionTable[(int)'C'] = 'G';
380 conversionTable[(int)'N'] = 'N';
382 // finally, the actual conversion loop
383 for(seq_i = len - 1; seq_i >= 0; seq_i--)
385 table_i = (int) sequence[seq_i];
386 rev_comp += conversionTable[table_i];
394 Sequence::get_header() const
399 //FIXME: i don't think this code is callable
401 Sequence::sp_name() const
408 Sequence::set_seq(const string& a_seq)
410 set_filtered_sequence(a_seq);
432 Sequence::save(fstream &save_file)
433 //string save_file_path)
436 list<annot>::iterator annots_i;
438 // not sure why, or if i'm doing something wrong, but can't seem to pass
439 // file pointers down to this method from the mussa control class
440 // so each call to save a sequence appends to the file started by mussa_class
441 //save_file.open(save_file_path.c_str(), ios::app);
443 save_file << "<Sequence>" << endl;
444 save_file << sequence << endl;
445 save_file << "</Sequence>" << endl;
447 save_file << "<Annotations>" << endl;
448 save_file << species << endl;
449 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
451 save_file << annots_i->start << " " << annots_i->end << " " ;
452 save_file << annots_i->name << " " << annots_i->type << endl;
454 save_file << "</Annotations>" << endl;
459 Sequence::load_museq(string load_file_path, int seq_num)
462 string file_data_line;
465 string::size_type space_split_i;
469 load_file.open(load_file_path.c_str(), ios::in);
472 // search for the seq_num-th sequence
473 while ( (!load_file.eof()) && (seq_counter < seq_num) )
475 getline(load_file,file_data_line);
476 if (file_data_line == "<Sequence>")
479 getline(load_file, file_data_line);
480 sequence = file_data_line;
481 getline(load_file, file_data_line);
482 getline(load_file, file_data_line);
483 if (file_data_line == "<Annotations>")
485 getline(load_file, file_data_line);
486 species = file_data_line;
487 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
489 getline(load_file,file_data_line);
490 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
492 // need to get 4 values...almost same code 4 times...
493 // get annot start index
494 space_split_i = file_data_line.find(" ");
495 annot_value = file_data_line.substr(0,space_split_i);
496 an_annot.start = atoi (annot_value.c_str());
497 file_data_line = file_data_line.substr(space_split_i+1);
498 // get annot end index
499 space_split_i = file_data_line.find(" ");
500 annot_value = file_data_line.substr(0,space_split_i);
501 an_annot.end = atoi (annot_value.c_str());
503 if (space_split_i == string::npos) // no entry for type or name
505 cout << "seq, annots - no type or name\n";
509 else // else get annot type
511 file_data_line = file_data_line.substr(space_split_i+1);
512 space_split_i = file_data_line.find(" ");
513 annot_value = file_data_line.substr(0,space_split_i);
514 an_annot.type = annot_value;
515 if (space_split_i == string::npos) // no entry for name
517 cout << "seq, annots - no name\n";
520 else // get annot name
522 file_data_line = file_data_line.substr(space_split_i+1);
523 space_split_i = file_data_line.find(" ");
524 annot_value = file_data_line.substr(0,space_split_i);
525 an_annot.type = annot_value;
528 annots.push_back(an_annot); // don't forget to actually add the annot
530 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
531 // << "-->" << an_annot.type << "::" << an_annot.name << endl;
539 Sequence::rc_motif(string a_motif)
542 char conversionTable[257];
543 int seq_i, table_i, len;
545 len = a_motif.length();
546 rev_comp.reserve(len);
548 for(table_i=0; table_i < 256; table_i++)
550 conversionTable[table_i] = '~';
552 // add end of string character for printing out table for testing purposes
553 conversionTable[256] = '\0';
555 // add in the characters for the bases we want to convert (IUPAC)
556 conversionTable[(int)'A'] = 'T';
557 conversionTable[(int)'T'] = 'A';
558 conversionTable[(int)'G'] = 'C';
559 conversionTable[(int)'C'] = 'G';
560 conversionTable[(int)'N'] = 'N';
561 conversionTable[(int)'M'] = 'K';
562 conversionTable[(int)'R'] = 'Y';
563 conversionTable[(int)'W'] = 'W';
564 conversionTable[(int)'S'] = 'S';
565 conversionTable[(int)'Y'] = 'R';
566 conversionTable[(int)'K'] = 'M';
567 conversionTable[(int)'V'] = 'B';
568 conversionTable[(int)'H'] = 'D';
569 conversionTable[(int)'D'] = 'H';
570 conversionTable[(int)'B'] = 'V';
572 // finally, the actual conversion loop
573 for(seq_i = len - 1; seq_i >= 0; seq_i--)
575 //cout << "** i = " << seq_i << " bp = " <<
576 table_i = (int) a_motif[seq_i];
577 rev_comp += conversionTable[table_i];
580 //cout << "seq: " << a_motif << endl;
581 //cout << "rc: " << rev_comp << endl;
587 Sequence::motif_validate(string a_motif)
592 len = a_motif.length();
593 valid_motif.reserve(len);
595 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
596 // current nonIUPAC symbols are omitted, which is not reported atm
597 for(seq_i = 0; seq_i < len; seq_i++)
599 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
601 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
603 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
605 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
607 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
609 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
611 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
613 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
615 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
617 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
619 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
621 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
623 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
625 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
627 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
631 //cout << "valid_motif is: " << valid_motif << endl;
636 void Sequence::add_motif(string a_motif)
638 vector<int> motif_starts = find_motif(a_motif);
641 for(vector<int>::iterator motif_start_i = motif_starts.begin();
642 motif_start_i != motif_starts.end();
645 motif_list.push_back(motif(*motif_start_i, a_motif));
649 const list<motif>& Sequence::motifs() const
655 Sequence::find_motif(string a_motif)
657 vector<int> motif_match_starts;
660 motif_match_starts.clear();
662 //cout << "motif is: " << a_motif << endl;
663 a_motif = motif_validate(a_motif);
664 //cout << "motif is: " << a_motif << endl;
669 //cout << "Sequence: none blank motif\n";
670 motif_scan(a_motif, &motif_match_starts);
672 a_motif_rc = rc_motif(a_motif);
673 // make sure not to do search again if it is a palindrome
674 if (a_motif_rc != a_motif)
675 motif_scan(a_motif_rc, &motif_match_starts);
677 return motif_match_starts;
681 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
684 string::size_type seq_i;
685 int motif_i, motif_len;
687 // faster to loop thru the sequence as a old c string (ie char array)
688 seq_c = (char*)sequence.c_str();
689 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
690 motif_len = a_motif.length();
692 //cout << "motif_length: " << motif_len << endl;
693 //cout << "RAAARRRRR\n";
697 //cout << "motif: " << a_motif << endl;
699 //cout << "Sequence: motif, length= " << length << endl;
701 while (seq_i < sequence.length())
703 //cout << seq_c[seq_i];
704 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
705 // this is pretty much a straight translation of Nora's python code
706 // to match iupac letter codes
707 if (a_motif[motif_i] =='N')
709 else if (a_motif[motif_i] == seq_c[seq_i])
711 else if ((a_motif[motif_i] =='M') &&
712 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
714 else if ((a_motif[motif_i] =='R') &&
715 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
717 else if ((a_motif[motif_i] =='W') &&
718 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
720 else if ((a_motif[motif_i] =='S') &&
721 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
723 else if ((a_motif[motif_i] =='Y') &&
724 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
726 else if ((a_motif[motif_i] =='K') &&
727 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
729 else if ((a_motif[motif_i] =='V') &&
730 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
731 (seq_c[seq_i]=='G')))
733 else if ((a_motif[seq_i] =='H') &&
734 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
735 (seq_c[seq_i]=='T')))
737 else if ((a_motif[motif_i] =='D') &&
738 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
739 (seq_c[seq_i]=='T')))
741 else if ((a_motif[motif_i] =='B') &&
742 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
743 (seq_c[seq_i]=='T')))
752 // end Nora stuff, now we see if a match is found this pass
753 if (motif_i == motif_len)
757 motif_match_starts->push_back(seq_i - motif_len + 1);