1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
25 #include "alg/sequence.hpp"
26 #include "mussa_exceptions.hpp"
37 Sequence::Sequence(string seq)
39 set_filtered_sequence(seq);
42 Sequence &Sequence::operator=(const Sequence& s)
45 sequence = s.sequence;
53 Sequence &Sequence::operator=(const std::string& s)
55 set_filtered_sequence(s);
59 //! load a fasta file into a sequence
61 * \param file_path the location of the fasta file in the filesystem
62 * \param seq_num which sequence in the file to load
63 * \param start_index starting position in the fasta sequence, 0 for beginning
64 * \param end_index ending position in the fasta sequence, 0 for end
65 * \return error message, empty string if no error. (gag!)
68 Sequence::load_fasta(string file_path, int seq_num,
69 int start_index, int end_index)
72 string file_data_line;
73 int header_counter = 0;
77 string seq_tmp; // holds sequence during basic filtering
79 data_file.open(file_path.c_str(), ios::in);
83 throw mussa_load_error("Sequence File: " + file_path + " not found");
85 // if file opened okay, read it
88 // search for the header of the fasta sequence we want
89 while ( (!data_file.eof()) && (header_counter < seq_num) )
91 getline(data_file,file_data_line);
92 if (file_data_line.substr(0,1) == ">")
96 header = file_data_line.substr(1);
100 while ( !data_file.eof() && read_seq )
102 getline(data_file,file_data_line);
103 if (file_data_line.substr(0,1) == ">")
105 else sequence_raw += file_data_line;
110 // Lastly, if subselection of the sequence was specified we keep cut out
111 // and only keep that part
112 // end_index = 0 means no end was specified, so cut to the end
114 end_index = sequence_raw.size();
116 // sequence filtering for upcasing agctn and convert non AGCTN to N
117 set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
121 void Sequence::set_filtered_sequence(const string &old_seq,
122 string::size_type start,
123 string::size_type count)
125 char conversionTable[257];
128 count = old_seq.size() - start;
130 sequence.reserve(count);
132 // Make a conversion table
134 // everything we don't specify below will become 'N'
135 for(int table_i=0; table_i < 256; table_i++)
137 conversionTable[table_i] = 'N';
139 // add end of string character for printing out table for testing purposes
140 conversionTable[256] = '\0';
142 // we want these to map to themselves - ie not to change
143 conversionTable[(int)'A'] = 'A';
144 conversionTable[(int)'T'] = 'T';
145 conversionTable[(int)'G'] = 'G';
146 conversionTable[(int)'C'] = 'C';
148 conversionTable[(int)'a'] = 'A';
149 conversionTable[(int)'t'] = 'T';
150 conversionTable[(int)'g'] = 'G';
151 conversionTable[(int)'c'] = 'C';
153 // finally, the actual conversion loop
154 for(string::size_type seq_index = 0; seq_index < count; seq_index++)
156 sequence += conversionTable[ (int)old_seq[seq_index+start]];
160 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
161 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
165 Sequence::load_annot(string file_path, int start_index, int end_index)
168 string file_data_line;
170 string::size_type space_split_i;
172 list<annot>::iterator list_i;
177 data_file.open(file_path.c_str(), ios::in);
181 throw mussa_load_error("Sequence File: " + file_path + " not found");
183 // if file opened okay, read it
186 getline(data_file,file_data_line);
187 species = file_data_line;
189 // end_index = 0 means no end was specified, so cut to the end
191 end_index = sequence.length();
193 //cout << "START: " << start_index << " END: " << end_index << endl;
195 while ( !data_file.eof() )
197 getline(data_file,file_data_line);
198 if (file_data_line != "")
200 // need to get 4 values...almost same code 4 times...
201 // get annot start index
202 space_split_i = file_data_line.find(" ");
203 annot_value = file_data_line.substr(0,space_split_i);
204 an_annot.start = atoi (annot_value.c_str());
205 file_data_line = file_data_line.substr(space_split_i+1);
206 // get annot end index
207 space_split_i = file_data_line.find(" ");
208 annot_value = file_data_line.substr(0,space_split_i);
209 an_annot.end = atoi (annot_value.c_str());
210 file_data_line = file_data_line.substr(space_split_i+1);
212 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
216 space_split_i = file_data_line.find(" ");
217 if (space_split_i == string::npos) // no entries for name & type
219 cout << "seq, annots - no name or type\n";
225 annot_value = file_data_line.substr(0,space_split_i);
226 an_annot.name = annot_value;
227 file_data_line = file_data_line.substr(space_split_i+1);
229 space_split_i = file_data_line.find(" ");
230 if (space_split_i == string::npos) // no entry for type
234 annot_value = file_data_line.substr(0,space_split_i);
235 an_annot.type = annot_value;
240 // add annot to list if it falls within the range of sequence specified
241 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
243 an_annot.start -= start_index;
244 an_annot.end -= start_index;
245 annots.push_back(an_annot);
248 cout << "FAILED!!!!!!\n";
255 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
257 cout << (*list_i).start << "," << (*list_i).end << "\t";
258 cout << (*list_i).name << "\t" << (*list_i).type << endl;
264 bool Sequence::empty() const
266 return (size() == 0);
269 const std::list<annot> Sequence::annotations() const
274 string::size_type Sequence::length() const
279 string::size_type Sequence::size() const
281 return sequence.size();
284 Sequence::iterator Sequence::begin()
286 return sequence.begin();
289 Sequence::const_iterator Sequence::begin() const
291 return sequence.begin();
294 Sequence::iterator Sequence::end()
296 return sequence.end();
299 Sequence::const_iterator Sequence::end() const
301 return sequence.end();
306 Sequence::get_seq() const
313 Sequence::subseq(int start, int end) const
315 return sequence.substr(start, end);
320 Sequence::c_seq() const
322 return sequence.c_str();
326 Sequence::rev_comp() const
329 char conversionTable[257];
330 int seq_i, table_i, len;
332 len = sequence.length();
333 rev_comp.reserve(len);
334 // make a conversion table
335 // init all parts of conversion table to '~' character
336 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
337 // and may the fleas of 1000 camels infest the genitals of any biologist (and
338 // seven generations of their progeny) who decides to make it mean
339 // something special!!!
340 // PS - double the curse for any smartass non-biologist who tries it as well
341 for(table_i=0; table_i < 256; table_i++)
343 conversionTable[table_i] = '~';
345 // add end of string character for printing out table for testing purposes
346 conversionTable[256] = '\0';
348 // add in the characters for the bases we want to convert
349 conversionTable[(int)'A'] = 'T';
350 conversionTable[(int)'T'] = 'A';
351 conversionTable[(int)'G'] = 'C';
352 conversionTable[(int)'C'] = 'G';
353 conversionTable[(int)'N'] = 'N';
355 // finally, the actual conversion loop
356 for(seq_i = len - 1; seq_i >= 0; seq_i--)
358 table_i = (int) sequence[seq_i];
359 rev_comp += conversionTable[table_i];
367 Sequence::get_header() const
372 //FIXME: i don't think this code is callable
374 Sequence::sp_name() const
381 Sequence::set_seq(const string& a_seq)
383 set_filtered_sequence(a_seq);
405 Sequence::save(fstream &save_file)
406 //string save_file_path)
409 list<annot>::iterator annots_i;
411 // not sure why, or if i'm doing something wrong, but can't seem to pass
412 // file pointers down to this method from the mussa control class
413 // so each call to save a sequence appends to the file started by mussa_class
414 //save_file.open(save_file_path.c_str(), ios::app);
416 save_file << "<Sequence>" << endl;
417 save_file << sequence << endl;
418 save_file << "</Sequence>" << endl;
420 save_file << "<Annotations>" << endl;
421 save_file << species << endl;
422 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
424 save_file << annots_i->start << " " << annots_i->end << " " ;
425 save_file << annots_i->name << " " << annots_i->type << endl;
427 save_file << "</Annotations>" << endl;
432 Sequence::load_museq(string load_file_path, int seq_num)
435 string file_data_line;
438 string::size_type space_split_i;
442 load_file.open(load_file_path.c_str(), ios::in);
445 // search for the seq_num-th sequence
446 while ( (!load_file.eof()) && (seq_counter < seq_num) )
448 getline(load_file,file_data_line);
449 if (file_data_line == "<Sequence>")
452 getline(load_file, file_data_line);
453 sequence = file_data_line;
454 getline(load_file, file_data_line);
455 getline(load_file, file_data_line);
456 if (file_data_line == "<Annotations>")
458 getline(load_file, file_data_line);
459 species = file_data_line;
460 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
462 getline(load_file,file_data_line);
463 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
465 // need to get 4 values...almost same code 4 times...
466 // get annot start index
467 space_split_i = file_data_line.find(" ");
468 annot_value = file_data_line.substr(0,space_split_i);
469 an_annot.start = atoi (annot_value.c_str());
470 file_data_line = file_data_line.substr(space_split_i+1);
471 // get annot end index
472 space_split_i = file_data_line.find(" ");
473 annot_value = file_data_line.substr(0,space_split_i);
474 an_annot.end = atoi (annot_value.c_str());
476 if (space_split_i == string::npos) // no entry for type or name
478 cout << "seq, annots - no type or name\n";
482 else // else get annot type
484 file_data_line = file_data_line.substr(space_split_i+1);
485 space_split_i = file_data_line.find(" ");
486 annot_value = file_data_line.substr(0,space_split_i);
487 an_annot.type = annot_value;
488 if (space_split_i == string::npos) // no entry for name
490 cout << "seq, annots - no name\n";
493 else // get annot name
495 file_data_line = file_data_line.substr(space_split_i+1);
496 space_split_i = file_data_line.find(" ");
497 annot_value = file_data_line.substr(0,space_split_i);
498 an_annot.type = annot_value;
501 annots.push_back(an_annot); // don't forget to actually add the annot
503 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
504 // << "-->" << an_annot.type << "::" << an_annot.name << endl;
512 Sequence::rc_motif(string a_motif)
515 char conversionTable[257];
516 int seq_i, table_i, len;
518 len = a_motif.length();
519 rev_comp.reserve(len);
521 for(table_i=0; table_i < 256; table_i++)
523 conversionTable[table_i] = '~';
525 // add end of string character for printing out table for testing purposes
526 conversionTable[256] = '\0';
528 // add in the characters for the bases we want to convert (IUPAC)
529 conversionTable[(int)'A'] = 'T';
530 conversionTable[(int)'T'] = 'A';
531 conversionTable[(int)'G'] = 'C';
532 conversionTable[(int)'C'] = 'G';
533 conversionTable[(int)'N'] = 'N';
534 conversionTable[(int)'M'] = 'K';
535 conversionTable[(int)'R'] = 'Y';
536 conversionTable[(int)'W'] = 'W';
537 conversionTable[(int)'S'] = 'S';
538 conversionTable[(int)'Y'] = 'R';
539 conversionTable[(int)'K'] = 'M';
540 conversionTable[(int)'V'] = 'B';
541 conversionTable[(int)'H'] = 'D';
542 conversionTable[(int)'D'] = 'H';
543 conversionTable[(int)'B'] = 'V';
545 // finally, the actual conversion loop
546 for(seq_i = len - 1; seq_i >= 0; seq_i--)
548 //cout << "** i = " << seq_i << " bp = " <<
549 table_i = (int) a_motif[seq_i];
550 rev_comp += conversionTable[table_i];
553 //cout << "seq: " << a_motif << endl;
554 //cout << "rc: " << rev_comp << endl;
560 Sequence::motif_validate(string a_motif)
565 len = a_motif.length();
566 valid_motif.reserve(len);
568 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
569 // current nonIUPAC symbols are omitted, which is not reported atm
570 for(seq_i = 0; seq_i < len; seq_i++)
572 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
574 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
576 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
578 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
580 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
582 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
584 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
586 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
588 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
590 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
592 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
594 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
596 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
598 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
600 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
604 //cout << "valid_motif is: " << valid_motif << endl;
611 Sequence::find_motif(string a_motif)
613 vector<int> motif_match_starts;
617 motif_match_starts.clear();
619 //cout << "motif is: " << a_motif << endl;
620 a_motif = motif_validate(a_motif);
621 //cout << "motif is: " << a_motif << endl;
626 //cout << "Sequence: none blank motif\n";
627 motif_scan(a_motif, &motif_match_starts);
629 a_motif_rc = rc_motif(a_motif);
630 // make sure not to do search again if it is a palindrome
631 if (a_motif_rc != a_motif)
632 motif_scan(a_motif_rc, &motif_match_starts);
634 return motif_match_starts;
638 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
641 string::size_type seq_i;
642 int motif_i, motif_len;
644 // faster to loop thru the sequence as a old c string (ie char array)
645 seq_c = (char*)sequence.c_str();
646 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
647 motif_len = a_motif.length();
649 //cout << "motif_length: " << motif_len << endl;
650 //cout << "RAAARRRRR\n";
654 //cout << "motif: " << a_motif << endl;
656 //cout << "Sequence: motif, length= " << length << endl;
658 while (seq_i < sequence.length())
660 //cout << seq_c[seq_i];
661 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
662 // this is pretty much a straight translation of Nora's python code
663 // to match iupac letter codes
664 if (a_motif[motif_i] =='N')
666 else if (a_motif[motif_i] == seq_c[seq_i])
668 else if ((a_motif[motif_i] =='M') &&
669 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
671 else if ((a_motif[motif_i] =='R') &&
672 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
674 else if ((a_motif[motif_i] =='W') &&
675 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
677 else if ((a_motif[motif_i] =='S') &&
678 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
680 else if ((a_motif[motif_i] =='Y') &&
681 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
683 else if ((a_motif[motif_i] =='K') &&
684 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
686 else if ((a_motif[motif_i] =='V') &&
687 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
688 (seq_c[seq_i]=='G')))
690 else if ((a_motif[seq_i] =='H') &&
691 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
692 (seq_c[seq_i]=='T')))
694 else if ((a_motif[motif_i] =='D') &&
695 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
696 (seq_c[seq_i]=='T')))
698 else if ((a_motif[motif_i] =='B') &&
699 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
700 (seq_c[seq_i]=='T')))
709 // end Nora stuff, now we see if a match is found this pass
710 if (motif_i == motif_len)
713 motif_match_starts->push_back(seq_i - motif_len + 1);