1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // This file is part of the Mussa source distribution.
12 // http://mussa.caltech.edu/
13 // Contact author: Tristan De Buysscher, tristan@caltech.edu
15 // This program and all associated source code files are Copyright (C) 2005
16 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
17 // under the GNU Public License; please see the included LICENSE.txt
18 // file for more information, or contact Tristan directly.
21 // ----------------------------------------
22 // ---------- sequence.cc -----------
23 // ----------------------------------------
24 #include <boost/filesystem/fstream.hpp>
26 #include "alg/sequence.hpp"
27 #include "mussa_exceptions.hpp"
33 namespace fs = boost::filesystem;
44 annot::annot(int start, int end, std::string type, std::string name)
52 motif::motif(int start, std::string motif)
53 : annot(start, start+motif.size(), "motif", motif),
67 Sequence::Sequence(string seq)
69 set_filtered_sequence(seq);
72 Sequence &Sequence::operator=(const Sequence& s)
75 sequence = s.sequence;
83 Sequence &Sequence::operator=(const std::string& s)
85 set_filtered_sequence(s);
89 char Sequence::operator[](int index) const
91 return sequence[index];
94 ostream& operator<<(ostream& out, const Sequence& seq)
96 out << "Sequence(" << seq.get_seq() << ")";
100 //! load a fasta file into a sequence
102 * \param file_path the location of the fasta file in the filesystem
103 * \param seq_num which sequence in the file to load
104 * \param start_index starting position in the fasta sequence, 0 for beginning
105 * \param end_index ending position in the fasta sequence, 0 for end
106 * \return error message, empty string if no error. (gag!)
109 Sequence::load_fasta(fs::path file_path, int seq_num,
110 int start_index, int end_index)
112 fs::fstream data_file;
113 string file_data_line;
114 int header_counter = 0;
115 bool read_seq = true;
118 string seq_tmp; // holds sequence during basic filtering
120 data_file.open(file_path, ios::in);
123 throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
127 throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
129 // if file opened okay, read it
132 // search for the header of the fasta sequence we want
133 while ( (!data_file.eof()) && (header_counter < seq_num) )
135 getline(data_file,file_data_line);
136 if (file_data_line.substr(0,1) == ">")
140 if (header_counter > 0) {
141 header = file_data_line.substr(1);
145 while ( !data_file.eof() && read_seq ) {
146 getline(data_file,file_data_line);
147 if (file_data_line.substr(0,1) == ">")
149 else sequence_raw += file_data_line;
152 // Lastly, if subselection of the sequence was specified we keep cut out
153 // and only keep that part
154 // end_index = 0 means no end was specified, so cut to the end
156 end_index = sequence_raw.size();
158 // sequence filtering for upcasing agctn and convert non AGCTN to N
159 set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
161 stringstream errormsg;
162 errormsg << file_path.native_file_string()
163 << " did not have any fasta sequences" << endl;
164 throw mussa_load_error(errormsg.str());
170 void Sequence::set_filtered_sequence(const string &old_seq,
171 string::size_type start,
172 string::size_type count)
174 char conversionTable[257];
177 count = old_seq.size() - start;
179 sequence.reserve(count);
181 // Make a conversion table
183 // everything we don't specify below will become 'N'
184 for(int table_i=0; table_i < 256; table_i++)
186 conversionTable[table_i] = 'N';
188 // add end of string character for printing out table for testing purposes
189 conversionTable[256] = '\0';
191 // we want these to map to themselves - ie not to change
192 conversionTable[(int)'A'] = 'A';
193 conversionTable[(int)'T'] = 'T';
194 conversionTable[(int)'G'] = 'G';
195 conversionTable[(int)'C'] = 'C';
197 conversionTable[(int)'a'] = 'A';
198 conversionTable[(int)'t'] = 'T';
199 conversionTable[(int)'g'] = 'G';
200 conversionTable[(int)'c'] = 'C';
202 // finally, the actual conversion loop
203 for(string::size_type seq_index = 0; seq_index < count; seq_index++)
205 sequence += conversionTable[ (int)old_seq[seq_index+start]];
209 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
210 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
214 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
216 fs::fstream data_file;
217 string file_data_line;
219 string::size_type space_split_i;
221 list<annot>::iterator list_i;
226 data_file.open(file_path, ios::in);
230 throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
232 // if file opened okay, read it
235 getline(data_file,file_data_line);
236 species = file_data_line;
238 // end_index = 0 means no end was specified, so cut to the end
240 end_index = sequence.length();
242 //cout << "START: " << start_index << " END: " << end_index << endl;
244 while ( !data_file.eof() )
246 getline(data_file,file_data_line);
247 if (file_data_line != "")
249 // need to get 4 values...almost same code 4 times...
250 // get annot start index
251 space_split_i = file_data_line.find(" ");
252 annot_value = file_data_line.substr(0,space_split_i);
253 an_annot.start = atoi (annot_value.c_str());
254 file_data_line = file_data_line.substr(space_split_i+1);
255 // get annot end index
256 space_split_i = file_data_line.find(" ");
257 annot_value = file_data_line.substr(0,space_split_i);
258 an_annot.end = atoi (annot_value.c_str());
259 file_data_line = file_data_line.substr(space_split_i+1);
261 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
265 space_split_i = file_data_line.find(" ");
266 if (space_split_i == string::npos) // no entries for name & type
268 cout << "seq, annots - no name or type\n";
274 annot_value = file_data_line.substr(0,space_split_i);
275 an_annot.name = annot_value;
276 file_data_line = file_data_line.substr(space_split_i+1);
278 space_split_i = file_data_line.find(" ");
279 if (space_split_i == string::npos) // no entry for type
283 annot_value = file_data_line.substr(0,space_split_i);
284 an_annot.type = annot_value;
289 // add annot to list if it falls within the range of sequence specified
290 if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
292 an_annot.start -= start_index;
293 an_annot.end -= start_index;
294 annots.push_back(an_annot);
296 // else no (or bogus) annotations
303 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
305 cout << (*list_i).start << "," << (*list_i).end << "\t";
306 cout << (*list_i).name << "\t" << (*list_i).type << endl;
312 const std::string& Sequence::get_species() const
317 bool Sequence::empty() const
319 return (size() == 0);
322 const std::list<annot>& Sequence::annotations() const
327 string::size_type Sequence::length() const
332 string::size_type Sequence::size() const
334 return sequence.size();
337 Sequence::iterator Sequence::begin()
339 return sequence.begin();
342 Sequence::const_iterator Sequence::begin() const
344 return sequence.begin();
347 Sequence::iterator Sequence::end()
349 return sequence.end();
352 Sequence::const_iterator Sequence::end() const
354 return sequence.end();
359 Sequence::get_seq() const
366 Sequence::subseq(int start, int end) const
368 return sequence.substr(start, end);
373 Sequence::c_seq() const
375 return sequence.c_str();
379 Sequence::rev_comp() const
382 char conversionTable[257];
383 int seq_i, table_i, len;
385 len = sequence.length();
386 rev_comp.reserve(len);
387 // make a conversion table
388 // init all parts of conversion table to '~' character
389 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
390 // and may the fleas of 1000 camels infest the genitals of any biologist (and
391 // seven generations of their progeny) who decides to make it mean
392 // something special!!!
393 // PS - double the curse for any smartass non-biologist who tries it as well
394 for(table_i=0; table_i < 256; table_i++)
396 conversionTable[table_i] = '~';
398 // add end of string character for printing out table for testing purposes
399 conversionTable[256] = '\0';
401 // add in the characters for the bases we want to convert
402 conversionTable[(int)'A'] = 'T';
403 conversionTable[(int)'T'] = 'A';
404 conversionTable[(int)'G'] = 'C';
405 conversionTable[(int)'C'] = 'G';
406 conversionTable[(int)'N'] = 'N';
408 // finally, the actual conversion loop
409 for(seq_i = len - 1; seq_i >= 0; seq_i--)
411 table_i = (int) sequence[seq_i];
412 rev_comp += conversionTable[table_i];
420 Sequence::get_header() const
425 //FIXME: i don't think this code is callable
427 Sequence::sp_name() const
434 Sequence::set_seq(const string& a_seq)
436 set_filtered_sequence(a_seq);
458 Sequence::save(fs::fstream &save_file)
459 //string save_file_path)
462 list<annot>::iterator annots_i;
464 // not sure why, or if i'm doing something wrong, but can't seem to pass
465 // file pointers down to this method from the mussa control class
466 // so each call to save a sequence appends to the file started by mussa_class
467 //save_file.open(save_file_path.c_str(), ios::app);
469 save_file << "<Sequence>" << endl;
470 save_file << sequence << endl;
471 save_file << "</Sequence>" << endl;
473 save_file << "<Annotations>" << endl;
474 save_file << species << endl;
475 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
477 save_file << annots_i->start << " " << annots_i->end << " " ;
478 save_file << annots_i->name << " " << annots_i->type << endl;
480 save_file << "</Annotations>" << endl;
485 Sequence::load_museq(fs::path load_file_path, int seq_num)
487 fs::fstream load_file;
488 string file_data_line;
491 string::size_type space_split_i;
495 load_file.open(load_file_path, ios::in);
498 // search for the seq_num-th sequence
499 while ( (!load_file.eof()) && (seq_counter < seq_num) )
501 getline(load_file,file_data_line);
502 if (file_data_line == "<Sequence>")
505 getline(load_file, file_data_line);
506 sequence = file_data_line;
507 getline(load_file, file_data_line);
508 getline(load_file, file_data_line);
509 if (file_data_line == "<Annotations>")
511 getline(load_file, file_data_line);
512 species = file_data_line;
513 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
515 getline(load_file,file_data_line);
516 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
518 // need to get 4 values...almost same code 4 times...
519 // get annot start index
520 space_split_i = file_data_line.find(" ");
521 annot_value = file_data_line.substr(0,space_split_i);
522 an_annot.start = atoi (annot_value.c_str());
523 file_data_line = file_data_line.substr(space_split_i+1);
524 // get annot end index
525 space_split_i = file_data_line.find(" ");
526 annot_value = file_data_line.substr(0,space_split_i);
527 an_annot.end = atoi (annot_value.c_str());
529 if (space_split_i == string::npos) // no entry for type or name
531 cout << "seq, annots - no type or name\n";
535 else // else get annot type
537 file_data_line = file_data_line.substr(space_split_i+1);
538 space_split_i = file_data_line.find(" ");
539 annot_value = file_data_line.substr(0,space_split_i);
540 an_annot.type = annot_value;
541 if (space_split_i == string::npos) // no entry for name
543 cout << "seq, annots - no name\n";
546 else // get annot name
548 file_data_line = file_data_line.substr(space_split_i+1);
549 space_split_i = file_data_line.find(" ");
550 annot_value = file_data_line.substr(0,space_split_i);
551 an_annot.type = annot_value;
554 annots.push_back(an_annot); // don't forget to actually add the annot
556 //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
557 // << "-->" << an_annot.type << "::" << an_annot.name << endl;
565 Sequence::rc_motif(string a_motif)
568 char conversionTable[257];
569 int seq_i, table_i, len;
571 len = a_motif.length();
572 rev_comp.reserve(len);
574 for(table_i=0; table_i < 256; table_i++)
576 conversionTable[table_i] = '~';
578 // add end of string character for printing out table for testing purposes
579 conversionTable[256] = '\0';
581 // add in the characters for the bases we want to convert (IUPAC)
582 conversionTable[(int)'A'] = 'T';
583 conversionTable[(int)'T'] = 'A';
584 conversionTable[(int)'G'] = 'C';
585 conversionTable[(int)'C'] = 'G';
586 conversionTable[(int)'N'] = 'N';
587 conversionTable[(int)'M'] = 'K';
588 conversionTable[(int)'R'] = 'Y';
589 conversionTable[(int)'W'] = 'W';
590 conversionTable[(int)'S'] = 'S';
591 conversionTable[(int)'Y'] = 'R';
592 conversionTable[(int)'K'] = 'M';
593 conversionTable[(int)'V'] = 'B';
594 conversionTable[(int)'H'] = 'D';
595 conversionTable[(int)'D'] = 'H';
596 conversionTable[(int)'B'] = 'V';
598 // finally, the actual conversion loop
599 for(seq_i = len - 1; seq_i >= 0; seq_i--)
601 //cout << "** i = " << seq_i << " bp = " <<
602 table_i = (int) a_motif[seq_i];
603 rev_comp += conversionTable[table_i];
606 //cout << "seq: " << a_motif << endl;
607 //cout << "rc: " << rev_comp << endl;
613 Sequence::motif_normalize(string a_motif)
618 len = a_motif.length();
619 valid_motif.reserve(len);
621 // this just upcases IUPAC symbols. Eventually should return an error if non IUPAC is present.
622 // current nonIUPAC symbols are omitted, which is not reported atm
623 for(seq_i = 0; seq_i < len; seq_i++)
625 if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
627 else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
629 else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
631 else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
633 else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
635 else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
637 else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
639 else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
641 else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
643 else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
645 else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
647 else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
649 else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
651 else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
653 else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
656 string msg = "Letter ";
657 msg += a_motif[seq_i];
658 msg += " is not a valid IUPAC symbol";
659 throw motif_normalize_error(msg);
662 //cout << "valid_motif is: " << valid_motif << endl;
666 void Sequence::add_motif(string a_motif)
668 vector<int> motif_starts = find_motif(a_motif);
670 for(vector<int>::iterator motif_start_i = motif_starts.begin();
671 motif_start_i != motif_starts.end();
674 motif_list.push_back(motif(*motif_start_i, a_motif));
678 void Sequence::clear_motifs()
683 const list<motif>& Sequence::motifs() const
689 Sequence::find_motif(string a_motif)
691 vector<int> motif_match_starts;
694 motif_match_starts.clear();
696 //cout << "motif is: " << a_motif << endl;
697 a_motif = motif_normalize(a_motif);
698 //cout << "motif is: " << a_motif << endl;
702 //cout << "Sequence: none blank motif\n";
703 motif_scan(a_motif, &motif_match_starts);
705 a_motif_rc = rc_motif(a_motif);
706 // make sure not to do search again if it is a palindrome
707 if (a_motif_rc != a_motif)
708 motif_scan(a_motif_rc, &motif_match_starts);
710 return motif_match_starts;
714 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
717 string::size_type seq_i;
718 int motif_i, motif_len;
720 // faster to loop thru the sequence as a old c string (ie char array)
721 seq_c = (char*)sequence.c_str();
722 //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
723 motif_len = a_motif.length();
725 //cout << "motif_length: " << motif_len << endl;
726 //cout << "RAAARRRRR\n";
730 //cout << "motif: " << a_motif << endl;
732 //cout << "Sequence: motif, length= " << length << endl;
734 while (seq_i < sequence.length())
736 //cout << seq_c[seq_i];
737 //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
738 // this is pretty much a straight translation of Nora's python code
739 // to match iupac letter codes
740 if (a_motif[motif_i] =='N')
742 else if (a_motif[motif_i] == seq_c[seq_i])
744 else if ((a_motif[motif_i] =='M') &&
745 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
747 else if ((a_motif[motif_i] =='R') &&
748 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
750 else if ((a_motif[motif_i] =='W') &&
751 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
753 else if ((a_motif[motif_i] =='S') &&
754 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
756 else if ((a_motif[motif_i] =='Y') &&
757 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
759 else if ((a_motif[motif_i] =='K') &&
760 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
762 else if ((a_motif[motif_i] =='V') &&
763 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
764 (seq_c[seq_i]=='G')))
766 else if ((a_motif[seq_i] =='H') &&
767 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
768 (seq_c[seq_i]=='T')))
770 else if ((a_motif[motif_i] =='D') &&
771 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
772 (seq_c[seq_i]=='T')))
774 else if ((a_motif[motif_i] =='B') &&
775 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
776 (seq_c[seq_i]=='T')))
785 // end Nora stuff, now we see if a match is found this pass
786 if (motif_i == motif_len)
790 motif_match_starts->push_back(seq_i - motif_len + 1);
799 void Sequence::add_string_annotation(std::string a_seq,
802 vector<int> seq_starts = find_motif(a_seq);
804 for(vector<int>::iterator seq_start_i = seq_starts.begin();
805 seq_start_i != seq_starts.end();
808 annots.push_back(annot(*seq_start_i,
809 *seq_start_i+a_seq.size(),
815 void Sequence::find_sequences(std::list<Sequence>::iterator start,
816 std::list<Sequence>::iterator end)
818 while (start != end) {
819 add_string_annotation(start->get_seq(), start->get_header());