alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25
  26 #include "alg/sequence.hpp"
  27 #include "mussa_exceptions.hpp"
  28
  29 #include <string>
  30 #include <iostream>
  31 #include <sstream>
  32
  33 namespace fs = boost::filesystem;
  34 using namespace std;
  35
  36 annot::annot()
  37  : start(0),
  38    end(0),
  39    type(""),
  40    name("")
  41 {
  42 }
  43
  44 annot::annot(int start, int end, std::string type, std::string name)
  45  : start(start),
  46    end(end),
  47    type(type),
  48    name(name)
  49 {
  50 }
  51
  52 motif::motif(int start, std::string motif)
  53  : annot(start, start+motif.size(), "motif", motif),
  54    sequence(motif)
  55 {
  56 }
  57
  58 Sequence::Sequence()
  59   : sequence(""),
  60     header(""),
  61     species("")
  62 {
  63   annots.clear();
  64   motif_list.clear();
  65 }
  66
  67 Sequence::Sequence(string seq)
  68 {
  69   set_filtered_sequence(seq);
  70 }
  71
  72 Sequence &Sequence::operator=(const Sequence& s)
  73 {
  74   if (this != &s) {
  75     sequence = s.sequence;
  76     header = s.header;
  77     species = s.species;
  78     annots = s.annots;
  79   }
  80   return *this;
  81 }
  82
  83 Sequence &Sequence::operator=(const std::string& s)
  84 {
  85   set_filtered_sequence(s);
  86   return *this;
  87 }
  88
  89 char Sequence::operator[](int index) const
  90 {
  91   return sequence[index];
  92 }
  93
  94 ostream& operator<<(ostream& out, const Sequence& seq)
  95 {
  96   out << "Sequence(" << seq.get_seq() << ")";
  97   return out;
  98 }
  99
 100 //! load a fasta file into a sequence
 101 /*!
 102  * \param file_path the location of the fasta file in the filesystem
 103  * \param seq_num which sequence in the file to load
 104  * \param start_index starting position in the fasta sequence, 0 for beginning
 105  * \param end_index ending position in the fasta sequence, 0 for end
 106  * \return error message, empty string if no error. (gag!)
 107  */
 108 void
 109 Sequence::load_fasta(fs::path file_path, int seq_num,
 110                      int start_index, int end_index)
 111 {
 112   fs::fstream data_file;
 113   string file_data_line;
 114   int header_counter = 0;
 115   bool read_seq = true;
 116   string rev_comp;
 117   string sequence_raw;
 118   string seq_tmp;             // holds sequence during basic filtering
 119
 120   data_file.open(file_path, ios::in);
 121
 122   if (seq_num == 0) {
 123     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 124   }
 125   if (!data_file)
 126   {
 127     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 128   }
 129   // if file opened okay, read it
 130   else
 131   {
 132     // search for the header of the fasta sequence we want
 133     while ( (!data_file.eof()) && (header_counter < seq_num) )
 134     {
 135       getline(data_file,file_data_line);
 136       if (file_data_line.substr(0,1) == ">")
 137         header_counter++;
 138     }
 139
 140     if (header_counter > 0) {
 141       header = file_data_line.substr(1);
 142
 143       sequence_raw = "";
 144
 145       while ( !data_file.eof() && read_seq ) {
 146         getline(data_file,file_data_line);
 147         if (file_data_line.substr(0,1) == ">")
 148           read_seq = false;
 149         else sequence_raw += file_data_line;
 150       }
 151
 152       // Lastly, if subselection of the sequence was specified we keep cut out
 153       // and only keep that part
 154       // end_index = 0 means no end was specified, so cut to the end
 155       if (end_index == 0)
 156         end_index = sequence_raw.size();
 157
 158       // sequence filtering for upcasing agctn and convert non AGCTN to N
 159       set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 160     } else {
 161       stringstream errormsg;
 162       errormsg << file_path.native_file_string()
 163                << " did not have any fasta sequences" << endl;
 164       throw mussa_load_error(errormsg.str());
 165     }
 166     data_file.close();
 167   }
 168 }
 169
 170 void Sequence::set_filtered_sequence(const string &old_seq,
 171                                      string::size_type start,
 172                                      string::size_type count)
 173 {
 174   char conversionTable[257];
 175
 176   if ( count == 0)
 177     count = old_seq.size() - start;
 178   sequence.clear();
 179   sequence.reserve(count);
 180
 181   // Make a conversion table
 182
 183   // everything we don't specify below will become 'N'
 184   for(int table_i=0; table_i < 256; table_i++)
 185   {
 186     conversionTable[table_i] = 'N';
 187   }
 188   // add end of string character for printing out table for testing purposes
 189   conversionTable[256] = '\0';
 190
 191   // we want these to map to themselves - ie not to change
 192   conversionTable[(int)'A'] = 'A';
 193   conversionTable[(int)'T'] = 'T';
 194   conversionTable[(int)'G'] = 'G';
 195   conversionTable[(int)'C'] = 'C';
 196   // this is to upcase
 197   conversionTable[(int)'a'] = 'A';
 198   conversionTable[(int)'t'] = 'T';
 199   conversionTable[(int)'g'] = 'G';
 200   conversionTable[(int)'c'] = 'C';
 201
 202   // finally, the actual conversion loop
 203   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 204   {
 205     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 206   }
 207 }
 208
 209   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 210   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 211
 212
 213 void
 214 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 215 {
 216   fs::fstream data_file;
 217   string file_data_line;
 218   annot an_annot;
 219   string::size_type space_split_i;
 220   string annot_value;
 221   list<annot>::iterator list_i;
 222   string err_msg;
 223
 224
 225   annots.clear();
 226   data_file.open(file_path, ios::in);
 227
 228   if (!data_file)
 229   {
 230     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 231   }
 232   // if file opened okay, read it
 233   else
 234   {
 235     getline(data_file,file_data_line);
 236     species = file_data_line;
 237
 238     // end_index = 0 means no end was specified, so cut to the end
 239     if (end_index == 0)
 240       end_index = sequence.length();
 241
 242     //cout << "START: " << start_index << " END: " << end_index << endl;
 243
 244     while ( !data_file.eof() )
 245     {
 246       getline(data_file,file_data_line);
 247       if (file_data_line != "")
 248       {
 249         // need to get 4 values...almost same code 4 times...
 250         // get annot start index
 251         space_split_i = file_data_line.find(" ");
 252         annot_value = file_data_line.substr(0,space_split_i);
 253         an_annot.start = atoi (annot_value.c_str());
 254         file_data_line = file_data_line.substr(space_split_i+1);
 255         // get annot end index
 256         space_split_i = file_data_line.find(" ");
 257         annot_value = file_data_line.substr(0,space_split_i);
 258         an_annot.end = atoi (annot_value.c_str());
 259         file_data_line = file_data_line.substr(space_split_i+1);
 260
 261         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 262               //     << endl;
 263
 264         // get annot name
 265         space_split_i = file_data_line.find(" ");
 266         if (space_split_i == string::npos)  // no entries for name & type
 267         {
 268           cout << "seq, annots - no name or type\n";
 269           an_annot.name = "";
 270           an_annot.type = "";
 271         }
 272         else
 273         {
 274           annot_value = file_data_line.substr(0,space_split_i);
 275           an_annot.name = annot_value;
 276           file_data_line = file_data_line.substr(space_split_i+1);
 277           // get annot type
 278           space_split_i = file_data_line.find(" ");
 279           if (space_split_i == string::npos)  // no entry for type
 280             an_annot.type = "";
 281           else
 282           {
 283             annot_value = file_data_line.substr(0,space_split_i);
 284             an_annot.type = annot_value;
 285           }
 286         }
 287
 288
 289         // add annot to list if it falls within the range of sequence specified
 290         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 291         {
 292           an_annot.start -= start_index;
 293           an_annot.end -= start_index;
 294           annots.push_back(an_annot);
 295         }
 296         // else no (or bogus) annotations
 297       }
 298     }
 299
 300     data_file.close();
 301     /*
 302     // debugging check
 303     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 304     {
 305       cout << (*list_i).start << "," << (*list_i).end << "\t";
 306       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 307     }
 308     */
 309   }
 310 }
 311
 312 const std::string& Sequence::get_species() const
 313 {
 314   return species;
 315 }
 316
 317 bool Sequence::empty() const
 318 {
 319   return (size() == 0);
 320 }
 321
 322 const std::list<annot>& Sequence::annotations() const
 323 {
 324   return annots;
 325 }
 326
 327 string::size_type Sequence::length() const
 328 {
 329   return size();
 330 }
 331
 332 string::size_type Sequence::size() const
 333 {
 334   return sequence.size();
 335 }
 336
 337 Sequence::iterator Sequence::begin()
 338 {
 339   return sequence.begin();
 340 }
 341
 342 Sequence::const_iterator Sequence::begin() const
 343 {
 344   return sequence.begin();
 345 }
 346
 347 Sequence::iterator Sequence::end()
 348 {
 349   return sequence.end();
 350 }
 351
 352 Sequence::const_iterator Sequence::end() const
 353 {
 354   return sequence.end();
 355 }
 356
 357
 358 const string&
 359 Sequence::get_seq() const
 360 {
 361   return sequence;
 362 }
 363
 364
 365 string
 366 Sequence::subseq(int start, int end) const
 367 {
 368   return sequence.substr(start, end);
 369 }
 370
 371
 372 const char *
 373 Sequence::c_seq() const
 374 {
 375   return sequence.c_str();
 376 }
 377
 378 string
 379 Sequence::rev_comp() const
 380 {
 381   string rev_comp;
 382   char conversionTable[257];
 383   int seq_i, table_i, len;
 384
 385   len = sequence.length();
 386   rev_comp.reserve(len);
 387   // make a conversion table
 388   // init all parts of conversion table to '~' character
 389   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 390   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 391   // seven generations of their progeny) who decides to make it mean
 392   // something special!!!
 393   // PS - double the curse for any smartass non-biologist who tries it as well
 394   for(table_i=0; table_i < 256; table_i++)
 395   {
 396     conversionTable[table_i] = '~';
 397   }
 398   // add end of string character for printing out table for testing purposes
 399   conversionTable[256] = '\0';
 400
 401   // add in the characters for the bases we want to convert
 402   conversionTable[(int)'A'] = 'T';
 403   conversionTable[(int)'T'] = 'A';
 404   conversionTable[(int)'G'] = 'C';
 405   conversionTable[(int)'C'] = 'G';
 406   conversionTable[(int)'N'] = 'N';
 407
 408   // finally, the actual conversion loop
 409   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 410   {
 411     table_i = (int) sequence[seq_i];
 412     rev_comp += conversionTable[table_i];
 413   }
 414
 415   return rev_comp;
 416 }
 417
 418
 419 const string&
 420 Sequence::get_header() const
 421 {
 422   return header;
 423 }
 424 /*
 425 //FIXME: i don't think this code is callable
 426 string
 427 Sequence::sp_name() const
 428 {
 429   return species;
 430 }
 431 */
 432
 433 void
 434 Sequence::set_seq(const string& a_seq)
 435 {
 436   set_filtered_sequence(a_seq);
 437 }
 438
 439
 440 /*
 441 string
 442 Sequence::species()
 443 {
 444   return species;
 445 }
 446 */
 447
 448 void
 449 Sequence::clear()
 450 {
 451   sequence = "";
 452   header = "";
 453   species = "";
 454   annots.clear();
 455 }
 456
 457 void
 458 Sequence::save(fs::fstream &save_file)
 459                //string save_file_path)
 460 {
 461   //fstream save_file;
 462   list<annot>::iterator annots_i;
 463
 464   // not sure why, or if i'm doing something wrong, but can't seem to pass
 465   // file pointers down to this method from the mussa control class
 466   // so each call to save a sequence appends to the file started by mussa_class
 467   //save_file.open(save_file_path.c_str(), ios::app);
 468
 469   save_file << "<Sequence>" << endl;
 470   save_file << sequence << endl;
 471   save_file << "</Sequence>" << endl;
 472
 473   save_file << "<Annotations>" << endl;
 474   save_file << species << endl;
 475   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 476   {
 477     save_file << annots_i->start << " " << annots_i->end << " " ;
 478     save_file << annots_i->name << " " << annots_i->type << endl;
 479   }
 480   save_file << "</Annotations>" << endl;
 481   //save_file.close();
 482 }
 483
 484 void
 485 Sequence::load_museq(fs::path load_file_path, int seq_num)
 486 {
 487   fs::fstream load_file;
 488   string file_data_line;
 489   int seq_counter;
 490   annot an_annot;
 491   string::size_type space_split_i;
 492   string annot_value;
 493
 494   annots.clear();
 495   load_file.open(load_file_path, ios::in);
 496
 497   seq_counter = 0;
 498   // search for the seq_num-th sequence
 499   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 500   {
 501     getline(load_file,file_data_line);
 502     if (file_data_line == "<Sequence>")
 503       seq_counter++;
 504   }
 505   getline(load_file, file_data_line);
 506   sequence = file_data_line;
 507   getline(load_file, file_data_line);
 508   getline(load_file, file_data_line);
 509   if (file_data_line == "<Annotations>")
 510   {
 511     getline(load_file, file_data_line);
 512     species = file_data_line;
 513     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 514     {
 515       getline(load_file,file_data_line);
 516       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 517       {
 518         // need to get 4 values...almost same code 4 times...
 519         // get annot start index
 520         space_split_i = file_data_line.find(" ");
 521         annot_value = file_data_line.substr(0,space_split_i);
 522         an_annot.start = atoi (annot_value.c_str());
 523         file_data_line = file_data_line.substr(space_split_i+1);
 524         // get annot end index
 525         space_split_i = file_data_line.find(" ");
 526         annot_value = file_data_line.substr(0,space_split_i);
 527         an_annot.end = atoi (annot_value.c_str());
 528
 529         if (space_split_i == string::npos)  // no entry for type or name
 530         {
 531           cout << "seq, annots - no type or name\n";
 532           an_annot.type = "";
 533           an_annot.name = "";
 534         }
 535         else   // else get annot type
 536         {
 537           file_data_line = file_data_line.substr(space_split_i+1);
 538           space_split_i = file_data_line.find(" ");
 539           annot_value = file_data_line.substr(0,space_split_i);
 540           an_annot.type = annot_value;
 541           if (space_split_i == string::npos)  // no entry for name
 542           {
 543             cout << "seq, annots - no name\n";
 544             an_annot.name = "";
 545           }
 546           else          // get annot name
 547           {
 548             file_data_line = file_data_line.substr(space_split_i+1);
 549             space_split_i = file_data_line.find(" ");
 550             annot_value = file_data_line.substr(0,space_split_i);
 551             an_annot.type = annot_value;
 552           }
 553         }
 554         annots.push_back(an_annot);  // don't forget to actually add the annot
 555       }
 556       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 557       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 558     }
 559   }
 560   load_file.close();
 561 }
 562
 563
 564 string
 565 Sequence::rc_motif(string a_motif)
 566 {
 567   string rev_comp;
 568   char conversionTable[257];
 569   int seq_i, table_i, len;
 570
 571   len = a_motif.length();
 572   rev_comp.reserve(len);
 573
 574   for(table_i=0; table_i < 256; table_i++)
 575   {
 576     conversionTable[table_i] = '~';
 577   }
 578   // add end of string character for printing out table for testing purposes
 579   conversionTable[256] = '\0';
 580
 581   // add in the characters for the bases we want to convert (IUPAC)
 582   conversionTable[(int)'A'] = 'T';
 583   conversionTable[(int)'T'] = 'A';
 584   conversionTable[(int)'G'] = 'C';
 585   conversionTable[(int)'C'] = 'G';
 586   conversionTable[(int)'N'] = 'N';
 587   conversionTable[(int)'M'] = 'K';
 588   conversionTable[(int)'R'] = 'Y';
 589   conversionTable[(int)'W'] = 'W';
 590   conversionTable[(int)'S'] = 'S';
 591   conversionTable[(int)'Y'] = 'R';
 592   conversionTable[(int)'K'] = 'M';
 593   conversionTable[(int)'V'] = 'B';
 594   conversionTable[(int)'H'] = 'D';
 595   conversionTable[(int)'D'] = 'H';
 596   conversionTable[(int)'B'] = 'V';
 597
 598   // finally, the actual conversion loop
 599   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 600   {
 601     //cout << "** i = " << seq_i << " bp = " <<
 602     table_i = (int) a_motif[seq_i];
 603     rev_comp += conversionTable[table_i];
 604   }
 605
 606   //cout << "seq: " << a_motif << endl;
 607   //cout << "rc:  " << rev_comp << endl;
 608
 609   return rev_comp;
 610 }
 611
 612 string
 613 Sequence::motif_normalize(string a_motif)
 614 {
 615   string valid_motif;
 616   int seq_i, len;
 617
 618   len = a_motif.length();
 619   valid_motif.reserve(len);
 620
 621   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 622   // current nonIUPAC symbols are omitted, which is not reported atm
 623   for(seq_i = 0; seq_i < len; seq_i++)
 624   {
 625     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 626       valid_motif += 'A';
 627     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 628       valid_motif += 'T';
 629     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 630       valid_motif += 'G';
 631     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 632       valid_motif += 'C';
 633     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 634       valid_motif += 'N';
 635     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 636       valid_motif += 'M';
 637     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 638       valid_motif += 'R';
 639     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 640       valid_motif += 'W';
 641     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 642       valid_motif += 'S';
 643     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 644       valid_motif += 'Y';
 645     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 646       valid_motif += 'G';
 647     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 648       valid_motif += 'V';
 649     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 650       valid_motif += 'H';
 651     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 652       valid_motif += 'D';
 653     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 654       valid_motif += 'B';
 655     else {
 656       string msg = "Letter ";
 657       msg += a_motif[seq_i];
 658       msg += " is not a valid IUPAC symbol";
 659       throw motif_normalize_error(msg);
 660     }
 661   }
 662   //cout << "valid_motif is: " << valid_motif << endl;
 663   return valid_motif;
 664 }
 665
 666 void Sequence::add_motif(string a_motif)
 667 {
 668   vector<int> motif_starts = find_motif(a_motif);
 669
 670   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 671       motif_start_i != motif_starts.end();
 672       ++motif_start_i)
 673   {
 674     motif_list.push_back(motif(*motif_start_i, a_motif));
 675   }
 676 }
 677
 678 void Sequence::clear_motifs()
 679 {
 680   motif_list.clear();
 681 }
 682
 683 const list<motif>& Sequence::motifs() const
 684 {
 685   return motif_list;
 686 }
 687
 688 vector<int>
 689 Sequence::find_motif(string a_motif)
 690 {
 691   vector<int> motif_match_starts;
 692   string a_motif_rc;
 693
 694   motif_match_starts.clear();
 695
 696   //cout << "motif is: " << a_motif << endl;
 697   a_motif = motif_normalize(a_motif);
 698   //cout << "motif is: " << a_motif << endl;
 699
 700   if (a_motif != "")
 701   {
 702     //cout << "Sequence: none blank motif\n";
 703     motif_scan(a_motif, &motif_match_starts);
 704
 705     a_motif_rc = rc_motif(a_motif);
 706     // make sure not to do search again if it is a palindrome
 707     if (a_motif_rc != a_motif)
 708       motif_scan(a_motif_rc, &motif_match_starts);
 709   }
 710   return motif_match_starts;
 711 }
 712
 713 void
 714 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 715 {
 716   char * seq_c;
 717   string::size_type seq_i;
 718   int motif_i, motif_len;
 719
 720   // faster to loop thru the sequence as a old c string (ie char array)
 721   seq_c = (char*)sequence.c_str();
 722   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 723   motif_len = a_motif.length();
 724
 725   //cout << "motif_length: " << motif_len << endl;
 726   //cout << "RAAARRRRR\n";
 727
 728   motif_i = 0;
 729
 730   //cout << "motif: " << a_motif << endl;
 731
 732   //cout << "Sequence: motif, length= " << length << endl;
 733   seq_i = 0;
 734   while (seq_i < sequence.length())
 735   {
 736     //cout << seq_c[seq_i];
 737     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 738     // this is pretty much a straight translation of Nora's python code
 739     // to match iupac letter codes
 740     if (a_motif[motif_i] =='N')
 741       motif_i++;
 742     else if (a_motif[motif_i] == seq_c[seq_i])
 743       motif_i++;
 744     else if ((a_motif[motif_i] =='M') &&
 745              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 746       motif_i++;
 747     else if ((a_motif[motif_i] =='R') &&
 748              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 749       motif_i++;
 750     else if ((a_motif[motif_i] =='W') &&
 751              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 752       motif_i++;
 753     else if ((a_motif[motif_i] =='S') &&
 754              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 755       motif_i++;
 756     else if ((a_motif[motif_i] =='Y') &&
 757              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 758       motif_i++;
 759     else if ((a_motif[motif_i] =='K') &&
 760              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 761       motif_i++;
 762     else if ((a_motif[motif_i] =='V') &&
 763              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 764               (seq_c[seq_i]=='G')))
 765       motif_i++;
 766     else if ((a_motif[seq_i] =='H') &&
 767              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 768               (seq_c[seq_i]=='T')))
 769       motif_i++;
 770     else if ((a_motif[motif_i] =='D') &&
 771              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 772               (seq_c[seq_i]=='T')))
 773       motif_i++;
 774     else if ((a_motif[motif_i] =='B') &&
 775              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 776               (seq_c[seq_i]=='T')))
 777       motif_i++;
 778
 779     else
 780     {
 781       seq_i -= motif_i;
 782       motif_i = 0;
 783     }
 784
 785     // end Nora stuff, now we see if a match is found this pass
 786     if (motif_i == motif_len)
 787     {
 788       //cout << "!!";
 789       annot new_motif;
 790       motif_match_starts->push_back(seq_i - motif_len + 1);
 791       motif_i = 0;
 792     }
 793
 794     seq_i++;
 795   }
 796   //cout << endl;
 797 }
 798