alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25
  26 #include "alg/sequence.hpp"
  27 #include "mussa_exceptions.hpp"
  28
  29 #include <string>
  30 #include <iostream>
  31
  32 namespace fs = boost::filesystem;
  33 using namespace std;
  34
  35 annot::annot()
  36  : start(0),
  37    end(0),
  38    type(""),
  39    name("")
  40 {
  41 }
  42
  43 annot::annot(int start, int end, std::string type, std::string name)
  44  : start(start),
  45    end(end),
  46    type(type),
  47    name(name)
  48 {
  49 }
  50
  51 motif::motif(int start, std::string motif)
  52  : annot(start, start+motif.size(), "motif", motif),
  53    sequence(motif)
  54 {
  55 }
  56
  57 Sequence::Sequence()
  58   : sequence(""),
  59     header(""),
  60     species("")
  61 {
  62   annots.clear();
  63   motif_list.clear();
  64 }
  65
  66 Sequence::Sequence(string seq)
  67 {
  68   set_filtered_sequence(seq);
  69 }
  70
  71 Sequence &Sequence::operator=(const Sequence& s)
  72 {
  73   if (this != &s) {
  74     sequence = s.sequence;
  75     header = s.header;
  76     species = s.species;
  77     annots = s.annots;
  78   }
  79   return *this;
  80 }
  81
  82 Sequence &Sequence::operator=(const std::string& s)
  83 {
  84   set_filtered_sequence(s);
  85   return *this;
  86 }
  87
  88 char Sequence::operator[](int index) const
  89 {
  90   return sequence[index];
  91 }
  92
  93 ostream& operator<<(ostream& out, const Sequence& seq)
  94 {
  95   out << "Sequence(" << seq.get_seq() << ")";
  96   return out;
  97 }
  98
  99 //! load a fasta file into a sequence
 100 /*!
 101  * \param file_path the location of the fasta file in the filesystem
 102  * \param seq_num which sequence in the file to load
 103  * \param start_index starting position in the fasta sequence, 0 for beginning
 104  * \param end_index ending position in the fasta sequence, 0 for end
 105  * \return error message, empty string if no error. (gag!)
 106  */
 107 void
 108 Sequence::load_fasta(fs::path file_path, int seq_num,
 109                      int start_index, int end_index)
 110 {
 111   fs::fstream data_file;
 112   string file_data_line;
 113   int header_counter = 0;
 114   bool read_seq = true;
 115   string rev_comp;
 116   string sequence_raw;
 117   string seq_tmp;             // holds sequence during basic filtering
 118
 119   data_file.open(file_path, ios::in);
 120
 121   if (seq_num == 0) {
 122     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 123   }
 124   if (!data_file)
 125   {
 126     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 127   }
 128   // if file opened okay, read it
 129   else
 130   {
 131     // search for the header of the fasta sequence we want
 132     while ( (!data_file.eof()) && (header_counter < seq_num) )
 133     {
 134       getline(data_file,file_data_line);
 135       if (file_data_line.substr(0,1) == ">")
 136         header_counter++;
 137     }
 138
 139     if (header_counter > 0) {
 140       header = file_data_line.substr(1);
 141
 142       sequence_raw = "";
 143
 144       while ( !data_file.eof() && read_seq ) {
 145         getline(data_file,file_data_line);
 146         if (file_data_line.substr(0,1) == ">")
 147           read_seq = false;
 148         else sequence_raw += file_data_line;
 149       }
 150
 151       // Lastly, if subselection of the sequence was specified we keep cut out
 152       // and only keep that part
 153       // end_index = 0 means no end was specified, so cut to the end
 154       if (end_index == 0)
 155         end_index = sequence_raw.size();
 156
 157       // sequence filtering for upcasing agctn and convert non AGCTN to N
 158       set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 159     } else {
 160       throw mussa_load_error("%s did not have a fasta header");
 161     }
 162     data_file.close();
 163   }
 164 }
 165
 166 void Sequence::set_filtered_sequence(const string &old_seq,
 167                                      string::size_type start,
 168                                      string::size_type count)
 169 {
 170   char conversionTable[257];
 171
 172   if ( count == 0)
 173     count = old_seq.size() - start;
 174   sequence.clear();
 175   sequence.reserve(count);
 176
 177   // Make a conversion table
 178
 179   // everything we don't specify below will become 'N'
 180   for(int table_i=0; table_i < 256; table_i++)
 181   {
 182     conversionTable[table_i] = 'N';
 183   }
 184   // add end of string character for printing out table for testing purposes
 185   conversionTable[256] = '\0';
 186
 187   // we want these to map to themselves - ie not to change
 188   conversionTable[(int)'A'] = 'A';
 189   conversionTable[(int)'T'] = 'T';
 190   conversionTable[(int)'G'] = 'G';
 191   conversionTable[(int)'C'] = 'C';
 192   // this is to upcase
 193   conversionTable[(int)'a'] = 'A';
 194   conversionTable[(int)'t'] = 'T';
 195   conversionTable[(int)'g'] = 'G';
 196   conversionTable[(int)'c'] = 'C';
 197
 198   // finally, the actual conversion loop
 199   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 200   {
 201     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 202   }
 203 }
 204
 205   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 206   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 207
 208
 209 void
 210 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 211 {
 212   fs::fstream data_file;
 213   string file_data_line;
 214   annot an_annot;
 215   string::size_type space_split_i;
 216   string annot_value;
 217   list<annot>::iterator list_i;
 218   string err_msg;
 219
 220
 221   annots.clear();
 222   data_file.open(file_path, ios::in);
 223
 224   if (!data_file)
 225   {
 226     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 227   }
 228   // if file opened okay, read it
 229   else
 230   {
 231     getline(data_file,file_data_line);
 232     species = file_data_line;
 233
 234     // end_index = 0 means no end was specified, so cut to the end
 235     if (end_index == 0)
 236       end_index = sequence.length();
 237
 238     //cout << "START: " << start_index << " END: " << end_index << endl;
 239
 240     while ( !data_file.eof() )
 241     {
 242       getline(data_file,file_data_line);
 243       if (file_data_line != "")
 244       {
 245         // need to get 4 values...almost same code 4 times...
 246         // get annot start index
 247         space_split_i = file_data_line.find(" ");
 248         annot_value = file_data_line.substr(0,space_split_i);
 249         an_annot.start = atoi (annot_value.c_str());
 250         file_data_line = file_data_line.substr(space_split_i+1);
 251         // get annot end index
 252         space_split_i = file_data_line.find(" ");
 253         annot_value = file_data_line.substr(0,space_split_i);
 254         an_annot.end = atoi (annot_value.c_str());
 255         file_data_line = file_data_line.substr(space_split_i+1);
 256
 257         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 258               //     << endl;
 259
 260         // get annot name
 261         space_split_i = file_data_line.find(" ");
 262         if (space_split_i == string::npos)  // no entries for name & type
 263         {
 264           cout << "seq, annots - no name or type\n";
 265           an_annot.name = "";
 266           an_annot.type = "";
 267         }
 268         else
 269         {
 270           annot_value = file_data_line.substr(0,space_split_i);
 271           an_annot.name = annot_value;
 272           file_data_line = file_data_line.substr(space_split_i+1);
 273           // get annot type
 274           space_split_i = file_data_line.find(" ");
 275           if (space_split_i == string::npos)  // no entry for type
 276             an_annot.type = "";
 277           else
 278           {
 279             annot_value = file_data_line.substr(0,space_split_i);
 280             an_annot.type = annot_value;
 281           }
 282         }
 283
 284
 285         // add annot to list if it falls within the range of sequence specified
 286         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 287         {
 288           an_annot.start -= start_index;
 289           an_annot.end -= start_index;
 290           annots.push_back(an_annot);
 291         }
 292         // else no (or bogus) annotations
 293       }
 294     }
 295
 296     data_file.close();
 297     /*
 298     // debugging check
 299     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 300     {
 301       cout << (*list_i).start << "," << (*list_i).end << "\t";
 302       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 303     }
 304     */
 305   }
 306 }
 307
 308 const std::string& Sequence::get_species() const
 309 {
 310   return species;
 311 }
 312
 313 bool Sequence::empty() const
 314 {
 315   return (size() == 0);
 316 }
 317
 318 const std::list<annot>& Sequence::annotations() const
 319 {
 320   return annots;
 321 }
 322
 323 string::size_type Sequence::length() const
 324 {
 325   return size();
 326 }
 327
 328 string::size_type Sequence::size() const
 329 {
 330   return sequence.size();
 331 }
 332
 333 Sequence::iterator Sequence::begin()
 334 {
 335   return sequence.begin();
 336 }
 337
 338 Sequence::const_iterator Sequence::begin() const
 339 {
 340   return sequence.begin();
 341 }
 342
 343 Sequence::iterator Sequence::end()
 344 {
 345   return sequence.end();
 346 }
 347
 348 Sequence::const_iterator Sequence::end() const
 349 {
 350   return sequence.end();
 351 }
 352
 353
 354 const string&
 355 Sequence::get_seq() const
 356 {
 357   return sequence;
 358 }
 359
 360
 361 string
 362 Sequence::subseq(int start, int end) const
 363 {
 364   return sequence.substr(start, end);
 365 }
 366
 367
 368 const char *
 369 Sequence::c_seq() const
 370 {
 371   return sequence.c_str();
 372 }
 373
 374 string
 375 Sequence::rev_comp() const
 376 {
 377   string rev_comp;
 378   char conversionTable[257];
 379   int seq_i, table_i, len;
 380
 381   len = sequence.length();
 382   rev_comp.reserve(len);
 383   // make a conversion table
 384   // init all parts of conversion table to '~' character
 385   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 386   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 387   // seven generations of their progeny) who decides to make it mean
 388   // something special!!!
 389   // PS - double the curse for any smartass non-biologist who tries it as well
 390   for(table_i=0; table_i < 256; table_i++)
 391   {
 392     conversionTable[table_i] = '~';
 393   }
 394   // add end of string character for printing out table for testing purposes
 395   conversionTable[256] = '\0';
 396
 397   // add in the characters for the bases we want to convert
 398   conversionTable[(int)'A'] = 'T';
 399   conversionTable[(int)'T'] = 'A';
 400   conversionTable[(int)'G'] = 'C';
 401   conversionTable[(int)'C'] = 'G';
 402   conversionTable[(int)'N'] = 'N';
 403
 404   // finally, the actual conversion loop
 405   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 406   {
 407     table_i = (int) sequence[seq_i];
 408     rev_comp += conversionTable[table_i];
 409   }
 410
 411   return rev_comp;
 412 }
 413
 414
 415 const string&
 416 Sequence::get_header() const
 417 {
 418   return header;
 419 }
 420 /*
 421 //FIXME: i don't think this code is callable
 422 string
 423 Sequence::sp_name() const
 424 {
 425   return species;
 426 }
 427 */
 428
 429 void
 430 Sequence::set_seq(const string& a_seq)
 431 {
 432   set_filtered_sequence(a_seq);
 433 }
 434
 435
 436 /*
 437 string
 438 Sequence::species()
 439 {
 440   return species;
 441 }
 442 */
 443
 444 void
 445 Sequence::clear()
 446 {
 447   sequence = "";
 448   header = "";
 449   species = "";
 450   annots.clear();
 451 }
 452
 453 void
 454 Sequence::save(fs::fstream &save_file)
 455                //string save_file_path)
 456 {
 457   //fstream save_file;
 458   list<annot>::iterator annots_i;
 459
 460   // not sure why, or if i'm doing something wrong, but can't seem to pass
 461   // file pointers down to this method from the mussa control class
 462   // so each call to save a sequence appends to the file started by mussa_class
 463   //save_file.open(save_file_path.c_str(), ios::app);
 464
 465   save_file << "<Sequence>" << endl;
 466   save_file << sequence << endl;
 467   save_file << "</Sequence>" << endl;
 468
 469   save_file << "<Annotations>" << endl;
 470   save_file << species << endl;
 471   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 472   {
 473     save_file << annots_i->start << " " << annots_i->end << " " ;
 474     save_file << annots_i->name << " " << annots_i->type << endl;
 475   }
 476   save_file << "</Annotations>" << endl;
 477   //save_file.close();
 478 }
 479
 480 void
 481 Sequence::load_museq(fs::path load_file_path, int seq_num)
 482 {
 483   fs::fstream load_file;
 484   string file_data_line;
 485   int seq_counter;
 486   annot an_annot;
 487   string::size_type space_split_i;
 488   string annot_value;
 489
 490   annots.clear();
 491   load_file.open(load_file_path, ios::in);
 492
 493   seq_counter = 0;
 494   // search for the seq_num-th sequence
 495   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 496   {
 497     getline(load_file,file_data_line);
 498     if (file_data_line == "<Sequence>")
 499       seq_counter++;
 500   }
 501   getline(load_file, file_data_line);
 502   sequence = file_data_line;
 503   getline(load_file, file_data_line);
 504   getline(load_file, file_data_line);
 505   if (file_data_line == "<Annotations>")
 506   {
 507     getline(load_file, file_data_line);
 508     species = file_data_line;
 509     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 510     {
 511       getline(load_file,file_data_line);
 512       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 513       {
 514         // need to get 4 values...almost same code 4 times...
 515         // get annot start index
 516         space_split_i = file_data_line.find(" ");
 517         annot_value = file_data_line.substr(0,space_split_i);
 518         an_annot.start = atoi (annot_value.c_str());
 519         file_data_line = file_data_line.substr(space_split_i+1);
 520         // get annot end index
 521         space_split_i = file_data_line.find(" ");
 522         annot_value = file_data_line.substr(0,space_split_i);
 523         an_annot.end = atoi (annot_value.c_str());
 524
 525         if (space_split_i == string::npos)  // no entry for type or name
 526         {
 527           cout << "seq, annots - no type or name\n";
 528           an_annot.type = "";
 529           an_annot.name = "";
 530         }
 531         else   // else get annot type
 532         {
 533           file_data_line = file_data_line.substr(space_split_i+1);
 534           space_split_i = file_data_line.find(" ");
 535           annot_value = file_data_line.substr(0,space_split_i);
 536           an_annot.type = annot_value;
 537           if (space_split_i == string::npos)  // no entry for name
 538           {
 539             cout << "seq, annots - no name\n";
 540             an_annot.name = "";
 541           }
 542           else          // get annot name
 543           {
 544             file_data_line = file_data_line.substr(space_split_i+1);
 545             space_split_i = file_data_line.find(" ");
 546             annot_value = file_data_line.substr(0,space_split_i);
 547             an_annot.type = annot_value;
 548           }
 549         }
 550         annots.push_back(an_annot);  // don't forget to actually add the annot
 551       }
 552       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 553       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 554     }
 555   }
 556   load_file.close();
 557 }
 558
 559
 560 string
 561 Sequence::rc_motif(string a_motif)
 562 {
 563   string rev_comp;
 564   char conversionTable[257];
 565   int seq_i, table_i, len;
 566
 567   len = a_motif.length();
 568   rev_comp.reserve(len);
 569
 570   for(table_i=0; table_i < 256; table_i++)
 571   {
 572     conversionTable[table_i] = '~';
 573   }
 574   // add end of string character for printing out table for testing purposes
 575   conversionTable[256] = '\0';
 576
 577   // add in the characters for the bases we want to convert (IUPAC)
 578   conversionTable[(int)'A'] = 'T';
 579   conversionTable[(int)'T'] = 'A';
 580   conversionTable[(int)'G'] = 'C';
 581   conversionTable[(int)'C'] = 'G';
 582   conversionTable[(int)'N'] = 'N';
 583   conversionTable[(int)'M'] = 'K';
 584   conversionTable[(int)'R'] = 'Y';
 585   conversionTable[(int)'W'] = 'W';
 586   conversionTable[(int)'S'] = 'S';
 587   conversionTable[(int)'Y'] = 'R';
 588   conversionTable[(int)'K'] = 'M';
 589   conversionTable[(int)'V'] = 'B';
 590   conversionTable[(int)'H'] = 'D';
 591   conversionTable[(int)'D'] = 'H';
 592   conversionTable[(int)'B'] = 'V';
 593
 594   // finally, the actual conversion loop
 595   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 596   {
 597     //cout << "** i = " << seq_i << " bp = " <<
 598     table_i = (int) a_motif[seq_i];
 599     rev_comp += conversionTable[table_i];
 600   }
 601
 602   //cout << "seq: " << a_motif << endl;
 603   //cout << "rc:  " << rev_comp << endl;
 604
 605   return rev_comp;
 606 }
 607
 608 string
 609 Sequence::motif_normalize(string a_motif)
 610 {
 611   string valid_motif;
 612   int seq_i, len;
 613
 614   len = a_motif.length();
 615   valid_motif.reserve(len);
 616
 617   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 618   // current nonIUPAC symbols are omitted, which is not reported atm
 619   for(seq_i = 0; seq_i < len; seq_i++)
 620   {
 621     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 622       valid_motif += 'A';
 623     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 624       valid_motif += 'T';
 625     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 626       valid_motif += 'G';
 627     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 628       valid_motif += 'C';
 629     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 630       valid_motif += 'N';
 631     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 632       valid_motif += 'M';
 633     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 634       valid_motif += 'R';
 635     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 636       valid_motif += 'W';
 637     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 638       valid_motif += 'S';
 639     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 640       valid_motif += 'Y';
 641     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 642       valid_motif += 'G';
 643     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 644       valid_motif += 'V';
 645     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 646       valid_motif += 'H';
 647     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 648       valid_motif += 'D';
 649     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 650       valid_motif += 'B';
 651     else {
 652       string msg = "Letter ";
 653       msg += a_motif[seq_i];
 654       msg += " is not a valid IUPAC symbol";
 655       throw motif_normalize_error(msg);
 656     }
 657   }
 658   //cout << "valid_motif is: " << valid_motif << endl;
 659   return valid_motif;
 660 }
 661
 662 void Sequence::add_motif(string a_motif)
 663 {
 664   vector<int> motif_starts = find_motif(a_motif);
 665
 666   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 667       motif_start_i != motif_starts.end();
 668       ++motif_start_i)
 669   {
 670     motif_list.push_back(motif(*motif_start_i, a_motif));
 671   }
 672 }
 673
 674 void Sequence::clear_motifs()
 675 {
 676   motif_list.clear();
 677 }
 678
 679 const list<motif>& Sequence::motifs() const
 680 {
 681   return motif_list;
 682 }
 683
 684 vector<int>
 685 Sequence::find_motif(string a_motif)
 686 {
 687   vector<int> motif_match_starts;
 688   string a_motif_rc;
 689
 690   motif_match_starts.clear();
 691
 692   //cout << "motif is: " << a_motif << endl;
 693   a_motif = motif_normalize(a_motif);
 694   //cout << "motif is: " << a_motif << endl;
 695
 696   if (a_motif != "")
 697   {
 698     //cout << "Sequence: none blank motif\n";
 699     motif_scan(a_motif, &motif_match_starts);
 700
 701     a_motif_rc = rc_motif(a_motif);
 702     // make sure not to do search again if it is a palindrome
 703     if (a_motif_rc != a_motif)
 704       motif_scan(a_motif_rc, &motif_match_starts);
 705   }
 706   return motif_match_starts;
 707 }
 708
 709 void
 710 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 711 {
 712   char * seq_c;
 713   string::size_type seq_i;
 714   int motif_i, motif_len;
 715
 716   // faster to loop thru the sequence as a old c string (ie char array)
 717   seq_c = (char*)sequence.c_str();
 718   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 719   motif_len = a_motif.length();
 720
 721   //cout << "motif_length: " << motif_len << endl;
 722   //cout << "RAAARRRRR\n";
 723
 724   motif_i = 0;
 725
 726   //cout << "motif: " << a_motif << endl;
 727
 728   //cout << "Sequence: motif, length= " << length << endl;
 729   seq_i = 0;
 730   while (seq_i < sequence.length())
 731   {
 732     //cout << seq_c[seq_i];
 733     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 734     // this is pretty much a straight translation of Nora's python code
 735     // to match iupac letter codes
 736     if (a_motif[motif_i] =='N')
 737       motif_i++;
 738     else if (a_motif[motif_i] == seq_c[seq_i])
 739       motif_i++;
 740     else if ((a_motif[motif_i] =='M') &&
 741              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 742       motif_i++;
 743     else if ((a_motif[motif_i] =='R') &&
 744              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 745       motif_i++;
 746     else if ((a_motif[motif_i] =='W') &&
 747              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 748       motif_i++;
 749     else if ((a_motif[motif_i] =='S') &&
 750              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 751       motif_i++;
 752     else if ((a_motif[motif_i] =='Y') &&
 753              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 754       motif_i++;
 755     else if ((a_motif[motif_i] =='K') &&
 756              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 757       motif_i++;
 758     else if ((a_motif[motif_i] =='V') &&
 759              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 760               (seq_c[seq_i]=='G')))
 761       motif_i++;
 762     else if ((a_motif[seq_i] =='H') &&
 763              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 764               (seq_c[seq_i]=='T')))
 765       motif_i++;
 766     else if ((a_motif[motif_i] =='D') &&
 767              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 768               (seq_c[seq_i]=='T')))
 769       motif_i++;
 770     else if ((a_motif[motif_i] =='B') &&
 771              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 772               (seq_c[seq_i]=='T')))
 773       motif_i++;
 774
 775     else
 776     {
 777       seq_i -= motif_i;
 778       motif_i = 0;
 779     }
 780
 781     // end Nora stuff, now we see if a match is found this pass
 782     if (motif_i == motif_len)
 783     {
 784       //cout << "!!";
 785       annot new_motif;
 786       motif_match_starts->push_back(seq_i - motif_len + 1);
 787       motif_i = 0;
 788     }
 789
 790     seq_i++;
 791   }
 792   //cout << endl;
 793 }
 794