alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "alg/sequence.hpp"
  26 #include "mussa_exceptions.hpp"
  27
  28 #include <string>
  29 #include <iostream>
  30
  31 using namespace std;
  32
  33 annot::annot()
  34  : start(0),
  35    end(0),
  36    type(""),
  37    name("")
  38 {
  39 }
  40
  41 annot::annot(int start, int end, std::string type, std::string name)
  42  : start(start),
  43    end(end),
  44    type(type),
  45    name(name)
  46 {
  47 }
  48
  49 motif::motif(int start, std::string motif)
  50  : annot(start, start+motif.size(), "motif", motif),
  51    sequence(motif)
  52 {
  53 }
  54
  55 Sequence::Sequence()
  56   : sequence(""),
  57     header(""),
  58     species("")
  59 {
  60   annots.clear();
  61   motif_list.clear();
  62 }
  63
  64 Sequence::Sequence(string seq)
  65 {
  66   set_filtered_sequence(seq);
  67 }
  68
  69 Sequence &Sequence::operator=(const Sequence& s)
  70 {
  71   if (this != &s) {
  72     sequence = s.sequence;
  73     header = s.header;
  74     species = s.species;
  75     annots = s.annots;
  76   }
  77   return *this;
  78 }
  79
  80 Sequence &Sequence::operator=(const std::string& s)
  81 {
  82   set_filtered_sequence(s);
  83   return *this;
  84 }
  85
  86 char Sequence::operator[](int index) const
  87 {
  88   return sequence[index];
  89 }
  90
  91 ostream& operator<<(ostream& out, const Sequence& seq)
  92 {
  93   out << "Sequence(" << seq.get_seq() << ")";
  94   return out;
  95 }
  96
  97 //! load a fasta file into a sequence
  98 /*!
  99  * \param file_path the location of the fasta file in the filesystem
 100  * \param seq_num which sequence in the file to load
 101  * \param start_index starting position in the fasta sequence, 0 for beginning
 102  * \param end_index ending position in the fasta sequence, 0 for end
 103  * \return error message, empty string if no error. (gag!)
 104  */
 105 void
 106 Sequence::load_fasta(string file_path, int seq_num,
 107                      int start_index, int end_index)
 108 {
 109   fstream data_file;
 110   string file_data_line;
 111   int header_counter = 0;
 112   bool read_seq = true;
 113   string rev_comp;
 114   string sequence_raw;
 115   string seq_tmp;             // holds sequence during basic filtering
 116
 117   data_file.open(file_path.c_str(), ios::in);
 118
 119   if (seq_num == 0) {
 120     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 121   }
 122   if (!data_file)
 123   {
 124     throw mussa_load_error("Sequence File: " + file_path + " not found");
 125   }
 126   // if file opened okay, read it
 127   else
 128   {
 129     // search for the header of the fasta sequence we want
 130     while ( (!data_file.eof()) && (header_counter < seq_num) )
 131     {
 132       getline(data_file,file_data_line);
 133       if (file_data_line.substr(0,1) == ">")
 134         header_counter++;
 135     }
 136
 137     header = file_data_line.substr(1);
 138
 139     sequence_raw = "";
 140
 141     while ( !data_file.eof() && read_seq )
 142     {
 143       getline(data_file,file_data_line);
 144       if (file_data_line.substr(0,1) == ">")
 145         read_seq = false;
 146       else sequence_raw += file_data_line;
 147     }
 148
 149     data_file.close();
 150
 151     // Lastly, if subselection of the sequence was specified we keep cut out
 152     // and only keep that part
 153     // end_index = 0 means no end was specified, so cut to the end
 154     if (end_index == 0)
 155       end_index = sequence_raw.size();
 156
 157     // sequence filtering for upcasing agctn and convert non AGCTN to N
 158     set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 159   }
 160 }
 161
 162 void Sequence::set_filtered_sequence(const string &old_seq,
 163                                      string::size_type start,
 164                                      string::size_type count)
 165 {
 166   char conversionTable[257];
 167
 168   if ( count == 0)
 169     count = old_seq.size() - start;
 170   sequence.clear();
 171   sequence.reserve(count);
 172
 173   // Make a conversion table
 174
 175   // everything we don't specify below will become 'N'
 176   for(int table_i=0; table_i < 256; table_i++)
 177   {
 178     conversionTable[table_i] = 'N';
 179   }
 180   // add end of string character for printing out table for testing purposes
 181   conversionTable[256] = '\0';
 182
 183   // we want these to map to themselves - ie not to change
 184   conversionTable[(int)'A'] = 'A';
 185   conversionTable[(int)'T'] = 'T';
 186   conversionTable[(int)'G'] = 'G';
 187   conversionTable[(int)'C'] = 'C';
 188   // this is to upcase
 189   conversionTable[(int)'a'] = 'A';
 190   conversionTable[(int)'t'] = 'T';
 191   conversionTable[(int)'g'] = 'G';
 192   conversionTable[(int)'c'] = 'C';
 193
 194   // finally, the actual conversion loop
 195   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 196   {
 197     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 198   }
 199 }
 200
 201   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 202   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 203
 204
 205 void
 206 Sequence::load_annot(string file_path, int start_index, int end_index)
 207 {
 208   fstream data_file;
 209   string file_data_line;
 210   annot an_annot;
 211   string::size_type space_split_i;
 212   string annot_value;
 213   list<annot>::iterator list_i;
 214   string err_msg;
 215
 216
 217   annots.clear();
 218   data_file.open(file_path.c_str(), ios::in);
 219
 220   if (!data_file)
 221   {
 222     throw mussa_load_error("Sequence File: " + file_path + " not found");
 223   }
 224   // if file opened okay, read it
 225   else
 226   {
 227     getline(data_file,file_data_line);
 228     species = file_data_line;
 229
 230     // end_index = 0 means no end was specified, so cut to the end
 231     if (end_index == 0)
 232       end_index = sequence.length();
 233
 234     //cout << "START: " << start_index << " END: " << end_index << endl;
 235
 236     while ( !data_file.eof() )
 237     {
 238       getline(data_file,file_data_line);
 239       if (file_data_line != "")
 240       {
 241         // need to get 4 values...almost same code 4 times...
 242         // get annot start index
 243         space_split_i = file_data_line.find(" ");
 244         annot_value = file_data_line.substr(0,space_split_i);
 245         an_annot.start = atoi (annot_value.c_str());
 246         file_data_line = file_data_line.substr(space_split_i+1);
 247         // get annot end index
 248         space_split_i = file_data_line.find(" ");
 249         annot_value = file_data_line.substr(0,space_split_i);
 250         an_annot.end = atoi (annot_value.c_str());
 251         file_data_line = file_data_line.substr(space_split_i+1);
 252
 253         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 254               //     << endl;
 255
 256         // get annot name
 257         space_split_i = file_data_line.find(" ");
 258         if (space_split_i == string::npos)  // no entries for name & type
 259         {
 260           cout << "seq, annots - no name or type\n";
 261           an_annot.name = "";
 262           an_annot.type = "";
 263         }
 264         else
 265         {
 266           annot_value = file_data_line.substr(0,space_split_i);
 267           an_annot.name = annot_value;
 268           file_data_line = file_data_line.substr(space_split_i+1);
 269           // get annot type
 270           space_split_i = file_data_line.find(" ");
 271           if (space_split_i == string::npos)  // no entry for type
 272             an_annot.type = "";
 273           else
 274           {
 275             annot_value = file_data_line.substr(0,space_split_i);
 276             an_annot.type = annot_value;
 277           }
 278         }
 279
 280
 281         // add annot to list if it falls within the range of sequence specified
 282         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 283         {
 284           an_annot.start -= start_index;
 285           an_annot.end -= start_index;
 286           annots.push_back(an_annot);
 287         }
 288         else
 289           cout << "FAILED!!!!!!\n";
 290       }
 291     }
 292
 293     data_file.close();
 294     /*
 295     // debugging check
 296     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 297     {
 298       cout << (*list_i).start << "," << (*list_i).end << "\t";
 299       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 300     }
 301     */
 302   }
 303 }
 304
 305 const std::string& Sequence::get_species() const
 306 {
 307   return species;
 308 }
 309
 310 bool Sequence::empty() const
 311 {
 312   return (size() == 0);
 313 }
 314
 315 const std::list<annot>& Sequence::annotations() const
 316 {
 317   return annots;
 318 }
 319
 320 string::size_type Sequence::length() const
 321 {
 322   return size();
 323 }
 324
 325 string::size_type Sequence::size() const
 326 {
 327   return sequence.size();
 328 }
 329
 330 Sequence::iterator Sequence::begin()
 331 {
 332   return sequence.begin();
 333 }
 334
 335 Sequence::const_iterator Sequence::begin() const
 336 {
 337   return sequence.begin();
 338 }
 339
 340 Sequence::iterator Sequence::end()
 341 {
 342   return sequence.end();
 343 }
 344
 345 Sequence::const_iterator Sequence::end() const
 346 {
 347   return sequence.end();
 348 }
 349
 350
 351 const string&
 352 Sequence::get_seq() const
 353 {
 354   return sequence;
 355 }
 356
 357
 358 string
 359 Sequence::subseq(int start, int end) const
 360 {
 361   return sequence.substr(start, end);
 362 }
 363
 364
 365 const char *
 366 Sequence::c_seq() const
 367 {
 368   return sequence.c_str();
 369 }
 370
 371 string
 372 Sequence::rev_comp() const
 373 {
 374   string rev_comp;
 375   char conversionTable[257];
 376   int seq_i, table_i, len;
 377
 378   len = sequence.length();
 379   rev_comp.reserve(len);
 380   // make a conversion table
 381   // init all parts of conversion table to '~' character
 382   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 383   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 384   // seven generations of their progeny) who decides to make it mean
 385   // something special!!!
 386   // PS - double the curse for any smartass non-biologist who tries it as well
 387   for(table_i=0; table_i < 256; table_i++)
 388   {
 389     conversionTable[table_i] = '~';
 390   }
 391   // add end of string character for printing out table for testing purposes
 392   conversionTable[256] = '\0';
 393
 394   // add in the characters for the bases we want to convert
 395   conversionTable[(int)'A'] = 'T';
 396   conversionTable[(int)'T'] = 'A';
 397   conversionTable[(int)'G'] = 'C';
 398   conversionTable[(int)'C'] = 'G';
 399   conversionTable[(int)'N'] = 'N';
 400
 401   // finally, the actual conversion loop
 402   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 403   {
 404     table_i = (int) sequence[seq_i];
 405     rev_comp += conversionTable[table_i];
 406   }
 407
 408   return rev_comp;
 409 }
 410
 411
 412 const string&
 413 Sequence::get_header() const
 414 {
 415   return header;
 416 }
 417 /*
 418 //FIXME: i don't think this code is callable
 419 string
 420 Sequence::sp_name() const
 421 {
 422   return species;
 423 }
 424 */
 425
 426 void
 427 Sequence::set_seq(const string& a_seq)
 428 {
 429   set_filtered_sequence(a_seq);
 430 }
 431
 432
 433 /*
 434 string
 435 Sequence::species()
 436 {
 437   return species;
 438 }
 439 */
 440
 441 void
 442 Sequence::clear()
 443 {
 444   sequence = "";
 445   header = "";
 446   species = "";
 447   annots.clear();
 448 }
 449
 450 void
 451 Sequence::save(fstream &save_file)
 452                //string save_file_path)
 453 {
 454   //fstream save_file;
 455   list<annot>::iterator annots_i;
 456
 457   // not sure why, or if i'm doing something wrong, but can't seem to pass
 458   // file pointers down to this method from the mussa control class
 459   // so each call to save a sequence appends to the file started by mussa_class
 460   //save_file.open(save_file_path.c_str(), ios::app);
 461
 462   save_file << "<Sequence>" << endl;
 463   save_file << sequence << endl;
 464   save_file << "</Sequence>" << endl;
 465
 466   save_file << "<Annotations>" << endl;
 467   save_file << species << endl;
 468   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 469   {
 470     save_file << annots_i->start << " " << annots_i->end << " " ;
 471     save_file << annots_i->name << " " << annots_i->type << endl;
 472   }
 473   save_file << "</Annotations>" << endl;
 474   //save_file.close();
 475 }
 476
 477 void
 478 Sequence::load_museq(string load_file_path, int seq_num)
 479 {
 480   fstream load_file;
 481   string file_data_line;
 482   int seq_counter;
 483   annot an_annot;
 484   string::size_type space_split_i;
 485   string annot_value;
 486
 487   annots.clear();
 488   load_file.open(load_file_path.c_str(), ios::in);
 489
 490   seq_counter = 0;
 491   // search for the seq_num-th sequence
 492   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 493   {
 494     getline(load_file,file_data_line);
 495     if (file_data_line == "<Sequence>")
 496       seq_counter++;
 497   }
 498   getline(load_file, file_data_line);
 499   sequence = file_data_line;
 500   getline(load_file, file_data_line);
 501   getline(load_file, file_data_line);
 502   if (file_data_line == "<Annotations>")
 503   {
 504     getline(load_file, file_data_line);
 505     species = file_data_line;
 506     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 507     {
 508       getline(load_file,file_data_line);
 509       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 510       {
 511         // need to get 4 values...almost same code 4 times...
 512         // get annot start index
 513         space_split_i = file_data_line.find(" ");
 514         annot_value = file_data_line.substr(0,space_split_i);
 515         an_annot.start = atoi (annot_value.c_str());
 516         file_data_line = file_data_line.substr(space_split_i+1);
 517         // get annot end index
 518         space_split_i = file_data_line.find(" ");
 519         annot_value = file_data_line.substr(0,space_split_i);
 520         an_annot.end = atoi (annot_value.c_str());
 521
 522         if (space_split_i == string::npos)  // no entry for type or name
 523         {
 524           cout << "seq, annots - no type or name\n";
 525           an_annot.type = "";
 526           an_annot.name = "";
 527         }
 528         else   // else get annot type
 529         {
 530           file_data_line = file_data_line.substr(space_split_i+1);
 531           space_split_i = file_data_line.find(" ");
 532           annot_value = file_data_line.substr(0,space_split_i);
 533           an_annot.type = annot_value;
 534           if (space_split_i == string::npos)  // no entry for name
 535           {
 536             cout << "seq, annots - no name\n";
 537             an_annot.name = "";
 538           }
 539           else          // get annot name
 540           {
 541             file_data_line = file_data_line.substr(space_split_i+1);
 542             space_split_i = file_data_line.find(" ");
 543             annot_value = file_data_line.substr(0,space_split_i);
 544             an_annot.type = annot_value;
 545           }
 546         }
 547         annots.push_back(an_annot);  // don't forget to actually add the annot
 548       }
 549       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 550       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 551     }
 552   }
 553   load_file.close();
 554 }
 555
 556
 557 string
 558 Sequence::rc_motif(string a_motif)
 559 {
 560   string rev_comp;
 561   char conversionTable[257];
 562   int seq_i, table_i, len;
 563
 564   len = a_motif.length();
 565   rev_comp.reserve(len);
 566
 567   for(table_i=0; table_i < 256; table_i++)
 568   {
 569     conversionTable[table_i] = '~';
 570   }
 571   // add end of string character for printing out table for testing purposes
 572   conversionTable[256] = '\0';
 573
 574   // add in the characters for the bases we want to convert (IUPAC)
 575   conversionTable[(int)'A'] = 'T';
 576   conversionTable[(int)'T'] = 'A';
 577   conversionTable[(int)'G'] = 'C';
 578   conversionTable[(int)'C'] = 'G';
 579   conversionTable[(int)'N'] = 'N';
 580   conversionTable[(int)'M'] = 'K';
 581   conversionTable[(int)'R'] = 'Y';
 582   conversionTable[(int)'W'] = 'W';
 583   conversionTable[(int)'S'] = 'S';
 584   conversionTable[(int)'Y'] = 'R';
 585   conversionTable[(int)'K'] = 'M';
 586   conversionTable[(int)'V'] = 'B';
 587   conversionTable[(int)'H'] = 'D';
 588   conversionTable[(int)'D'] = 'H';
 589   conversionTable[(int)'B'] = 'V';
 590
 591   // finally, the actual conversion loop
 592   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 593   {
 594     //cout << "** i = " << seq_i << " bp = " <<
 595     table_i = (int) a_motif[seq_i];
 596     rev_comp += conversionTable[table_i];
 597   }
 598
 599   //cout << "seq: " << a_motif << endl;
 600   //cout << "rc:  " << rev_comp << endl;
 601
 602   return rev_comp;
 603 }
 604
 605 string
 606 Sequence::motif_normalize(string a_motif)
 607 {
 608   string valid_motif;
 609   int seq_i, len;
 610
 611   len = a_motif.length();
 612   valid_motif.reserve(len);
 613
 614   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 615   // current nonIUPAC symbols are omitted, which is not reported atm
 616   for(seq_i = 0; seq_i < len; seq_i++)
 617   {
 618     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 619       valid_motif += 'A';
 620     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 621       valid_motif += 'T';
 622     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 623       valid_motif += 'G';
 624     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 625       valid_motif += 'C';
 626     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 627       valid_motif += 'N';
 628     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 629       valid_motif += 'M';
 630     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 631       valid_motif += 'R';
 632     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 633       valid_motif += 'W';
 634     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 635       valid_motif += 'S';
 636     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 637       valid_motif += 'Y';
 638     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 639       valid_motif += 'G';
 640     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 641       valid_motif += 'V';
 642     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 643       valid_motif += 'H';
 644     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 645       valid_motif += 'D';
 646     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 647       valid_motif += 'B';
 648     else {
 649       string msg = "Letter ";
 650       msg += a_motif[seq_i];
 651       msg += " is not a valid IUPAC symbol";
 652       throw motif_normalize_error(msg);
 653     }
 654   }
 655   //cout << "valid_motif is: " << valid_motif << endl;
 656   return valid_motif;
 657 }
 658
 659 void Sequence::add_motif(string a_motif)
 660 {
 661   vector<int> motif_starts = find_motif(a_motif);
 662
 663   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 664       motif_start_i != motif_starts.end();
 665       ++motif_start_i)
 666   {
 667     motif_list.push_back(motif(*motif_start_i, a_motif));
 668   }
 669 }
 670
 671 void Sequence::clear_motifs()
 672 {
 673   motif_list.clear();
 674 }
 675
 676 const list<motif>& Sequence::motifs() const
 677 {
 678   return motif_list;
 679 }
 680
 681 vector<int>
 682 Sequence::find_motif(string a_motif)
 683 {
 684   vector<int> motif_match_starts;
 685   string a_motif_rc;
 686
 687   motif_match_starts.clear();
 688
 689   //cout << "motif is: " << a_motif << endl;
 690   a_motif = motif_normalize(a_motif);
 691   //cout << "motif is: " << a_motif << endl;
 692
 693   if (a_motif != "")
 694   {
 695     //cout << "Sequence: none blank motif\n";
 696     motif_scan(a_motif, &motif_match_starts);
 697
 698     a_motif_rc = rc_motif(a_motif);
 699     // make sure not to do search again if it is a palindrome
 700     if (a_motif_rc != a_motif)
 701       motif_scan(a_motif_rc, &motif_match_starts);
 702   }
 703   return motif_match_starts;
 704 }
 705
 706 void
 707 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 708 {
 709   char * seq_c;
 710   string::size_type seq_i;
 711   int motif_i, motif_len;
 712
 713   // faster to loop thru the sequence as a old c string (ie char array)
 714   seq_c = (char*)sequence.c_str();
 715   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 716   motif_len = a_motif.length();
 717
 718   //cout << "motif_length: " << motif_len << endl;
 719   //cout << "RAAARRRRR\n";
 720
 721   motif_i = 0;
 722
 723   //cout << "motif: " << a_motif << endl;
 724
 725   //cout << "Sequence: motif, length= " << length << endl;
 726   seq_i = 0;
 727   while (seq_i < sequence.length())
 728   {
 729     //cout << seq_c[seq_i];
 730     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 731     // this is pretty much a straight translation of Nora's python code
 732     // to match iupac letter codes
 733     if (a_motif[motif_i] =='N')
 734       motif_i++;
 735     else if (a_motif[motif_i] == seq_c[seq_i])
 736       motif_i++;
 737     else if ((a_motif[motif_i] =='M') &&
 738              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 739       motif_i++;
 740     else if ((a_motif[motif_i] =='R') &&
 741              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 742       motif_i++;
 743     else if ((a_motif[motif_i] =='W') &&
 744              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 745       motif_i++;
 746     else if ((a_motif[motif_i] =='S') &&
 747              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 748       motif_i++;
 749     else if ((a_motif[motif_i] =='Y') &&
 750              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 751       motif_i++;
 752     else if ((a_motif[motif_i] =='K') &&
 753              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 754       motif_i++;
 755     else if ((a_motif[motif_i] =='V') &&
 756              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 757               (seq_c[seq_i]=='G')))
 758       motif_i++;
 759     else if ((a_motif[seq_i] =='H') &&
 760              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 761               (seq_c[seq_i]=='T')))
 762       motif_i++;
 763     else if ((a_motif[motif_i] =='D') &&
 764              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 765               (seq_c[seq_i]=='T')))
 766       motif_i++;
 767     else if ((a_motif[motif_i] =='B') &&
 768              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 769               (seq_c[seq_i]=='T')))
 770       motif_i++;
 771
 772     else
 773     {
 774       seq_i -= motif_i;
 775       motif_i = 0;
 776     }
 777
 778     // end Nora stuff, now we see if a match is found this pass
 779     if (motif_i == motif_len)
 780     {
 781       //cout << "!!";
 782       annot new_motif;
 783       motif_match_starts->push_back(seq_i - motif_len + 1);
 784       motif_i = 0;
 785     }
 786
 787     seq_i++;
 788   }
 789   //cout << endl;
 790 }
 791