alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "alg/sequence.hpp"
  26 #include "mussa_exceptions.hpp"
  27
  28 #include <string>
  29 #include <iostream>
  30
  31 using namespace std;
  32
  33 annot::annot()
  34  : start(0),
  35    end(0),
  36    type(""),
  37    name("")
  38 {
  39 }
  40
  41 annot::annot(int start, int end, std::string type, std::string name)
  42  : start(start),
  43    end(end),
  44    type(type),
  45    name(name)
  46 {
  47 }
  48
  49 motif::motif(int start, std::string motif)
  50  : annot(start, start+motif.size(), "motif", motif),
  51    sequence(motif)
  52 {
  53 }
  54
  55 Sequence::Sequence()
  56   : sequence(""),
  57     header(""),
  58     species("")
  59 {
  60   annots.clear();
  61   motif_list.clear();
  62 }
  63
  64 Sequence::Sequence(string seq)
  65 {
  66   set_filtered_sequence(seq);
  67 }
  68
  69 Sequence &Sequence::operator=(const Sequence& s)
  70 {
  71   if (this != &s) {
  72     sequence = s.sequence;
  73     header = s.header;
  74     species = s.species;
  75     annots = s.annots;
  76   }
  77   return *this;
  78 }
  79
  80 Sequence &Sequence::operator=(const std::string& s)
  81 {
  82   set_filtered_sequence(s);
  83   return *this;
  84 }
  85
  86 ostream& operator<<(ostream& out, const Sequence& seq)
  87 {
  88   out << "Sequence(" << seq.get_seq() << ")";
  89   return out;
  90 }
  91
  92 //! load a fasta file into a sequence
  93 /*!
  94  * \param file_path the location of the fasta file in the filesystem
  95  * \param seq_num which sequence in the file to load
  96  * \param start_index starting position in the fasta sequence, 0 for beginning
  97  * \param end_index ending position in the fasta sequence, 0 for end
  98  * \return error message, empty string if no error. (gag!)
  99  */
 100 void
 101 Sequence::load_fasta(string file_path, int seq_num,
 102                      int start_index, int end_index)
 103 {
 104   fstream data_file;
 105   string file_data_line;
 106   int header_counter = 0;
 107   bool read_seq = true;
 108   string rev_comp;
 109   string sequence_raw;
 110   string seq_tmp;             // holds sequence during basic filtering
 111
 112   data_file.open(file_path.c_str(), ios::in);
 113
 114   if (!data_file)
 115   {
 116     throw mussa_load_error("Sequence File: " + file_path + " not found");
 117   }
 118   // if file opened okay, read it
 119   else
 120   {
 121     // search for the header of the fasta sequence we want
 122     while ( (!data_file.eof()) && (header_counter < seq_num) )
 123     {
 124       getline(data_file,file_data_line);
 125       if (file_data_line.substr(0,1) == ">")
 126         header_counter++;
 127     }
 128
 129     header = file_data_line.substr(1);
 130
 131     sequence_raw = "";
 132
 133     while ( !data_file.eof() && read_seq )
 134     {
 135       getline(data_file,file_data_line);
 136       if (file_data_line.substr(0,1) == ">")
 137         read_seq = false;
 138       else sequence_raw += file_data_line;
 139     }
 140
 141     data_file.close();
 142
 143     // Lastly, if subselection of the sequence was specified we keep cut out
 144     // and only keep that part
 145     // end_index = 0 means no end was specified, so cut to the end
 146     if (end_index == 0)
 147       end_index = sequence_raw.size();
 148
 149     // sequence filtering for upcasing agctn and convert non AGCTN to N
 150     set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 151   }
 152 }
 153
 154 void Sequence::set_filtered_sequence(const string &old_seq,
 155                                      string::size_type start,
 156                                      string::size_type count)
 157 {
 158   char conversionTable[257];
 159
 160   if ( count == 0)
 161     count = old_seq.size() - start;
 162   sequence.clear();
 163   sequence.reserve(count);
 164
 165   // Make a conversion table
 166
 167   // everything we don't specify below will become 'N'
 168   for(int table_i=0; table_i < 256; table_i++)
 169   {
 170     conversionTable[table_i] = 'N';
 171   }
 172   // add end of string character for printing out table for testing purposes
 173   conversionTable[256] = '\0';
 174
 175   // we want these to map to themselves - ie not to change
 176   conversionTable[(int)'A'] = 'A';
 177   conversionTable[(int)'T'] = 'T';
 178   conversionTable[(int)'G'] = 'G';
 179   conversionTable[(int)'C'] = 'C';
 180   // this is to upcase
 181   conversionTable[(int)'a'] = 'A';
 182   conversionTable[(int)'t'] = 'T';
 183   conversionTable[(int)'g'] = 'G';
 184   conversionTable[(int)'c'] = 'C';
 185
 186   // finally, the actual conversion loop
 187   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 188   {
 189     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 190   }
 191 }
 192
 193   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 194   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 195
 196
 197 void
 198 Sequence::load_annot(string file_path, int start_index, int end_index)
 199 {
 200   fstream data_file;
 201   string file_data_line;
 202   annot an_annot;
 203   string::size_type space_split_i;
 204   string annot_value;
 205   list<annot>::iterator list_i;
 206   string err_msg;
 207
 208
 209   annots.clear();
 210   data_file.open(file_path.c_str(), ios::in);
 211
 212   if (!data_file)
 213   {
 214     throw mussa_load_error("Sequence File: " + file_path + " not found");
 215   }
 216   // if file opened okay, read it
 217   else
 218   {
 219     getline(data_file,file_data_line);
 220     species = file_data_line;
 221
 222     // end_index = 0 means no end was specified, so cut to the end
 223     if (end_index == 0)
 224       end_index = sequence.length();
 225
 226     //cout << "START: " << start_index << " END: " << end_index << endl;
 227
 228     while ( !data_file.eof() )
 229     {
 230       getline(data_file,file_data_line);
 231       if (file_data_line != "")
 232       {
 233         // need to get 4 values...almost same code 4 times...
 234         // get annot start index
 235         space_split_i = file_data_line.find(" ");
 236         annot_value = file_data_line.substr(0,space_split_i);
 237         an_annot.start = atoi (annot_value.c_str());
 238         file_data_line = file_data_line.substr(space_split_i+1);
 239         // get annot end index
 240         space_split_i = file_data_line.find(" ");
 241         annot_value = file_data_line.substr(0,space_split_i);
 242         an_annot.end = atoi (annot_value.c_str());
 243         file_data_line = file_data_line.substr(space_split_i+1);
 244
 245         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 246               //     << endl;
 247
 248         // get annot name
 249         space_split_i = file_data_line.find(" ");
 250         if (space_split_i == string::npos)  // no entries for name & type
 251         {
 252           cout << "seq, annots - no name or type\n";
 253           an_annot.name = "";
 254           an_annot.type = "";
 255         }
 256         else
 257         {
 258           annot_value = file_data_line.substr(0,space_split_i);
 259           an_annot.name = annot_value;
 260           file_data_line = file_data_line.substr(space_split_i+1);
 261           // get annot type
 262           space_split_i = file_data_line.find(" ");
 263           if (space_split_i == string::npos)  // no entry for type
 264             an_annot.type = "";
 265           else
 266           {
 267             annot_value = file_data_line.substr(0,space_split_i);
 268             an_annot.type = annot_value;
 269           }
 270         }
 271
 272
 273         // add annot to list if it falls within the range of sequence specified
 274         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 275         {
 276           an_annot.start -= start_index;
 277           an_annot.end -= start_index;
 278           annots.push_back(an_annot);
 279         }
 280         else
 281           cout << "FAILED!!!!!!\n";
 282       }
 283     }
 284
 285     data_file.close();
 286     /*
 287     // debugging check
 288     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 289     {
 290       cout << (*list_i).start << "," << (*list_i).end << "\t";
 291       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 292     }
 293     */
 294   }
 295 }
 296
 297 bool Sequence::empty() const
 298 {
 299   return (size() == 0);
 300 }
 301
 302 const std::list<annot>& Sequence::annotations() const
 303 {
 304   return annots;
 305 }
 306
 307 string::size_type Sequence::length() const
 308 {
 309   return size();
 310 }
 311
 312 string::size_type Sequence::size() const
 313 {
 314   return sequence.size();
 315 }
 316
 317 Sequence::iterator Sequence::begin()
 318 {
 319   return sequence.begin();
 320 }
 321
 322 Sequence::const_iterator Sequence::begin() const
 323 {
 324   return sequence.begin();
 325 }
 326
 327 Sequence::iterator Sequence::end()
 328 {
 329   return sequence.end();
 330 }
 331
 332 Sequence::const_iterator Sequence::end() const
 333 {
 334   return sequence.end();
 335 }
 336
 337
 338 const string&
 339 Sequence::get_seq() const
 340 {
 341   return sequence;
 342 }
 343
 344
 345 string
 346 Sequence::subseq(int start, int end) const
 347 {
 348   return sequence.substr(start, end);
 349 }
 350
 351
 352 const char *
 353 Sequence::c_seq() const
 354 {
 355   return sequence.c_str();
 356 }
 357
 358 string
 359 Sequence::rev_comp() const
 360 {
 361   string rev_comp;
 362   char conversionTable[257];
 363   int seq_i, table_i, len;
 364
 365   len = sequence.length();
 366   rev_comp.reserve(len);
 367   // make a conversion table
 368   // init all parts of conversion table to '~' character
 369   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 370   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 371   // seven generations of their progeny) who decides to make it mean
 372   // something special!!!
 373   // PS - double the curse for any smartass non-biologist who tries it as well
 374   for(table_i=0; table_i < 256; table_i++)
 375   {
 376     conversionTable[table_i] = '~';
 377   }
 378   // add end of string character for printing out table for testing purposes
 379   conversionTable[256] = '\0';
 380
 381   // add in the characters for the bases we want to convert
 382   conversionTable[(int)'A'] = 'T';
 383   conversionTable[(int)'T'] = 'A';
 384   conversionTable[(int)'G'] = 'C';
 385   conversionTable[(int)'C'] = 'G';
 386   conversionTable[(int)'N'] = 'N';
 387
 388   // finally, the actual conversion loop
 389   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 390   {
 391     table_i = (int) sequence[seq_i];
 392     rev_comp += conversionTable[table_i];
 393   }
 394
 395   return rev_comp;
 396 }
 397
 398
 399 const string&
 400 Sequence::get_header() const
 401 {
 402   return header;
 403 }
 404 /*
 405 //FIXME: i don't think this code is callable
 406 string
 407 Sequence::sp_name() const
 408 {
 409   return species;
 410 }
 411 */
 412
 413 void
 414 Sequence::set_seq(const string& a_seq)
 415 {
 416   set_filtered_sequence(a_seq);
 417 }
 418
 419
 420 /*
 421 string
 422 Sequence::species()
 423 {
 424   return species;
 425 }
 426 */
 427
 428 void
 429 Sequence::clear()
 430 {
 431   sequence = "";
 432   header = "";
 433   species = "";
 434   annots.clear();
 435 }
 436
 437 void
 438 Sequence::save(fstream &save_file)
 439                //string save_file_path)
 440 {
 441   //fstream save_file;
 442   list<annot>::iterator annots_i;
 443
 444   // not sure why, or if i'm doing something wrong, but can't seem to pass
 445   // file pointers down to this method from the mussa control class
 446   // so each call to save a sequence appends to the file started by mussa_class
 447   //save_file.open(save_file_path.c_str(), ios::app);
 448
 449   save_file << "<Sequence>" << endl;
 450   save_file << sequence << endl;
 451   save_file << "</Sequence>" << endl;
 452
 453   save_file << "<Annotations>" << endl;
 454   save_file << species << endl;
 455   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 456   {
 457     save_file << annots_i->start << " " << annots_i->end << " " ;
 458     save_file << annots_i->name << " " << annots_i->type << endl;
 459   }
 460   save_file << "</Annotations>" << endl;
 461   //save_file.close();
 462 }
 463
 464 void
 465 Sequence::load_museq(string load_file_path, int seq_num)
 466 {
 467   fstream load_file;
 468   string file_data_line;
 469   int seq_counter;
 470   annot an_annot;
 471   string::size_type space_split_i;
 472   string annot_value;
 473
 474   annots.clear();
 475   load_file.open(load_file_path.c_str(), ios::in);
 476
 477   seq_counter = 0;
 478   // search for the seq_num-th sequence
 479   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 480   {
 481     getline(load_file,file_data_line);
 482     if (file_data_line == "<Sequence>")
 483       seq_counter++;
 484   }
 485   getline(load_file, file_data_line);
 486   sequence = file_data_line;
 487   getline(load_file, file_data_line);
 488   getline(load_file, file_data_line);
 489   if (file_data_line == "<Annotations>")
 490   {
 491     getline(load_file, file_data_line);
 492     species = file_data_line;
 493     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 494     {
 495       getline(load_file,file_data_line);
 496       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 497       {
 498         // need to get 4 values...almost same code 4 times...
 499         // get annot start index
 500         space_split_i = file_data_line.find(" ");
 501         annot_value = file_data_line.substr(0,space_split_i);
 502         an_annot.start = atoi (annot_value.c_str());
 503         file_data_line = file_data_line.substr(space_split_i+1);
 504         // get annot end index
 505         space_split_i = file_data_line.find(" ");
 506         annot_value = file_data_line.substr(0,space_split_i);
 507         an_annot.end = atoi (annot_value.c_str());
 508
 509         if (space_split_i == string::npos)  // no entry for type or name
 510         {
 511           cout << "seq, annots - no type or name\n";
 512           an_annot.type = "";
 513           an_annot.name = "";
 514         }
 515         else   // else get annot type
 516         {
 517           file_data_line = file_data_line.substr(space_split_i+1);
 518           space_split_i = file_data_line.find(" ");
 519           annot_value = file_data_line.substr(0,space_split_i);
 520           an_annot.type = annot_value;
 521           if (space_split_i == string::npos)  // no entry for name
 522           {
 523             cout << "seq, annots - no name\n";
 524             an_annot.name = "";
 525           }
 526           else          // get annot name
 527           {
 528             file_data_line = file_data_line.substr(space_split_i+1);
 529             space_split_i = file_data_line.find(" ");
 530             annot_value = file_data_line.substr(0,space_split_i);
 531             an_annot.type = annot_value;
 532           }
 533         }
 534         annots.push_back(an_annot);  // don't forget to actually add the annot
 535       }
 536       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 537       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 538     }
 539   }
 540   load_file.close();
 541 }
 542
 543
 544 string
 545 Sequence::rc_motif(string a_motif)
 546 {
 547   string rev_comp;
 548   char conversionTable[257];
 549   int seq_i, table_i, len;
 550
 551   len = a_motif.length();
 552   rev_comp.reserve(len);
 553
 554   for(table_i=0; table_i < 256; table_i++)
 555   {
 556     conversionTable[table_i] = '~';
 557   }
 558   // add end of string character for printing out table for testing purposes
 559   conversionTable[256] = '\0';
 560
 561   // add in the characters for the bases we want to convert (IUPAC)
 562   conversionTable[(int)'A'] = 'T';
 563   conversionTable[(int)'T'] = 'A';
 564   conversionTable[(int)'G'] = 'C';
 565   conversionTable[(int)'C'] = 'G';
 566   conversionTable[(int)'N'] = 'N';
 567   conversionTable[(int)'M'] = 'K';
 568   conversionTable[(int)'R'] = 'Y';
 569   conversionTable[(int)'W'] = 'W';
 570   conversionTable[(int)'S'] = 'S';
 571   conversionTable[(int)'Y'] = 'R';
 572   conversionTable[(int)'K'] = 'M';
 573   conversionTable[(int)'V'] = 'B';
 574   conversionTable[(int)'H'] = 'D';
 575   conversionTable[(int)'D'] = 'H';
 576   conversionTable[(int)'B'] = 'V';
 577
 578   // finally, the actual conversion loop
 579   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 580   {
 581     //cout << "** i = " << seq_i << " bp = " <<
 582     table_i = (int) a_motif[seq_i];
 583     rev_comp += conversionTable[table_i];
 584   }
 585
 586   //cout << "seq: " << a_motif << endl;
 587   //cout << "rc:  " << rev_comp << endl;
 588
 589   return rev_comp;
 590 }
 591
 592 string
 593 Sequence::motif_normalize(string a_motif)
 594 {
 595   string valid_motif;
 596   int seq_i, len;
 597
 598   len = a_motif.length();
 599   valid_motif.reserve(len);
 600
 601   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 602   // current nonIUPAC symbols are omitted, which is not reported atm
 603   for(seq_i = 0; seq_i < len; seq_i++)
 604   {
 605     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 606       valid_motif += 'A';
 607     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 608       valid_motif += 'T';
 609     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 610       valid_motif += 'G';
 611     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 612       valid_motif += 'C';
 613     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 614       valid_motif += 'N';
 615     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 616       valid_motif += 'M';
 617     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 618       valid_motif += 'R';
 619     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 620       valid_motif += 'W';
 621     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 622       valid_motif += 'S';
 623     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 624       valid_motif += 'Y';
 625     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 626       valid_motif += 'G';
 627     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 628       valid_motif += 'V';
 629     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 630       valid_motif += 'H';
 631     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 632       valid_motif += 'D';
 633     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 634       valid_motif += 'B';
 635     else {
 636       string msg = "Letter ";
 637       msg += a_motif[seq_i];
 638       msg += " is not a valid IUPAC symbol";
 639       throw motif_normalize_error(msg);
 640     }
 641   }
 642   //cout << "valid_motif is: " << valid_motif << endl;
 643   return valid_motif;
 644 }
 645
 646 void Sequence::add_motif(string a_motif)
 647 {
 648   vector<int> motif_starts = find_motif(a_motif);
 649
 650   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 651       motif_start_i != motif_starts.end();
 652       ++motif_start_i)
 653   {
 654     motif_list.push_back(motif(*motif_start_i, a_motif));
 655   }
 656 }
 657
 658 void Sequence::clear_motifs()
 659 {
 660   motif_list.clear();
 661 }
 662
 663 const list<motif>& Sequence::motifs() const
 664 {
 665   return motif_list;
 666 }
 667
 668 vector<int>
 669 Sequence::find_motif(string a_motif)
 670 {
 671   vector<int> motif_match_starts;
 672   string a_motif_rc;
 673
 674   motif_match_starts.clear();
 675
 676   //cout << "motif is: " << a_motif << endl;
 677   a_motif = motif_normalize(a_motif);
 678   //cout << "motif is: " << a_motif << endl;
 679
 680   if (a_motif != "")
 681   {
 682     //cout << "Sequence: none blank motif\n";
 683     motif_scan(a_motif, &motif_match_starts);
 684
 685     a_motif_rc = rc_motif(a_motif);
 686     // make sure not to do search again if it is a palindrome
 687     if (a_motif_rc != a_motif)
 688       motif_scan(a_motif_rc, &motif_match_starts);
 689   }
 690   return motif_match_starts;
 691 }
 692
 693 void
 694 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 695 {
 696   char * seq_c;
 697   string::size_type seq_i;
 698   int motif_i, motif_len;
 699
 700   // faster to loop thru the sequence as a old c string (ie char array)
 701   seq_c = (char*)sequence.c_str();
 702   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 703   motif_len = a_motif.length();
 704
 705   //cout << "motif_length: " << motif_len << endl;
 706   //cout << "RAAARRRRR\n";
 707
 708   motif_i = 0;
 709
 710   //cout << "motif: " << a_motif << endl;
 711
 712   //cout << "Sequence: motif, length= " << length << endl;
 713   seq_i = 0;
 714   while (seq_i < sequence.length())
 715   {
 716     //cout << seq_c[seq_i];
 717     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 718     // this is pretty much a straight translation of Nora's python code
 719     // to match iupac letter codes
 720     if (a_motif[motif_i] =='N')
 721       motif_i++;
 722     else if (a_motif[motif_i] == seq_c[seq_i])
 723       motif_i++;
 724     else if ((a_motif[motif_i] =='M') &&
 725              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 726       motif_i++;
 727     else if ((a_motif[motif_i] =='R') &&
 728              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 729       motif_i++;
 730     else if ((a_motif[motif_i] =='W') &&
 731              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 732       motif_i++;
 733     else if ((a_motif[motif_i] =='S') &&
 734              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 735       motif_i++;
 736     else if ((a_motif[motif_i] =='Y') &&
 737              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 738       motif_i++;
 739     else if ((a_motif[motif_i] =='K') &&
 740              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 741       motif_i++;
 742     else if ((a_motif[motif_i] =='V') &&
 743              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 744               (seq_c[seq_i]=='G')))
 745       motif_i++;
 746     else if ((a_motif[seq_i] =='H') &&
 747              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 748               (seq_c[seq_i]=='T')))
 749       motif_i++;
 750     else if ((a_motif[motif_i] =='D') &&
 751              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 752               (seq_c[seq_i]=='T')))
 753       motif_i++;
 754     else if ((a_motif[motif_i] =='B') &&
 755              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 756               (seq_c[seq_i]=='T')))
 757       motif_i++;
 758
 759     else
 760     {
 761       seq_i -= motif_i;
 762       motif_i = 0;
 763     }
 764
 765     // end Nora stuff, now we see if a match is found this pass
 766     if (motif_i == motif_len)
 767     {
 768       //cout << "!!";
 769       annot new_motif;
 770       motif_match_starts->push_back(seq_i - motif_len + 1);
 771       motif_i = 0;
 772     }
 773
 774     seq_i++;
 775   }
 776   //cout << endl;
 777 }
 778