alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "alg/sequence.hpp"
  26 #include "mussa_exceptions.hpp"
  27
  28 #include <string>
  29 #include <iostream>
  30
  31 using namespace std;
  32
  33 annot::annot()
  34  : start(0),
  35    end(0),
  36    type(""),
  37    name("")
  38 {
  39 }
  40
  41 annot::annot(int start, int end, std::string type, std::string name)
  42  : start(start),
  43    end(end),
  44    type(type),
  45    name(name)
  46 {
  47 }
  48
  49 motif::motif(int start, std::string motif)
  50  : annot(start, start+motif.size(), "motif", motif),
  51    sequence(motif)
  52 {
  53 }
  54
  55 Sequence::Sequence()
  56   : sequence(""),
  57     header(""),
  58     species("")
  59 {
  60   annots.clear();
  61   motif_list.clear();
  62 }
  63
  64 Sequence::Sequence(string seq)
  65 {
  66   set_filtered_sequence(seq);
  67 }
  68
  69 Sequence &Sequence::operator=(const Sequence& s)
  70 {
  71   if (this != &s) {
  72     sequence = s.sequence;
  73     header = s.header;
  74     species = s.species;
  75     annots = s.annots;
  76   }
  77   return *this;
  78 }
  79
  80 Sequence &Sequence::operator=(const std::string& s)
  81 {
  82   set_filtered_sequence(s);
  83   return *this;
  84 }
  85
  86 char Sequence::operator[](int index) const
  87 {
  88   return sequence[index];
  89 }
  90
  91 ostream& operator<<(ostream& out, const Sequence& seq)
  92 {
  93   out << "Sequence(" << seq.get_seq() << ")";
  94   return out;
  95 }
  96
  97 //! load a fasta file into a sequence
  98 /*!
  99  * \param file_path the location of the fasta file in the filesystem
 100  * \param seq_num which sequence in the file to load
 101  * \param start_index starting position in the fasta sequence, 0 for beginning
 102  * \param end_index ending position in the fasta sequence, 0 for end
 103  * \return error message, empty string if no error. (gag!)
 104  */
 105 void
 106 Sequence::load_fasta(string file_path, int seq_num,
 107                      int start_index, int end_index)
 108 {
 109   fstream data_file;
 110   string file_data_line;
 111   int header_counter = 0;
 112   bool read_seq = true;
 113   string rev_comp;
 114   string sequence_raw;
 115   string seq_tmp;             // holds sequence during basic filtering
 116
 117   data_file.open(file_path.c_str(), ios::in);
 118
 119   if (!data_file)
 120   {
 121     throw mussa_load_error("Sequence File: " + file_path + " not found");
 122   }
 123   // if file opened okay, read it
 124   else
 125   {
 126     // search for the header of the fasta sequence we want
 127     while ( (!data_file.eof()) && (header_counter < seq_num) )
 128     {
 129       getline(data_file,file_data_line);
 130       if (file_data_line.substr(0,1) == ">")
 131         header_counter++;
 132     }
 133
 134     header = file_data_line.substr(1);
 135
 136     sequence_raw = "";
 137
 138     while ( !data_file.eof() && read_seq )
 139     {
 140       getline(data_file,file_data_line);
 141       if (file_data_line.substr(0,1) == ">")
 142         read_seq = false;
 143       else sequence_raw += file_data_line;
 144     }
 145
 146     data_file.close();
 147
 148     // Lastly, if subselection of the sequence was specified we keep cut out
 149     // and only keep that part
 150     // end_index = 0 means no end was specified, so cut to the end
 151     if (end_index == 0)
 152       end_index = sequence_raw.size();
 153
 154     // sequence filtering for upcasing agctn and convert non AGCTN to N
 155     set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 156   }
 157 }
 158
 159 void Sequence::set_filtered_sequence(const string &old_seq,
 160                                      string::size_type start,
 161                                      string::size_type count)
 162 {
 163   char conversionTable[257];
 164
 165   if ( count == 0)
 166     count = old_seq.size() - start;
 167   sequence.clear();
 168   sequence.reserve(count);
 169
 170   // Make a conversion table
 171
 172   // everything we don't specify below will become 'N'
 173   for(int table_i=0; table_i < 256; table_i++)
 174   {
 175     conversionTable[table_i] = 'N';
 176   }
 177   // add end of string character for printing out table for testing purposes
 178   conversionTable[256] = '\0';
 179
 180   // we want these to map to themselves - ie not to change
 181   conversionTable[(int)'A'] = 'A';
 182   conversionTable[(int)'T'] = 'T';
 183   conversionTable[(int)'G'] = 'G';
 184   conversionTable[(int)'C'] = 'C';
 185   // this is to upcase
 186   conversionTable[(int)'a'] = 'A';
 187   conversionTable[(int)'t'] = 'T';
 188   conversionTable[(int)'g'] = 'G';
 189   conversionTable[(int)'c'] = 'C';
 190
 191   // finally, the actual conversion loop
 192   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 193   {
 194     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 195   }
 196 }
 197
 198   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 199   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 200
 201
 202 void
 203 Sequence::load_annot(string file_path, int start_index, int end_index)
 204 {
 205   fstream data_file;
 206   string file_data_line;
 207   annot an_annot;
 208   string::size_type space_split_i;
 209   string annot_value;
 210   list<annot>::iterator list_i;
 211   string err_msg;
 212
 213
 214   annots.clear();
 215   data_file.open(file_path.c_str(), ios::in);
 216
 217   if (!data_file)
 218   {
 219     throw mussa_load_error("Sequence File: " + file_path + " not found");
 220   }
 221   // if file opened okay, read it
 222   else
 223   {
 224     getline(data_file,file_data_line);
 225     species = file_data_line;
 226
 227     // end_index = 0 means no end was specified, so cut to the end
 228     if (end_index == 0)
 229       end_index = sequence.length();
 230
 231     //cout << "START: " << start_index << " END: " << end_index << endl;
 232
 233     while ( !data_file.eof() )
 234     {
 235       getline(data_file,file_data_line);
 236       if (file_data_line != "")
 237       {
 238         // need to get 4 values...almost same code 4 times...
 239         // get annot start index
 240         space_split_i = file_data_line.find(" ");
 241         annot_value = file_data_line.substr(0,space_split_i);
 242         an_annot.start = atoi (annot_value.c_str());
 243         file_data_line = file_data_line.substr(space_split_i+1);
 244         // get annot end index
 245         space_split_i = file_data_line.find(" ");
 246         annot_value = file_data_line.substr(0,space_split_i);
 247         an_annot.end = atoi (annot_value.c_str());
 248         file_data_line = file_data_line.substr(space_split_i+1);
 249
 250         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 251               //     << endl;
 252
 253         // get annot name
 254         space_split_i = file_data_line.find(" ");
 255         if (space_split_i == string::npos)  // no entries for name & type
 256         {
 257           cout << "seq, annots - no name or type\n";
 258           an_annot.name = "";
 259           an_annot.type = "";
 260         }
 261         else
 262         {
 263           annot_value = file_data_line.substr(0,space_split_i);
 264           an_annot.name = annot_value;
 265           file_data_line = file_data_line.substr(space_split_i+1);
 266           // get annot type
 267           space_split_i = file_data_line.find(" ");
 268           if (space_split_i == string::npos)  // no entry for type
 269             an_annot.type = "";
 270           else
 271           {
 272             annot_value = file_data_line.substr(0,space_split_i);
 273             an_annot.type = annot_value;
 274           }
 275         }
 276
 277
 278         // add annot to list if it falls within the range of sequence specified
 279         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 280         {
 281           an_annot.start -= start_index;
 282           an_annot.end -= start_index;
 283           annots.push_back(an_annot);
 284         }
 285         else
 286           cout << "FAILED!!!!!!\n";
 287       }
 288     }
 289
 290     data_file.close();
 291     /*
 292     // debugging check
 293     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 294     {
 295       cout << (*list_i).start << "," << (*list_i).end << "\t";
 296       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 297     }
 298     */
 299   }
 300 }
 301
 302 bool Sequence::empty() const
 303 {
 304   return (size() == 0);
 305 }
 306
 307 const std::list<annot>& Sequence::annotations() const
 308 {
 309   return annots;
 310 }
 311
 312 string::size_type Sequence::length() const
 313 {
 314   return size();
 315 }
 316
 317 string::size_type Sequence::size() const
 318 {
 319   return sequence.size();
 320 }
 321
 322 Sequence::iterator Sequence::begin()
 323 {
 324   return sequence.begin();
 325 }
 326
 327 Sequence::const_iterator Sequence::begin() const
 328 {
 329   return sequence.begin();
 330 }
 331
 332 Sequence::iterator Sequence::end()
 333 {
 334   return sequence.end();
 335 }
 336
 337 Sequence::const_iterator Sequence::end() const
 338 {
 339   return sequence.end();
 340 }
 341
 342
 343 const string&
 344 Sequence::get_seq() const
 345 {
 346   return sequence;
 347 }
 348
 349
 350 string
 351 Sequence::subseq(int start, int end) const
 352 {
 353   return sequence.substr(start, end);
 354 }
 355
 356
 357 const char *
 358 Sequence::c_seq() const
 359 {
 360   return sequence.c_str();
 361 }
 362
 363 string
 364 Sequence::rev_comp() const
 365 {
 366   string rev_comp;
 367   char conversionTable[257];
 368   int seq_i, table_i, len;
 369
 370   len = sequence.length();
 371   rev_comp.reserve(len);
 372   // make a conversion table
 373   // init all parts of conversion table to '~' character
 374   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 375   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 376   // seven generations of their progeny) who decides to make it mean
 377   // something special!!!
 378   // PS - double the curse for any smartass non-biologist who tries it as well
 379   for(table_i=0; table_i < 256; table_i++)
 380   {
 381     conversionTable[table_i] = '~';
 382   }
 383   // add end of string character for printing out table for testing purposes
 384   conversionTable[256] = '\0';
 385
 386   // add in the characters for the bases we want to convert
 387   conversionTable[(int)'A'] = 'T';
 388   conversionTable[(int)'T'] = 'A';
 389   conversionTable[(int)'G'] = 'C';
 390   conversionTable[(int)'C'] = 'G';
 391   conversionTable[(int)'N'] = 'N';
 392
 393   // finally, the actual conversion loop
 394   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 395   {
 396     table_i = (int) sequence[seq_i];
 397     rev_comp += conversionTable[table_i];
 398   }
 399
 400   return rev_comp;
 401 }
 402
 403
 404 const string&
 405 Sequence::get_header() const
 406 {
 407   return header;
 408 }
 409 /*
 410 //FIXME: i don't think this code is callable
 411 string
 412 Sequence::sp_name() const
 413 {
 414   return species;
 415 }
 416 */
 417
 418 void
 419 Sequence::set_seq(const string& a_seq)
 420 {
 421   set_filtered_sequence(a_seq);
 422 }
 423
 424
 425 /*
 426 string
 427 Sequence::species()
 428 {
 429   return species;
 430 }
 431 */
 432
 433 void
 434 Sequence::clear()
 435 {
 436   sequence = "";
 437   header = "";
 438   species = "";
 439   annots.clear();
 440 }
 441
 442 void
 443 Sequence::save(fstream &save_file)
 444                //string save_file_path)
 445 {
 446   //fstream save_file;
 447   list<annot>::iterator annots_i;
 448
 449   // not sure why, or if i'm doing something wrong, but can't seem to pass
 450   // file pointers down to this method from the mussa control class
 451   // so each call to save a sequence appends to the file started by mussa_class
 452   //save_file.open(save_file_path.c_str(), ios::app);
 453
 454   save_file << "<Sequence>" << endl;
 455   save_file << sequence << endl;
 456   save_file << "</Sequence>" << endl;
 457
 458   save_file << "<Annotations>" << endl;
 459   save_file << species << endl;
 460   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 461   {
 462     save_file << annots_i->start << " " << annots_i->end << " " ;
 463     save_file << annots_i->name << " " << annots_i->type << endl;
 464   }
 465   save_file << "</Annotations>" << endl;
 466   //save_file.close();
 467 }
 468
 469 void
 470 Sequence::load_museq(string load_file_path, int seq_num)
 471 {
 472   fstream load_file;
 473   string file_data_line;
 474   int seq_counter;
 475   annot an_annot;
 476   string::size_type space_split_i;
 477   string annot_value;
 478
 479   annots.clear();
 480   load_file.open(load_file_path.c_str(), ios::in);
 481
 482   seq_counter = 0;
 483   // search for the seq_num-th sequence
 484   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 485   {
 486     getline(load_file,file_data_line);
 487     if (file_data_line == "<Sequence>")
 488       seq_counter++;
 489   }
 490   getline(load_file, file_data_line);
 491   sequence = file_data_line;
 492   getline(load_file, file_data_line);
 493   getline(load_file, file_data_line);
 494   if (file_data_line == "<Annotations>")
 495   {
 496     getline(load_file, file_data_line);
 497     species = file_data_line;
 498     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 499     {
 500       getline(load_file,file_data_line);
 501       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 502       {
 503         // need to get 4 values...almost same code 4 times...
 504         // get annot start index
 505         space_split_i = file_data_line.find(" ");
 506         annot_value = file_data_line.substr(0,space_split_i);
 507         an_annot.start = atoi (annot_value.c_str());
 508         file_data_line = file_data_line.substr(space_split_i+1);
 509         // get annot end index
 510         space_split_i = file_data_line.find(" ");
 511         annot_value = file_data_line.substr(0,space_split_i);
 512         an_annot.end = atoi (annot_value.c_str());
 513
 514         if (space_split_i == string::npos)  // no entry for type or name
 515         {
 516           cout << "seq, annots - no type or name\n";
 517           an_annot.type = "";
 518           an_annot.name = "";
 519         }
 520         else   // else get annot type
 521         {
 522           file_data_line = file_data_line.substr(space_split_i+1);
 523           space_split_i = file_data_line.find(" ");
 524           annot_value = file_data_line.substr(0,space_split_i);
 525           an_annot.type = annot_value;
 526           if (space_split_i == string::npos)  // no entry for name
 527           {
 528             cout << "seq, annots - no name\n";
 529             an_annot.name = "";
 530           }
 531           else          // get annot name
 532           {
 533             file_data_line = file_data_line.substr(space_split_i+1);
 534             space_split_i = file_data_line.find(" ");
 535             annot_value = file_data_line.substr(0,space_split_i);
 536             an_annot.type = annot_value;
 537           }
 538         }
 539         annots.push_back(an_annot);  // don't forget to actually add the annot
 540       }
 541       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 542       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 543     }
 544   }
 545   load_file.close();
 546 }
 547
 548
 549 string
 550 Sequence::rc_motif(string a_motif)
 551 {
 552   string rev_comp;
 553   char conversionTable[257];
 554   int seq_i, table_i, len;
 555
 556   len = a_motif.length();
 557   rev_comp.reserve(len);
 558
 559   for(table_i=0; table_i < 256; table_i++)
 560   {
 561     conversionTable[table_i] = '~';
 562   }
 563   // add end of string character for printing out table for testing purposes
 564   conversionTable[256] = '\0';
 565
 566   // add in the characters for the bases we want to convert (IUPAC)
 567   conversionTable[(int)'A'] = 'T';
 568   conversionTable[(int)'T'] = 'A';
 569   conversionTable[(int)'G'] = 'C';
 570   conversionTable[(int)'C'] = 'G';
 571   conversionTable[(int)'N'] = 'N';
 572   conversionTable[(int)'M'] = 'K';
 573   conversionTable[(int)'R'] = 'Y';
 574   conversionTable[(int)'W'] = 'W';
 575   conversionTable[(int)'S'] = 'S';
 576   conversionTable[(int)'Y'] = 'R';
 577   conversionTable[(int)'K'] = 'M';
 578   conversionTable[(int)'V'] = 'B';
 579   conversionTable[(int)'H'] = 'D';
 580   conversionTable[(int)'D'] = 'H';
 581   conversionTable[(int)'B'] = 'V';
 582
 583   // finally, the actual conversion loop
 584   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 585   {
 586     //cout << "** i = " << seq_i << " bp = " <<
 587     table_i = (int) a_motif[seq_i];
 588     rev_comp += conversionTable[table_i];
 589   }
 590
 591   //cout << "seq: " << a_motif << endl;
 592   //cout << "rc:  " << rev_comp << endl;
 593
 594   return rev_comp;
 595 }
 596
 597 string
 598 Sequence::motif_normalize(string a_motif)
 599 {
 600   string valid_motif;
 601   int seq_i, len;
 602
 603   len = a_motif.length();
 604   valid_motif.reserve(len);
 605
 606   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 607   // current nonIUPAC symbols are omitted, which is not reported atm
 608   for(seq_i = 0; seq_i < len; seq_i++)
 609   {
 610     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 611       valid_motif += 'A';
 612     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 613       valid_motif += 'T';
 614     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 615       valid_motif += 'G';
 616     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 617       valid_motif += 'C';
 618     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 619       valid_motif += 'N';
 620     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 621       valid_motif += 'M';
 622     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 623       valid_motif += 'R';
 624     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 625       valid_motif += 'W';
 626     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 627       valid_motif += 'S';
 628     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 629       valid_motif += 'Y';
 630     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 631       valid_motif += 'G';
 632     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 633       valid_motif += 'V';
 634     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 635       valid_motif += 'H';
 636     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 637       valid_motif += 'D';
 638     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 639       valid_motif += 'B';
 640     else {
 641       string msg = "Letter ";
 642       msg += a_motif[seq_i];
 643       msg += " is not a valid IUPAC symbol";
 644       throw motif_normalize_error(msg);
 645     }
 646   }
 647   //cout << "valid_motif is: " << valid_motif << endl;
 648   return valid_motif;
 649 }
 650
 651 void Sequence::add_motif(string a_motif)
 652 {
 653   vector<int> motif_starts = find_motif(a_motif);
 654
 655   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 656       motif_start_i != motif_starts.end();
 657       ++motif_start_i)
 658   {
 659     motif_list.push_back(motif(*motif_start_i, a_motif));
 660   }
 661 }
 662
 663 void Sequence::clear_motifs()
 664 {
 665   motif_list.clear();
 666 }
 667
 668 const list<motif>& Sequence::motifs() const
 669 {
 670   return motif_list;
 671 }
 672
 673 vector<int>
 674 Sequence::find_motif(string a_motif)
 675 {
 676   vector<int> motif_match_starts;
 677   string a_motif_rc;
 678
 679   motif_match_starts.clear();
 680
 681   //cout << "motif is: " << a_motif << endl;
 682   a_motif = motif_normalize(a_motif);
 683   //cout << "motif is: " << a_motif << endl;
 684
 685   if (a_motif != "")
 686   {
 687     //cout << "Sequence: none blank motif\n";
 688     motif_scan(a_motif, &motif_match_starts);
 689
 690     a_motif_rc = rc_motif(a_motif);
 691     // make sure not to do search again if it is a palindrome
 692     if (a_motif_rc != a_motif)
 693       motif_scan(a_motif_rc, &motif_match_starts);
 694   }
 695   return motif_match_starts;
 696 }
 697
 698 void
 699 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 700 {
 701   char * seq_c;
 702   string::size_type seq_i;
 703   int motif_i, motif_len;
 704
 705   // faster to loop thru the sequence as a old c string (ie char array)
 706   seq_c = (char*)sequence.c_str();
 707   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 708   motif_len = a_motif.length();
 709
 710   //cout << "motif_length: " << motif_len << endl;
 711   //cout << "RAAARRRRR\n";
 712
 713   motif_i = 0;
 714
 715   //cout << "motif: " << a_motif << endl;
 716
 717   //cout << "Sequence: motif, length= " << length << endl;
 718   seq_i = 0;
 719   while (seq_i < sequence.length())
 720   {
 721     //cout << seq_c[seq_i];
 722     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 723     // this is pretty much a straight translation of Nora's python code
 724     // to match iupac letter codes
 725     if (a_motif[motif_i] =='N')
 726       motif_i++;
 727     else if (a_motif[motif_i] == seq_c[seq_i])
 728       motif_i++;
 729     else if ((a_motif[motif_i] =='M') &&
 730              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 731       motif_i++;
 732     else if ((a_motif[motif_i] =='R') &&
 733              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 734       motif_i++;
 735     else if ((a_motif[motif_i] =='W') &&
 736              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 737       motif_i++;
 738     else if ((a_motif[motif_i] =='S') &&
 739              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 740       motif_i++;
 741     else if ((a_motif[motif_i] =='Y') &&
 742              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 743       motif_i++;
 744     else if ((a_motif[motif_i] =='K') &&
 745              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 746       motif_i++;
 747     else if ((a_motif[motif_i] =='V') &&
 748              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 749               (seq_c[seq_i]=='G')))
 750       motif_i++;
 751     else if ((a_motif[seq_i] =='H') &&
 752              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 753               (seq_c[seq_i]=='T')))
 754       motif_i++;
 755     else if ((a_motif[motif_i] =='D') &&
 756              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 757               (seq_c[seq_i]=='T')))
 758       motif_i++;
 759     else if ((a_motif[motif_i] =='B') &&
 760              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 761               (seq_c[seq_i]=='T')))
 762       motif_i++;
 763
 764     else
 765     {
 766       seq_i -= motif_i;
 767       motif_i = 0;
 768     }
 769
 770     // end Nora stuff, now we see if a match is found this pass
 771     if (motif_i == motif_len)
 772     {
 773       //cout << "!!";
 774       annot new_motif;
 775       motif_match_starts->push_back(seq_i - motif_len + 1);
 776       motif_i = 0;
 777     }
 778
 779     seq_i++;
 780   }
 781   //cout << endl;
 782 }
 783