alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25
  26 #include "alg/sequence.hpp"
  27 #include "mussa_exceptions.hpp"
  28
  29 #include <string>
  30 #include <iostream>
  31
  32 namespace fs = boost::filesystem;
  33 using namespace std;
  34
  35 annot::annot()
  36  : start(0),
  37    end(0),
  38    type(""),
  39    name("")
  40 {
  41 }
  42
  43 annot::annot(int start, int end, std::string type, std::string name)
  44  : start(start),
  45    end(end),
  46    type(type),
  47    name(name)
  48 {
  49 }
  50
  51 motif::motif(int start, std::string motif)
  52  : annot(start, start+motif.size(), "motif", motif),
  53    sequence(motif)
  54 {
  55 }
  56
  57 Sequence::Sequence()
  58   : sequence(""),
  59     header(""),
  60     species("")
  61 {
  62   annots.clear();
  63   motif_list.clear();
  64 }
  65
  66 Sequence::Sequence(string seq)
  67 {
  68   set_filtered_sequence(seq);
  69 }
  70
  71 Sequence &Sequence::operator=(const Sequence& s)
  72 {
  73   if (this != &s) {
  74     sequence = s.sequence;
  75     header = s.header;
  76     species = s.species;
  77     annots = s.annots;
  78   }
  79   return *this;
  80 }
  81
  82 Sequence &Sequence::operator=(const std::string& s)
  83 {
  84   set_filtered_sequence(s);
  85   return *this;
  86 }
  87
  88 char Sequence::operator[](int index) const
  89 {
  90   return sequence[index];
  91 }
  92
  93 ostream& operator<<(ostream& out, const Sequence& seq)
  94 {
  95   out << "Sequence(" << seq.get_seq() << ")";
  96   return out;
  97 }
  98
  99 //! load a fasta file into a sequence
 100 /*!
 101  * \param file_path the location of the fasta file in the filesystem
 102  * \param seq_num which sequence in the file to load
 103  * \param start_index starting position in the fasta sequence, 0 for beginning
 104  * \param end_index ending position in the fasta sequence, 0 for end
 105  * \return error message, empty string if no error. (gag!)
 106  */
 107 void
 108 Sequence::load_fasta(fs::path file_path, int seq_num,
 109                      int start_index, int end_index)
 110 {
 111   fs::fstream data_file;
 112   string file_data_line;
 113   int header_counter = 0;
 114   bool read_seq = true;
 115   string rev_comp;
 116   string sequence_raw;
 117   string seq_tmp;             // holds sequence during basic filtering
 118
 119   data_file.open(file_path, ios::in);
 120
 121   if (seq_num == 0) {
 122     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 123   }
 124   if (!data_file)
 125   {
 126     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 127   }
 128   // if file opened okay, read it
 129   else
 130   {
 131     // search for the header of the fasta sequence we want
 132     while ( (!data_file.eof()) && (header_counter < seq_num) )
 133     {
 134       getline(data_file,file_data_line);
 135       if (file_data_line.substr(0,1) == ">")
 136         header_counter++;
 137     }
 138
 139     header = file_data_line.substr(1);
 140
 141     sequence_raw = "";
 142
 143     while ( !data_file.eof() && read_seq )
 144     {
 145       getline(data_file,file_data_line);
 146       if (file_data_line.substr(0,1) == ">")
 147         read_seq = false;
 148       else sequence_raw += file_data_line;
 149     }
 150
 151     data_file.close();
 152
 153     // Lastly, if subselection of the sequence was specified we keep cut out
 154     // and only keep that part
 155     // end_index = 0 means no end was specified, so cut to the end
 156     if (end_index == 0)
 157       end_index = sequence_raw.size();
 158
 159     // sequence filtering for upcasing agctn and convert non AGCTN to N
 160     set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
 161   }
 162 }
 163
 164 void Sequence::set_filtered_sequence(const string &old_seq,
 165                                      string::size_type start,
 166                                      string::size_type count)
 167 {
 168   char conversionTable[257];
 169
 170   if ( count == 0)
 171     count = old_seq.size() - start;
 172   sequence.clear();
 173   sequence.reserve(count);
 174
 175   // Make a conversion table
 176
 177   // everything we don't specify below will become 'N'
 178   for(int table_i=0; table_i < 256; table_i++)
 179   {
 180     conversionTable[table_i] = 'N';
 181   }
 182   // add end of string character for printing out table for testing purposes
 183   conversionTable[256] = '\0';
 184
 185   // we want these to map to themselves - ie not to change
 186   conversionTable[(int)'A'] = 'A';
 187   conversionTable[(int)'T'] = 'T';
 188   conversionTable[(int)'G'] = 'G';
 189   conversionTable[(int)'C'] = 'C';
 190   // this is to upcase
 191   conversionTable[(int)'a'] = 'A';
 192   conversionTable[(int)'t'] = 'T';
 193   conversionTable[(int)'g'] = 'G';
 194   conversionTable[(int)'c'] = 'C';
 195
 196   // finally, the actual conversion loop
 197   for(string::size_type seq_index = 0; seq_index < count; seq_index++)
 198   {
 199     sequence += conversionTable[ (int)old_seq[seq_index+start]];
 200   }
 201 }
 202
 203   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 204   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 205
 206
 207 void
 208 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 209 {
 210   fs::fstream data_file;
 211   string file_data_line;
 212   annot an_annot;
 213   string::size_type space_split_i;
 214   string annot_value;
 215   list<annot>::iterator list_i;
 216   string err_msg;
 217
 218
 219   annots.clear();
 220   data_file.open(file_path, ios::in);
 221
 222   if (!data_file)
 223   {
 224     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 225   }
 226   // if file opened okay, read it
 227   else
 228   {
 229     getline(data_file,file_data_line);
 230     species = file_data_line;
 231
 232     // end_index = 0 means no end was specified, so cut to the end
 233     if (end_index == 0)
 234       end_index = sequence.length();
 235
 236     //cout << "START: " << start_index << " END: " << end_index << endl;
 237
 238     while ( !data_file.eof() )
 239     {
 240       getline(data_file,file_data_line);
 241       if (file_data_line != "")
 242       {
 243         // need to get 4 values...almost same code 4 times...
 244         // get annot start index
 245         space_split_i = file_data_line.find(" ");
 246         annot_value = file_data_line.substr(0,space_split_i);
 247         an_annot.start = atoi (annot_value.c_str());
 248         file_data_line = file_data_line.substr(space_split_i+1);
 249         // get annot end index
 250         space_split_i = file_data_line.find(" ");
 251         annot_value = file_data_line.substr(0,space_split_i);
 252         an_annot.end = atoi (annot_value.c_str());
 253         file_data_line = file_data_line.substr(space_split_i+1);
 254
 255         //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 256               //     << endl;
 257
 258         // get annot name
 259         space_split_i = file_data_line.find(" ");
 260         if (space_split_i == string::npos)  // no entries for name & type
 261         {
 262           cout << "seq, annots - no name or type\n";
 263           an_annot.name = "";
 264           an_annot.type = "";
 265         }
 266         else
 267         {
 268           annot_value = file_data_line.substr(0,space_split_i);
 269           an_annot.name = annot_value;
 270           file_data_line = file_data_line.substr(space_split_i+1);
 271           // get annot type
 272           space_split_i = file_data_line.find(" ");
 273           if (space_split_i == string::npos)  // no entry for type
 274             an_annot.type = "";
 275           else
 276           {
 277             annot_value = file_data_line.substr(0,space_split_i);
 278             an_annot.type = annot_value;
 279           }
 280         }
 281
 282
 283         // add annot to list if it falls within the range of sequence specified
 284         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 285         {
 286           an_annot.start -= start_index;
 287           an_annot.end -= start_index;
 288           annots.push_back(an_annot);
 289         }
 290         // else no (or bogus) annotations
 291       }
 292     }
 293
 294     data_file.close();
 295     /*
 296     // debugging check
 297     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 298     {
 299       cout << (*list_i).start << "," << (*list_i).end << "\t";
 300       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 301     }
 302     */
 303   }
 304 }
 305
 306 const std::string& Sequence::get_species() const
 307 {
 308   return species;
 309 }
 310
 311 bool Sequence::empty() const
 312 {
 313   return (size() == 0);
 314 }
 315
 316 const std::list<annot>& Sequence::annotations() const
 317 {
 318   return annots;
 319 }
 320
 321 string::size_type Sequence::length() const
 322 {
 323   return size();
 324 }
 325
 326 string::size_type Sequence::size() const
 327 {
 328   return sequence.size();
 329 }
 330
 331 Sequence::iterator Sequence::begin()
 332 {
 333   return sequence.begin();
 334 }
 335
 336 Sequence::const_iterator Sequence::begin() const
 337 {
 338   return sequence.begin();
 339 }
 340
 341 Sequence::iterator Sequence::end()
 342 {
 343   return sequence.end();
 344 }
 345
 346 Sequence::const_iterator Sequence::end() const
 347 {
 348   return sequence.end();
 349 }
 350
 351
 352 const string&
 353 Sequence::get_seq() const
 354 {
 355   return sequence;
 356 }
 357
 358
 359 string
 360 Sequence::subseq(int start, int end) const
 361 {
 362   return sequence.substr(start, end);
 363 }
 364
 365
 366 const char *
 367 Sequence::c_seq() const
 368 {
 369   return sequence.c_str();
 370 }
 371
 372 string
 373 Sequence::rev_comp() const
 374 {
 375   string rev_comp;
 376   char conversionTable[257];
 377   int seq_i, table_i, len;
 378
 379   len = sequence.length();
 380   rev_comp.reserve(len);
 381   // make a conversion table
 382   // init all parts of conversion table to '~' character
 383   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 384   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 385   // seven generations of their progeny) who decides to make it mean
 386   // something special!!!
 387   // PS - double the curse for any smartass non-biologist who tries it as well
 388   for(table_i=0; table_i < 256; table_i++)
 389   {
 390     conversionTable[table_i] = '~';
 391   }
 392   // add end of string character for printing out table for testing purposes
 393   conversionTable[256] = '\0';
 394
 395   // add in the characters for the bases we want to convert
 396   conversionTable[(int)'A'] = 'T';
 397   conversionTable[(int)'T'] = 'A';
 398   conversionTable[(int)'G'] = 'C';
 399   conversionTable[(int)'C'] = 'G';
 400   conversionTable[(int)'N'] = 'N';
 401
 402   // finally, the actual conversion loop
 403   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 404   {
 405     table_i = (int) sequence[seq_i];
 406     rev_comp += conversionTable[table_i];
 407   }
 408
 409   return rev_comp;
 410 }
 411
 412
 413 const string&
 414 Sequence::get_header() const
 415 {
 416   return header;
 417 }
 418 /*
 419 //FIXME: i don't think this code is callable
 420 string
 421 Sequence::sp_name() const
 422 {
 423   return species;
 424 }
 425 */
 426
 427 void
 428 Sequence::set_seq(const string& a_seq)
 429 {
 430   set_filtered_sequence(a_seq);
 431 }
 432
 433
 434 /*
 435 string
 436 Sequence::species()
 437 {
 438   return species;
 439 }
 440 */
 441
 442 void
 443 Sequence::clear()
 444 {
 445   sequence = "";
 446   header = "";
 447   species = "";
 448   annots.clear();
 449 }
 450
 451 void
 452 Sequence::save(fs::fstream &save_file)
 453                //string save_file_path)
 454 {
 455   //fstream save_file;
 456   list<annot>::iterator annots_i;
 457
 458   // not sure why, or if i'm doing something wrong, but can't seem to pass
 459   // file pointers down to this method from the mussa control class
 460   // so each call to save a sequence appends to the file started by mussa_class
 461   //save_file.open(save_file_path.c_str(), ios::app);
 462
 463   save_file << "<Sequence>" << endl;
 464   save_file << sequence << endl;
 465   save_file << "</Sequence>" << endl;
 466
 467   save_file << "<Annotations>" << endl;
 468   save_file << species << endl;
 469   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 470   {
 471     save_file << annots_i->start << " " << annots_i->end << " " ;
 472     save_file << annots_i->name << " " << annots_i->type << endl;
 473   }
 474   save_file << "</Annotations>" << endl;
 475   //save_file.close();
 476 }
 477
 478 void
 479 Sequence::load_museq(fs::path load_file_path, int seq_num)
 480 {
 481   fs::fstream load_file;
 482   string file_data_line;
 483   int seq_counter;
 484   annot an_annot;
 485   string::size_type space_split_i;
 486   string annot_value;
 487
 488   annots.clear();
 489   load_file.open(load_file_path, ios::in);
 490
 491   seq_counter = 0;
 492   // search for the seq_num-th sequence
 493   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 494   {
 495     getline(load_file,file_data_line);
 496     if (file_data_line == "<Sequence>")
 497       seq_counter++;
 498   }
 499   getline(load_file, file_data_line);
 500   sequence = file_data_line;
 501   getline(load_file, file_data_line);
 502   getline(load_file, file_data_line);
 503   if (file_data_line == "<Annotations>")
 504   {
 505     getline(load_file, file_data_line);
 506     species = file_data_line;
 507     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 508     {
 509       getline(load_file,file_data_line);
 510       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 511       {
 512         // need to get 4 values...almost same code 4 times...
 513         // get annot start index
 514         space_split_i = file_data_line.find(" ");
 515         annot_value = file_data_line.substr(0,space_split_i);
 516         an_annot.start = atoi (annot_value.c_str());
 517         file_data_line = file_data_line.substr(space_split_i+1);
 518         // get annot end index
 519         space_split_i = file_data_line.find(" ");
 520         annot_value = file_data_line.substr(0,space_split_i);
 521         an_annot.end = atoi (annot_value.c_str());
 522
 523         if (space_split_i == string::npos)  // no entry for type or name
 524         {
 525           cout << "seq, annots - no type or name\n";
 526           an_annot.type = "";
 527           an_annot.name = "";
 528         }
 529         else   // else get annot type
 530         {
 531           file_data_line = file_data_line.substr(space_split_i+1);
 532           space_split_i = file_data_line.find(" ");
 533           annot_value = file_data_line.substr(0,space_split_i);
 534           an_annot.type = annot_value;
 535           if (space_split_i == string::npos)  // no entry for name
 536           {
 537             cout << "seq, annots - no name\n";
 538             an_annot.name = "";
 539           }
 540           else          // get annot name
 541           {
 542             file_data_line = file_data_line.substr(space_split_i+1);
 543             space_split_i = file_data_line.find(" ");
 544             annot_value = file_data_line.substr(0,space_split_i);
 545             an_annot.type = annot_value;
 546           }
 547         }
 548         annots.push_back(an_annot);  // don't forget to actually add the annot
 549       }
 550       //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 551       //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
 552     }
 553   }
 554   load_file.close();
 555 }
 556
 557
 558 string
 559 Sequence::rc_motif(string a_motif)
 560 {
 561   string rev_comp;
 562   char conversionTable[257];
 563   int seq_i, table_i, len;
 564
 565   len = a_motif.length();
 566   rev_comp.reserve(len);
 567
 568   for(table_i=0; table_i < 256; table_i++)
 569   {
 570     conversionTable[table_i] = '~';
 571   }
 572   // add end of string character for printing out table for testing purposes
 573   conversionTable[256] = '\0';
 574
 575   // add in the characters for the bases we want to convert (IUPAC)
 576   conversionTable[(int)'A'] = 'T';
 577   conversionTable[(int)'T'] = 'A';
 578   conversionTable[(int)'G'] = 'C';
 579   conversionTable[(int)'C'] = 'G';
 580   conversionTable[(int)'N'] = 'N';
 581   conversionTable[(int)'M'] = 'K';
 582   conversionTable[(int)'R'] = 'Y';
 583   conversionTable[(int)'W'] = 'W';
 584   conversionTable[(int)'S'] = 'S';
 585   conversionTable[(int)'Y'] = 'R';
 586   conversionTable[(int)'K'] = 'M';
 587   conversionTable[(int)'V'] = 'B';
 588   conversionTable[(int)'H'] = 'D';
 589   conversionTable[(int)'D'] = 'H';
 590   conversionTable[(int)'B'] = 'V';
 591
 592   // finally, the actual conversion loop
 593   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 594   {
 595     //cout << "** i = " << seq_i << " bp = " <<
 596     table_i = (int) a_motif[seq_i];
 597     rev_comp += conversionTable[table_i];
 598   }
 599
 600   //cout << "seq: " << a_motif << endl;
 601   //cout << "rc:  " << rev_comp << endl;
 602
 603   return rev_comp;
 604 }
 605
 606 string
 607 Sequence::motif_normalize(string a_motif)
 608 {
 609   string valid_motif;
 610   int seq_i, len;
 611
 612   len = a_motif.length();
 613   valid_motif.reserve(len);
 614
 615   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 616   // current nonIUPAC symbols are omitted, which is not reported atm
 617   for(seq_i = 0; seq_i < len; seq_i++)
 618   {
 619     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 620       valid_motif += 'A';
 621     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 622       valid_motif += 'T';
 623     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 624       valid_motif += 'G';
 625     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 626       valid_motif += 'C';
 627     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 628       valid_motif += 'N';
 629     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 630       valid_motif += 'M';
 631     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 632       valid_motif += 'R';
 633     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 634       valid_motif += 'W';
 635     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 636       valid_motif += 'S';
 637     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 638       valid_motif += 'Y';
 639     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 640       valid_motif += 'G';
 641     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 642       valid_motif += 'V';
 643     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 644       valid_motif += 'H';
 645     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 646       valid_motif += 'D';
 647     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 648       valid_motif += 'B';
 649     else {
 650       string msg = "Letter ";
 651       msg += a_motif[seq_i];
 652       msg += " is not a valid IUPAC symbol";
 653       throw motif_normalize_error(msg);
 654     }
 655   }
 656   //cout << "valid_motif is: " << valid_motif << endl;
 657   return valid_motif;
 658 }
 659
 660 void Sequence::add_motif(string a_motif)
 661 {
 662   vector<int> motif_starts = find_motif(a_motif);
 663
 664   for(vector<int>::iterator motif_start_i = motif_starts.begin();
 665       motif_start_i != motif_starts.end();
 666       ++motif_start_i)
 667   {
 668     motif_list.push_back(motif(*motif_start_i, a_motif));
 669   }
 670 }
 671
 672 void Sequence::clear_motifs()
 673 {
 674   motif_list.clear();
 675 }
 676
 677 const list<motif>& Sequence::motifs() const
 678 {
 679   return motif_list;
 680 }
 681
 682 vector<int>
 683 Sequence::find_motif(string a_motif)
 684 {
 685   vector<int> motif_match_starts;
 686   string a_motif_rc;
 687
 688   motif_match_starts.clear();
 689
 690   //cout << "motif is: " << a_motif << endl;
 691   a_motif = motif_normalize(a_motif);
 692   //cout << "motif is: " << a_motif << endl;
 693
 694   if (a_motif != "")
 695   {
 696     //cout << "Sequence: none blank motif\n";
 697     motif_scan(a_motif, &motif_match_starts);
 698
 699     a_motif_rc = rc_motif(a_motif);
 700     // make sure not to do search again if it is a palindrome
 701     if (a_motif_rc != a_motif)
 702       motif_scan(a_motif_rc, &motif_match_starts);
 703   }
 704   return motif_match_starts;
 705 }
 706
 707 void
 708 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 709 {
 710   char * seq_c;
 711   string::size_type seq_i;
 712   int motif_i, motif_len;
 713
 714   // faster to loop thru the sequence as a old c string (ie char array)
 715   seq_c = (char*)sequence.c_str();
 716   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 717   motif_len = a_motif.length();
 718
 719   //cout << "motif_length: " << motif_len << endl;
 720   //cout << "RAAARRRRR\n";
 721
 722   motif_i = 0;
 723
 724   //cout << "motif: " << a_motif << endl;
 725
 726   //cout << "Sequence: motif, length= " << length << endl;
 727   seq_i = 0;
 728   while (seq_i < sequence.length())
 729   {
 730     //cout << seq_c[seq_i];
 731     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 732     // this is pretty much a straight translation of Nora's python code
 733     // to match iupac letter codes
 734     if (a_motif[motif_i] =='N')
 735       motif_i++;
 736     else if (a_motif[motif_i] == seq_c[seq_i])
 737       motif_i++;
 738     else if ((a_motif[motif_i] =='M') &&
 739              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 740       motif_i++;
 741     else if ((a_motif[motif_i] =='R') &&
 742              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 743       motif_i++;
 744     else if ((a_motif[motif_i] =='W') &&
 745              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 746       motif_i++;
 747     else if ((a_motif[motif_i] =='S') &&
 748              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 749       motif_i++;
 750     else if ((a_motif[motif_i] =='Y') &&
 751              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 752       motif_i++;
 753     else if ((a_motif[motif_i] =='K') &&
 754              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 755       motif_i++;
 756     else if ((a_motif[motif_i] =='V') &&
 757              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 758               (seq_c[seq_i]=='G')))
 759       motif_i++;
 760     else if ((a_motif[seq_i] =='H') &&
 761              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 762               (seq_c[seq_i]=='T')))
 763       motif_i++;
 764     else if ((a_motif[motif_i] =='D') &&
 765              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 766               (seq_c[seq_i]=='T')))
 767       motif_i++;
 768     else if ((a_motif[motif_i] =='B') &&
 769              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 770               (seq_c[seq_i]=='T')))
 771       motif_i++;
 772
 773     else
 774     {
 775       seq_i -= motif_i;
 776       motif_i = 0;
 777     }
 778
 779     // end Nora stuff, now we see if a match is found this pass
 780     if (motif_i == motif_len)
 781     {
 782       //cout << "!!";
 783       annot new_motif;
 784       motif_match_starts->push_back(seq_i - motif_len + 1);
 785       motif_i = 0;
 786     }
 787
 788     seq_i++;
 789   }
 790   //cout << endl;
 791 }
 792