alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25 namespace fs = boost::filesystem;
  26
  27 #include <boost/spirit/core.hpp>
  28 #include <boost/spirit/actor/push_back_actor.hpp>
  29 #include <boost/spirit/iterator/file_iterator.hpp>
  30 #include <boost/spirit/utility/chset.hpp>
  31 namespace spirit = boost::spirit;
  32
  33 #include "alg/sequence.hpp"
  34 #include "mussa_exceptions.hpp"
  35
  36 #include <string>
  37 #include <stdexcept>
  38 #include <iostream>
  39 #include <sstream>
  40 #include <set>
  41
  42 annot::annot()
  43  : begin(0),
  44    end(0),
  45    type(""),
  46    name("")
  47 {
  48 }
  49
  50 annot::annot(int begin, int end, std::string type, std::string name)
  51  : begin(begin),
  52    end(end),
  53    type(type),
  54    name(name)
  55 {
  56 }
  57
  58 annot::~annot()
  59 {
  60 }
  61
  62 bool operator==(const annot& left, const annot& right)
  63 {
  64   return ((left.begin== right.begin) and
  65           (left.end == right.end) and
  66           (left.type == right.type) and
  67           (left.name == right.name));
  68 }
  69
  70 motif::motif(int begin, std::string motif)
  71  : annot(begin, begin+motif.size(), "motif", motif),
  72    sequence(motif)
  73 {
  74 }
  75
  76 motif::~motif()
  77 {
  78 }
  79
  80
  81 Sequence::Sequence(alphabet_ref alphabet_)
  82   : parent(0),
  83     alphabet(alphabet_),
  84     seq_start(0),
  85     seq_count(0),
  86     strand(UnknownStrand)
  87 {
  88 }
  89
  90 Sequence::~Sequence()
  91 {
  92 }
  93
  94 Sequence::Sequence(const char *seq, alphabet_ref alphabet_)
  95   : parent(0),
  96     alphabet(alphabet_),
  97     seq_start(0),
  98     seq_count(0),
  99     strand(UnknownStrand),
 100     header(""),
 101     species("")
 102 {
 103   set_filtered_sequence(seq, alphabet);
 104 }
 105
 106 Sequence::Sequence(const std::string& seq, alphabet_ref alphabet_)
 107   : parent(0),
 108     alphabet(alphabet_),
 109     seq_start(0),
 110     seq_count(0),
 111     strand(UnknownStrand),
 112     header(""),
 113     species("")
 114 {
 115   set_filtered_sequence(seq, alphabet);
 116 }
 117
 118 Sequence::Sequence(const Sequence& o)
 119   : parent(o.parent),
 120     seq(o.seq),
 121     alphabet(o.alphabet),
 122     seq_start(o.seq_start),
 123     seq_count(o.seq_count),
 124     strand(o.strand),
 125     header(o.header),
 126     species(o.species),
 127     annots(o.annots),
 128     motif_list(o.motif_list)
 129 {
 130 }
 131
 132 Sequence &Sequence::operator=(const Sequence& s)
 133 {
 134   if (this != &s) {
 135     parent = s.parent;
 136     seq = s.seq;
 137     alphabet = s.alphabet;
 138     seq_start = s.seq_start;
 139     seq_count = s.seq_count;
 140     strand = s.strand;
 141     header = s.header;
 142     species = s.species;
 143     annots = s.annots;
 144     motif_list = s.motif_list;
 145   }
 146   return *this;
 147 }
 148
 149 static void multiplatform_getline(std::istream& in, std::string& line)
 150 {
 151   line.clear();
 152   char c;
 153   in.get(c);
 154   while(in.good() and !(c == '\012' or c == '\015') ) {
 155     line.push_back(c);
 156     in.get(c);
 157   }
 158   // if we have cr-lf eat it
 159   c = in.peek();
 160   if (c=='\012' or c == '\015') {
 161     in.get();
 162   }
 163 }
 164
 165 void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
 166 {
 167   load_fasta(file_path, alphabet, seq_num, start_index, end_index);
 168 }
 169
 170 //! load a fasta file into a sequence
 171 void Sequence::load_fasta(fs::path file_path, alphabet_ref a,
 172                           int seq_num, int start_index, int end_index)
 173 {
 174   fs::fstream data_file;
 175   data_file.open(file_path, std::ios::in);
 176
 177   if (!data_file.good())
 178   {
 179     throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
 180   } else {
 181     try {
 182       load_fasta(data_file, a, seq_num, start_index, end_index);
 183     } catch(sequence_empty_error e) {
 184       // there doesn't appear to be any sequence
 185       // catch and rethrow to include the filename
 186       std::stringstream msg;
 187       msg << "The selected sequence in "
 188           << file_path.native_file_string()
 189           << " appears to be empty";
 190       throw sequence_empty_error(msg.str());
 191     } catch(sequence_empty_file_error e) {
 192       std::stringstream errormsg;
 193       errormsg << file_path.native_file_string()
 194                << " did not have any fasta sequences" << std::endl;
 195       throw sequence_empty_file_error(errormsg.str());
 196     }
 197   }
 198 }
 199
 200 void Sequence::load_fasta(std::iostream& file,
 201                           int seq_num, int start_index, int end_index)
 202 {
 203   load_fasta(file, alphabet, seq_num, start_index, end_index);
 204 }
 205
 206 void
 207 Sequence::load_fasta(std::iostream& data_file, alphabet_ref a,
 208                      int seq_num,
 209                      int start_index, int end_index)
 210 {
 211   std::string file_data_line;
 212   int header_counter = 0;
 213   bool read_seq = true;
 214   std::string rev_comp;
 215   std::string sequence_raw;
 216   std::string seq_tmp;      // holds sequence during basic filtering
 217   const Alphabet &alpha = get_alphabet(a);
 218
 219   if (seq_num == 0) {
 220     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 221   }
 222
 223   // search for the header of the fasta sequence we want
 224   while ( (!data_file.eof()) && (header_counter < seq_num) )
 225   {
 226     multiplatform_getline(data_file, file_data_line);
 227     if (file_data_line.substr(0,1) == ">")
 228       header_counter++;
 229   }
 230
 231   if (header_counter > 0) {
 232     header = file_data_line.substr(1);
 233
 234     sequence_raw = "";
 235
 236     while ( !data_file.eof() && read_seq ) {
 237       multiplatform_getline(data_file,file_data_line);
 238       if (file_data_line.substr(0,1) == ">")
 239         read_seq = false;
 240       else {
 241         for (std::string::const_iterator line_i = file_data_line.begin();
 242              line_i != file_data_line.end();
 243              ++line_i)
 244          {
 245            if(alpha.exists(*line_i)) {
 246              sequence_raw += *line_i;
 247            } else {
 248             throw sequence_invalid_load_error("Unrecognized characters in fasta sequence");
 249            }
 250          }
 251       }
 252     }
 253
 254     // Lastly, if subselection of the sequence was specified we keep cut out
 255     // and only keep that part
 256     // end_index = 0 means no end was specified, so cut to the end
 257     if (end_index == 0)
 258       end_index = sequence_raw.size();
 259
 260     // sequence filtering for upcasing agctn and convert non AGCTN to N
 261     if (end_index-start_index <= 0) {
 262       std::string msg("The selected sequence appears to be empty");
 263       throw sequence_empty_error(msg);
 264     }
 265     set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index);
 266   } else {
 267     std::string errormsg("There were no fasta sequences");
 268     throw sequence_empty_file_error(errormsg);
 269   }
 270 }
 271
 272 void Sequence::set_filtered_sequence(const std::string &in_seq,
 273                                      alphabet_ref alphabet_,
 274                                      size_type start,
 275                                      size_type count,
 276                                      strand_type strand_)
 277 {
 278   alphabet = alphabet_;
 279   if ( count == npos)
 280     count = in_seq.size() - start;
 281   boost::shared_ptr<seq_string> new_seq(new seq_string);
 282   new_seq->reserve(count);
 283
 284   // finally, the actual conversion loop
 285   const Alphabet& alpha_impl = get_alphabet(); // go get one of our actual alphabets
 286   std::string::const_iterator seq_i = in_seq.begin()+start;
 287   for(size_type i = 0; i != count; ++i, ++seq_i)
 288   {
 289     if (alpha_impl.exists(*seq_i)) {
 290       new_seq->append(1, *seq_i);
 291     } else {
 292       new_seq->append(1, 'N');
 293     }
 294   }
 295   parent = 0;
 296   seq = new_seq;
 297   seq_start = 0;
 298   seq_count = count;
 299   strand = strand_;
 300 }
 301
 302 void
 303 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 304 {
 305   fs::fstream data_stream(file_path, std::ios::in);
 306   if (!data_stream)
 307   {
 308     throw mussa_load_error("Sequence File: " + file_path.string() + " not found");
 309   }
 310
 311   // so i should probably be passing the parse function some iterators
 312   // but the annotations files are (currently) small, so i think i can
 313   // get away with loading the whole file into memory
 314   std::string data;
 315   char c;
 316   while(data_stream.good()) {
 317     data_stream.get(c);
 318     data.push_back(c);
 319   }
 320   data_stream.close();
 321
 322   parse_annot(data, start_index, end_index);
 323 }
 324
 325 /* If this works, yikes, this is some brain hurting code.
 326  *
 327  * what's going on is that when pb_annot is instantiated it stores references
 328  * to begin, end, name, type, declared in the parse function, then
 329  * when operator() is called it grabs values from those references
 330  * and uses that to instantiate an annot object and append that to our
 331  * annotation list.
 332  *
 333  * This weirdness is because the spirit library requires that actions
 334  * conform to a specific prototype operator()(IteratorT, IteratorT)
 335  * which doesn't provide any useful opportunity for me to actually
 336  * grab the results of our parsing.
 337  *
 338  * so I instantiate this structure in order to have a place to grab
 339  * my data from.
 340  */
 341
 342 struct push_back_annot {
 343   std::list<annot>& annot_list;
 344   int& begin;
 345   int& end;
 346   std::string& name;
 347   std::string& type;
 348
 349   push_back_annot(std::list<annot>& annot_list_,
 350                   int& begin_,
 351                   int& end_,
 352                   std::string& name_,
 353                   std::string& type_)
 354   : annot_list(annot_list_),
 355     begin(begin_),
 356     end(end_),
 357     name(name_),
 358     type(type_)
 359   {
 360   }
 361
 362   void operator()(std::string::const_iterator,
 363                   std::string::const_iterator) const
 364   {
 365     //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
 366     annot_list.push_back(annot(begin, end, name, type));
 367   };
 368 };
 369
 370 struct push_back_seq {
 371   std::list<Sequence>& seq_list;
 372   std::string& name;
 373   std::string& seq;
 374
 375   push_back_seq(std::list<Sequence>& seq_list_,
 376                 std::string& name_,
 377                 std::string& seq_)
 378   : seq_list(seq_list_),
 379     name(name_),
 380     seq(seq_)
 381   {
 382   }
 383
 384   void operator()(std::string::const_iterator,
 385                   std::string::const_iterator) const
 386   {
 387     // filter out newlines from our sequence
 388     std::string new_seq;
 389     for(std::string::const_iterator seq_i = seq.begin();
 390         seq_i != seq.end();
 391         ++seq_i)
 392     {
 393       if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
 394     }
 395     //std::cout << "adding seq: " << name << " " << new_seq << std::endl;
 396
 397     Sequence s(new_seq);
 398     s.set_fasta_header(name);
 399     seq_list.push_back(s);
 400   };
 401 };
 402
      //! Parse annotation text and record what it describes on this sequence.
      //!
      //! The grammar accepts, in order:
      //!  - an optional leading word (a letter followed by one or more graphic
      //!    characters) that is stored in the species member, then whitespace;
      //!  - any number of records, each one of:
      //!      * an html-like <...> tag, which is skipped;
      //!      * "start end name [type]", appended to annots;
      //!      * a fasta-like record introduced by '>' or "&gt;", collected in
      //!        query_seqs.
      //!
      //! After parsing, every collected query sequence is handed to
      //! find_sequences, whether or not the parse succeeded.
      //!
      //! start_index and end_index are not used by this function body.
      //!
      //! \return the "full" flag of the spirit parse result, i.e. whether the
      //!         whole of data was consumed by the grammar.
  403 bool
  404 Sequence::parse_annot(std::string data, int start_index, int end_index)
  405 {
      // targets the grammar's assign actions write into; the push_back_*
      // actions read them back through references
  406   int start=0;
  407   int end=0;
  408   std::string name;
  409   std::string type;
  410   std::string seq;
  411   std::list<Sequence> query_seqs;
  412
  413   bool status = spirit::parse(data.begin(), data.end(),
  414                 (
  415                  //begin grammar
  416                    !(
  417                       (
  418                         spirit::alpha_p >>
  419                         +(spirit::graph_p)
  420                       )[spirit::assign_a(species)] >>
  421                       +(spirit::space_p)
  422                     ) >>
  423                     *(
  424                        ( // ignore html tags
  425                          *(spirit::space_p) >>
  426                          spirit::ch_p('<') >>
  427                          +(~spirit::ch_p('>')) >>
  428                          spirit::ch_p('>') >>
  429                          *(spirit::space_p)
  430                        )
  431                      |
  432                       ( // parse an absolute location name
  433                        (spirit::uint_p[spirit::assign_a(start)] >>
  434                         +spirit::space_p >>
  435                         spirit::uint_p[spirit::assign_a(end)] >>
  436                         +spirit::space_p >>
  437                         (
  438                            spirit::alpha_p >>
  439                            *spirit::graph_p
  440                         )[spirit::assign_a(name)] >>
  441                         // optional type
                              // NOTE(review): type is only assigned when this
                              // optional part matches and is never cleared, so a
                              // record without a type appears to inherit the
                              // previous record's type - confirm if intended.
  442                         !(
  443                             +spirit::space_p >>
  444                             (
  445                               spirit::alpha_p >>
  446                               *spirit::graph_p
  447                             )[spirit::assign_a(type)]
  448                         )
  449                         // to understand how this group gets set
  450                         // read the comment above struct push_back_annot
                              // type and name are passed in the opposite order to
                              // push_back_annot's (name_, type_) parameters; its
                              // operator() in turn passes (name, type) to annot's
                              // (type, name) constructor, so the two swaps cancel.
  451                        )[push_back_annot(annots, start, end, type, name)]
  452                      |
                             // fasta-like record: header line, then one or more
                             // characters from Alphabet::nucleic_alphabet
  453                       ((spirit::ch_p('>')|spirit::str_p("&gt;")) >>
  454                          (*(spirit::print_p))[spirit::assign_a(name)] >>
  455                          spirit::eol_p >>
  456                          (+(spirit::chset<>(Alphabet::nucleic_alphabet.c_str())))[spirit::assign_a(seq)]
  457                        )[push_back_seq(query_seqs, name, seq)]
  458                       ) >>
  459                       *spirit::space_p
  460                      )
  461                 //end grammar
  462                 )).full;
  463
  464   // go search for query sequences
  465   find_sequences(query_seqs.begin(), query_seqs.end());
  466   return status;
  467 }
 468
 469 void Sequence::add_annotation(const annot& a)
 470 {
 471   annots.push_back(a);
 472 }
 473
 474 const std::list<annot>& Sequence::annotations() const
 475 {
 476   return annots;
 477 }
 478
 479 Sequence
 480 Sequence::subseq(int start, int count)
 481 {
 482   if (!seq) {
 483     Sequence new_seq;
 484     return new_seq;
 485   }
 486
 487   // there might be an off by one error with start+count > size()
 488   if ( count == npos || start+count > size()) {
 489     count = size()-start;
 490   }
 491   Sequence new_seq(*this);
 492   new_seq.parent = this;
 493   new_seq.seq_start = seq_start+start;
 494   new_seq.seq_count = count;
 495
 496   new_seq.motif_list = motif_list;
 497   new_seq.annots.clear();
 498   // attempt to copy & reannotate position based annotations
 499   int end = start+count;
 500
 501   for(std::list<annot>::const_iterator annot_i = annots.begin();
 502       annot_i != annots.end();
 503       ++annot_i)
 504   {
 505     int annot_begin= annot_i->begin;
 506     int annot_end = annot_i->end;
 507
 508     if (annot_begin < end) {
 509       if (annot_begin >= start) {
 510         annot_begin -= start;
 511       } else {
 512         annot_begin = 0;
 513       }
 514
 515       if (annot_end < end) {
 516         annot_end -= start;
 517       } else {
 518         annot_end = count;
 519       }
 520
 521       annot new_annot(annot_begin, annot_end, annot_i->type, annot_i->name);
 522       new_seq.annots.push_back(new_annot);
 523     }
 524   }
 525
 526   return new_seq;
 527 }
 528
 529 std::string Sequence::create_reverse_map() const
 530 {
 531   std::string rc_map(256, '~');
 532   // if we're rna, use U instead of T
 533   // we might want to add an "is_rna" to sequence at somepoint
 534   char TU = (alphabet == reduced_rna_alphabet) ? 'U' : 'T';
 535   char tu = (alphabet == reduced_rna_alphabet) ? 'u' : 't';
 536   rc_map['A'] = TU ; rc_map['a'] = tu ;
 537   rc_map['T'] = 'A'; rc_map['t'] = 'a';
 538   rc_map['U'] = 'A'; rc_map['u'] = 'a';
 539   rc_map['G'] = 'C'; rc_map['g'] = 'c';
 540   rc_map['C'] = 'G'; rc_map['c'] = 'g';
 541   rc_map['M'] = 'K'; rc_map['m'] = 'k';
 542   rc_map['R'] = 'Y'; rc_map['r'] = 'y';
 543   rc_map['W'] = 'W'; rc_map['w'] = 'w';
 544   rc_map['S'] = 'S'; rc_map['s'] = 's';
 545   rc_map['Y'] = 'R'; rc_map['y'] = 'r';
 546   rc_map['K'] = 'M'; rc_map['k'] = 'm';
 547   rc_map['V'] = 'B'; rc_map['v'] = 'b';
 548   rc_map['H'] = 'D'; rc_map['h'] = 'd';
 549   rc_map['D'] = 'H'; rc_map['d'] = 'h';
 550   rc_map['B'] = 'V'; rc_map['b'] = 'v';
 551   rc_map['N'] = 'N'; rc_map['n'] = 'n';
 552   rc_map['X'] = 'X'; rc_map['x'] = 'x';
 553   rc_map['?'] = '?';
 554   rc_map['.'] = '.';
 555   rc_map['-'] = '-';
 556   rc_map['~'] = '~'; // not really needed, but perhaps it's clearer.
 557   return rc_map;
 558 }
 559
 560 Sequence Sequence::rev_comp() const
 561 {
 562   std::string rev_comp;
 563   rev_comp.reserve(length());
 564
 565   std::string rc_map = create_reverse_map();
 566
 567   // reverse and convert
 568   seq_string::const_reverse_iterator seq_i;
 569   seq_string::const_reverse_iterator seq_end = seq->rend();
 570   for(seq_i = seq->rbegin();
 571       seq_i != seq_end;
 572       ++seq_i)
 573   {
 574     rev_comp.append(1, rc_map[*seq_i]);
 575   }
 576   return Sequence(rev_comp, alphabet);
 577 }
 578
 579 void Sequence::set_fasta_header(std::string header_)
 580 {
 581   header = header_;
 582 }
 583
 584 void Sequence::set_species(const std::string& name)
 585 {
 586   species = name;
 587 }
 588
 589 std::string Sequence::get_species() const
 590 {
 591   return species;
 592 }
 593
 594
 595 std::string
 596 Sequence::get_fasta_header() const
 597 {
 598   return header;
 599 }
 600
 601 std::string
 602 Sequence::get_name() const
 603 {
 604   if (header.size() > 0)
 605     return header;
 606   else if (species.size() > 0)
 607     return species;
 608   else
 609     return "";
 610 }
 611
 612 const Alphabet& Sequence::get_alphabet() const
 613 {
 614   return get_alphabet(alphabet);
 615 }
 616
 617 const Alphabet& Sequence::get_alphabet(alphabet_ref alpha) const
 618 {
 619   switch (alpha) {
 620     case reduced_dna_alphabet:
 621       return Alphabet::reduced_dna_alphabet;
 622     case reduced_rna_alphabet:
 623       return Alphabet::reduced_rna_alphabet;
 624     case reduced_nucleic_alphabet:
 625       return Alphabet::reduced_nucleic_alphabet;
 626     case nucleic_alphabet:
 627       return Alphabet::nucleic_alphabet;
 628     case protein_alphabet:
 629       return Alphabet::protein_alphabet;
 630     default:
 631       throw std::runtime_error("unrecognized alphabet type");
 632       break;
 633   }
 634 }
 635
 636 void Sequence::set_sequence(const std::string& s, alphabet_ref a)
 637 {
 638   set_filtered_sequence(s, a);
 639 }
 640
 641 std::string Sequence::get_sequence() const
 642 {
 643   if (seq)
 644     return *seq;
 645   else
 646     return std::string();
 647 }
 648
 649 Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
 650 {
 651   return at(i);
 652 }
 653
 654 Sequence::const_reference Sequence::at(Sequence::size_type i) const
 655 {
 656   if (!seq) throw std::out_of_range("empty sequence");
 657   return seq->at(i+seq_start);
 658 }
 659
 660 void
 661 Sequence::clear()
 662 {
 663   parent = 0;
 664   seq.reset();
 665   seq_start = 0;
 666   seq_count = 0;
 667   strand = UnknownStrand;
 668   header.clear();
 669   species.clear();
 670   annots.clear();
 671   motif_list.clear();
 672 }
 673
 674 const char *Sequence::c_str() const
 675 {
 676   if (seq)
 677     return seq->c_str()+seq_start;
 678   else
 679     return 0;
 680 }
 681
 682 Sequence::const_iterator Sequence::begin() const
 683 {
 684   if (seq and seq_count != 0)
 685     return seq->begin()+seq_start;
 686   else
 687     return Sequence::const_iterator(0);
 688 }
 689
 690 Sequence::const_iterator Sequence::end() const
 691 {
 692   if (seq and seq_count != 0) {
 693     return seq->begin() + seq_start + seq_count;
 694   } else {
 695     return Sequence::const_iterator(0);
 696   }
 697 }
 698
 699 bool Sequence::empty() const
 700 {
 701   return (seq_count == 0) ? true : false;
 702 }
 703
 704 Sequence::size_type Sequence::start() const
 705 {
 706   if (parent)
 707     return seq_start - parent->start();
 708   else
 709     return seq_start;
 710 }
 711
 712 Sequence::size_type Sequence::stop() const
 713 {
 714   return start() + seq_count;
 715 }
 716
 717 Sequence::size_type Sequence::size() const
 718 {
 719   return seq_count;
 720 }
 721
 722 Sequence::size_type Sequence::length() const
 723 {
 724   return size();
 725 }
 726
 727 void
 728 Sequence::save(fs::fstream &save_file)
 729 {
 730   //fstream save_file;
 731   std::list<annot>::iterator annots_i;
 732
 733   // not sure why, or if i'm doing something wrong, but can't seem to pass
 734   // file pointers down to this method from the mussa control class
 735   // so each call to save a sequence appends to the file started by mussa_class
 736   //save_file.open(save_file_path.c_str(), std::ios::app);
 737
 738   save_file << "<Sequence>" << std::endl;
 739   save_file << *this << std::endl;
 740   save_file << "</Sequence>" << std::endl;
 741
 742   save_file << "<Annotations>" << std::endl;
 743   save_file << species << std::endl;
 744   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 745   {
 746     save_file << annots_i->begin << " " << annots_i->end << " " ;
 747     save_file << annots_i->name << " " << annots_i->type << std::endl;
 748   }
 749   save_file << "</Annotations>" << std::endl;
 750   //save_file.close();
 751 }
 752
 753 void
 754 Sequence::load_museq(fs::path load_file_path, int seq_num)
 755 {
 756   fs::fstream load_file;
 757   std::string file_data_line;
 758   int seq_counter;
 759   annot an_annot;
 760   std::string::size_type space_split_i;
 761   std::string annot_value;
 762
 763   annots.clear();
 764   load_file.open(load_file_path, std::ios::in);
 765
 766   seq_counter = 0;
 767   // search for the seq_num-th sequence
 768   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 769   {
 770     getline(load_file,file_data_line);
 771     if (file_data_line == "<Sequence>")
 772       seq_counter++;
 773   }
 774   getline(load_file, file_data_line);
 775   // looks like the sequence is written as a single line
 776   set_filtered_sequence(file_data_line, reduced_dna_alphabet);
 777   getline(load_file, file_data_line);
 778   getline(load_file, file_data_line);
 779   if (file_data_line == "<Annotations>")
 780   {
 781     getline(load_file, file_data_line);
 782     species = file_data_line;
 783     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 784     {
 785       getline(load_file,file_data_line);
 786       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 787       {
 788         // need to get 4 values...almost same code 4 times...
 789         // get annot start index
 790         space_split_i = file_data_line.find(" ");
 791         annot_value = file_data_line.substr(0,space_split_i);
 792         an_annot.begin = atoi (annot_value.c_str());
 793         file_data_line = file_data_line.substr(space_split_i+1);
 794         // get annot end index
 795         space_split_i = file_data_line.find(" ");
 796         annot_value = file_data_line.substr(0,space_split_i);
 797         an_annot.end = atoi (annot_value.c_str());
 798
 799         if (space_split_i == std::string::npos)  // no entry for type or name
 800         {
 801           std::cout << "seq, annots - no type or name\n";
 802           an_annot.type = "";
 803           an_annot.name = "";
 804         }
 805         else   // else get annot type
 806         {
 807           file_data_line = file_data_line.substr(space_split_i+1);
 808           space_split_i = file_data_line.find(" ");
 809           annot_value = file_data_line.substr(0,space_split_i);
 810           an_annot.type = annot_value;
 811           if (space_split_i == std::string::npos)  // no entry for name
 812           {
 813             std::cout << "seq, annots - no name\n";
 814             an_annot.name = "";
 815           }
 816           else          // get annot name
 817           {
 818             file_data_line = file_data_line.substr(space_split_i+1);
 819             space_split_i = file_data_line.find(" ");
 820             annot_value = file_data_line.substr(0,space_split_i);
 821             an_annot.type = annot_value;
 822           }
 823         }
 824         annots.push_back(an_annot);  // don't forget to actually add the annot
 825       }
 826       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 827       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
 828     }
 829   }
 830   load_file.close();
 831 }
 832
 833
 834 void Sequence::add_motif(const Sequence& a_motif)
 835 {
 836   std::vector<int> motif_starts = find_motif(a_motif);
 837
 838   for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
 839       motif_start_i != motif_starts.end();
 840       ++motif_start_i)
 841   {
 842     motif_list.push_back(motif(*motif_start_i, a_motif.get_sequence()));
 843   }
 844 }
 845
 846 void Sequence::clear_motifs()
 847 {
 848   motif_list.clear();
 849 }
 850
 851 const std::list<motif>& Sequence::motifs() const
 852 {
 853   return motif_list;
 854 }
 855
 856 std::vector<int>
 857 Sequence::find_motif(const Sequence& a_motif) const
 858 {
 859   std::vector<int> motif_match_starts;
 860   Sequence norm_motif_rc;
 861
 862   motif_match_starts.clear();
 863   // std::cout << "motif is: " << norm_motif << std::endl;
 864
 865   if (a_motif.size() > 0)
 866   {
 867     //std::cout << "Sequence: none blank motif\n";
 868     motif_scan(a_motif, &motif_match_starts);
 869
 870     norm_motif_rc = a_motif.rev_comp();;
 871     // make sure not to do search again if it is a palindrome
 872     if (norm_motif_rc != a_motif) {
 873       motif_scan(norm_motif_rc, &motif_match_starts);
 874     }
 875   }
 876   return motif_match_starts;
 877 }
 878
 879 void
 880 Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
 881 {
 882   // if there's no sequence we can't scan for it?
 883   // should this throw an exception?
 884   if (!seq) return;
 885
 886   std::string::size_type seq_i = 0;
 887   Sequence::size_type motif_i = 0;
 888   Sequence::size_type motif_len = a_motif.length();
 889   Sequence::value_type motif_char;
 890   Sequence::value_type seq_char;
 891
 892   while (seq_i < seq->length())
 893   {
 894     // this is pretty much a straight translation of Nora's python code
 895     // to match iupac letter codes
 896     motif_char = toupper(a_motif[motif_i]);
 897     seq_char = toupper(seq->at(seq_i));
 898     if (motif_char =='N')
 899       motif_i++;
 900     else if (motif_char == seq_char)
 901       motif_i++;
 902     else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
 903       motif_i++;
 904     else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
 905       motif_i++;
 906     else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
 907       motif_i++;
 908     else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
 909       motif_i++;
 910     else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
 911       motif_i++;
 912     else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
 913       motif_i++;
 914     else if ((motif_char =='V') &&
 915              ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
 916       motif_i++;
 917     else if ((motif_char =='H') &&
 918              ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
 919       motif_i++;
 920     else if ((motif_char =='D') &&
 921              ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
 922       motif_i++;
 923     else if ((motif_char =='B') &&
 924              ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
 925       motif_i++;
 926     else
 927     {
 928       // if a motif doesn't match, erase our current trial and try again
 929       seq_i -= motif_i;
 930       motif_i = 0;
 931     }
 932
 933     // end Nora stuff, now we see if a match is found this pass
 934     if (motif_i == motif_len)
 935     {
 936       annot new_motif;
 937       motif_match_starts->push_back(seq_i - motif_len + 1);
 938       motif_i = 0;
 939     }
 940
 941     seq_i++;
 942   }
 943   //std::cout << std::endl;
 944 }
 945
 946 void Sequence::add_string_annotation(std::string a_seq,
 947                                      std::string name)
 948 {
 949   std::vector<int> seq_starts = find_motif(a_seq);
 950
 951   //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
 952
 953   for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
 954       seq_start_i != seq_starts.end();
 955       ++seq_start_i)
 956   {
 957     annots.push_back(annot(*seq_start_i,
 958                            *seq_start_i+a_seq.size(),
 959                            "",
 960                            name));
 961   }
 962 }
 963
 964 void Sequence::find_sequences(std::list<Sequence>::iterator start,
 965                               std::list<Sequence>::iterator end)
 966 {
 967   while (start != end) {
 968     add_string_annotation(start->get_sequence(), start->get_fasta_header());
 969     ++start;
 970   }
 971 }
 972
 973
 974 std::ostream& operator<<(std::ostream& out, const Sequence& s)
 975 {
 976   for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
 977     out << *s_i;
 978   }
 979   return out;
 980 }
 981
 982 bool operator<(const Sequence& x, const Sequence& y)
 983 {
 984   Sequence::const_iterator x_i = x.begin();
 985   Sequence::const_iterator y_i = y.begin();
 986   // for sequences there's some computation associated with computing .end
 987   // so lets cache it.
 988   Sequence::const_iterator xend = x.end();
 989   Sequence::const_iterator yend = y.end();
 990   while(1) {
 991     if( x_i == xend and y_i == yend ) {
 992       return false;
 993     } else if ( x_i == xend ) {
 994       return true;
 995     } else if ( y_i == yend ) {
 996       return false;
 997     } else if ( (*x_i) < (*y_i)) {
 998       return true;
 999     } else if ( (*x_i) > (*y_i) ) {
1000       return false;
1001     } else {
1002       ++x_i;
1003       ++y_i;
1004     }
1005   }
1006 }
1007
1008 bool operator==(const Sequence& x, const Sequence& y)
1009 {
1010   if (x.empty() and y.empty()) {
1011     // if there's no sequence in either sequence structure, they're equal
1012     return true;
1013   } else if (x.empty() or y.empty()) {
1014     // if we fail the first test, and we discover one is empty,
1015     // we know they can't be equal. (and we need to do this
1016     // to prevent dereferencing an empty pointer)
1017     return false;
1018   } else if (x.seq_count != y.seq_count) {
1019     // if they're of different lenghts, they're not equal
1020     return false;
1021   }
1022   Sequence::const_iterator xseq_i = x.begin();
1023   Sequence::const_iterator yseq_i = y.begin();
1024   // since the length of the two sequences is equal, we only need to
1025   // test one.
1026   for(; xseq_i != x.end(); ++xseq_i, ++yseq_i) {
1027     if (toupper(*xseq_i) != toupper(*yseq_i)) {
1028       return false;
1029     }
1030   }
1031   return true;
1032 }
1033
1034 bool operator!=(const Sequence& x, const Sequence& y)
1035 {
1036   return not operator==(x, y);
1037 }