alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25 #include <boost/filesystem/operations.hpp>
  26 namespace fs = boost::filesystem;
  27
  28 #include <boost/spirit/core.hpp>
  29 #include <boost/spirit/actor/push_back_actor.hpp>
  30 #include <boost/spirit/iterator/file_iterator.hpp>
  31 #include <boost/spirit/utility/chset.hpp>
  32 namespace spirit = boost::spirit;
  33
  34 #include "alg/sequence.hpp"
  35 #include "io.hpp"
  36 #include "mussa_exceptions.hpp"
  37
  38 #include <string>
  39 #include <stdexcept>
  40 #include <iostream>
  41 #include <sstream>
  42 #include <set>
  43
  44 bool operator==(const motif& left, const motif& right)
  45 {
  46   return ((left.begin== right.begin) and
  47           (left.end == right.end) and
  48           (left.type == right.type) and
  49           (left.name == right.name));
  50 }
  51
  52 motif::motif()
  53  : begin(0),
  54    end(0),
  55    type("motif"),
  56    name(""),
  57    sequence("")
  58 {
  59 }
  60
  61 motif::motif(int begin_, std::string motif)
  62  : begin(begin_),
  63    end(begin_+motif.size()),
  64    type("motif"),
  65    name(motif),
  66    sequence(motif)
  67 {
  68 }
  69
  70 motif::~motif()
  71 {
  72 }
  73
  74 Sequence::Sequence(AlphabetRef alphabet)
  75   : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)),
  76     annotation_list(new SeqSpanRefList),
  77     motif_list(new MotifList)
  78 {
  79 }
  80
  81 Sequence::~Sequence()
  82 {
  83 }
  84
  85 Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
  86   : header(""),
  87     species(""),
  88     annotation_list(new SeqSpanRefList),
  89     motif_list(new MotifList)
  90 {
  91   set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
  92 }
  93
  94 Sequence::Sequence(const std::string& seq,
  95                    AlphabetRef alphabet_,
  96                    SeqSpan::strand_type strand_)
  97   : header(""),
  98     species(""),
  99     annotation_list(new SeqSpanRefList),
 100     motif_list(new MotifList)
 101 {
 102   set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
 103 }
 104
 105 Sequence::Sequence(const Sequence& o)
 106   : seq(o.seq),
 107     header(o.header),
 108     species(o.species),
 109     annotation_list(o.annotation_list),
 110     motif_list(o.motif_list)
 111 {
 112 }
 113
 114 Sequence::Sequence(const Sequence* o)
 115   : seq(o->seq),
 116     header(o->header),
 117     species(o->species),
 118     annotation_list(o->annotation_list),
 119     motif_list(o->motif_list)
 120 {
 121 }
 122
 123 Sequence::Sequence(const SequenceRef o)
 124   : seq(new SeqSpan(o->seq)),
 125     header(o->header),
 126     species(o->species),
 127     annotation_list(new SeqSpanRefList),
 128     motif_list(o->motif_list)
 129 {
 130   // copy over the annotations in the other sequence ref,
 131   // attaching them to our current sequence ref
 132   for(SeqSpanRefList::const_iterator annot_i = o->annotation_list->begin();
 133       annot_i != o->annotation_list->end();
 134       ++annot_i)
 135   {
 136     size_type annot_begin= (*annot_i)->start();
 137     size_type annot_count = (*annot_i)->size();
 138
 139     SeqSpanRef new_annot(seq->subseq(annot_begin, annot_count));
 140     new_annot->setAnnotations((*annot_i)->annotations());
 141     annotation_list->push_back(new_annot);
 142   }
 143 }
 144
 145 Sequence::Sequence(const SeqSpanRef& seq_ref)
 146   : seq(seq_ref),
 147     header(""),
 148     species(""),
 149     annotation_list(new SeqSpanRefList),
 150     motif_list(new MotifList)
 151 {
 152 }
 153
 154 Sequence &Sequence::operator=(const Sequence& s)
 155 {
 156   if (this != &s) {
 157     seq = s.seq;
 158     header = s.header;
 159     species = s.species;
 160     annotation_list = s.annotation_list;
 161     motif_list = s.motif_list;
 162   }
 163   return *this;
 164 }
 165
 166 void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
 167 {
 168   load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 169 }
 170
 171 //! load a fasta file into a sequence
 172 void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
 173                           int seq_num, int start_index, int end_index)
 174 {
 175   fs::fstream data_file;
 176   data_file.open(file_path, std::ios::in);
 177
 178   if (!data_file.good())
 179   {
 180     throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
 181   } else {
 182     try {
 183       load_fasta(data_file, a, seq_num, start_index, end_index);
 184     } catch(sequence_empty_error e) {
 185       // there doesn't appear to be any sequence
 186       // catch and rethrow to include the filename
 187       std::stringstream msg;
 188       msg << "The selected sequence in "
 189           << file_path.native_file_string()
 190           << " appears to be empty";
 191       throw sequence_empty_error(msg.str());
 192     } catch(sequence_empty_file_error e) {
 193       std::stringstream errormsg;
 194       errormsg << file_path.native_file_string()
 195                << " did not have any fasta sequences" << std::endl;
 196       throw sequence_empty_file_error(errormsg.str());
 197     } catch(sequence_invalid_load_error e) {
 198       std::ostringstream msg;
 199       msg << file_path.native_file_string();
 200       msg << " " << e.what();
 201       throw sequence_invalid_load_error(msg.str());
 202     }
 203   }
 204 }
 205
 206 void Sequence::load_fasta(std::istream& file,
 207                           int seq_num, int start_index, int end_index)
 208 {
 209   load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 210 }
 211
 212 void
 213 Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
 214                      int seq_num,
 215                      int start_index, int end_index)
 216 {
 217   std::string file_data_line;
 218   int header_counter = 0;
 219   size_t line_counter = 0;
 220   bool read_seq = true;
 221   std::string rev_comp;
 222   std::string sequence_raw;
 223   std::string seq_tmp;      // holds sequence during basic filtering
 224   const Alphabet &alpha = Alphabet::get_alphabet(a);
 225
 226   if (seq_num == 0) {
 227     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 228   }
 229
 230   // search for the header of the fasta sequence we want
 231   while ( (!data_file.eof()) && (header_counter < seq_num) )
 232   {
 233     multiplatform_getline(data_file, file_data_line);
 234     ++line_counter;
 235     if (file_data_line.substr(0,1) == ">")
 236       header_counter++;
 237   }
 238
 239   if (header_counter > 0) {
 240     header = file_data_line.substr(1);
 241
 242     sequence_raw = "";
 243
 244     while ( !data_file.eof() && read_seq ) {
 245       multiplatform_getline(data_file,file_data_line);
 246       ++line_counter;
 247       if (file_data_line.substr(0,1) == ">")
 248         read_seq = false;
 249       else {
 250         for (std::string::const_iterator line_i = file_data_line.begin();
 251              line_i != file_data_line.end();
 252              ++line_i)
 253          {
 254            if(alpha.exists(*line_i)) {
 255              sequence_raw += *line_i;
 256            } else {
 257             std::ostringstream msg;
 258             msg << "Unrecognized characters in fasta sequence at line ";
 259             msg << line_counter;
 260             throw sequence_invalid_load_error(msg.str());
 261            }
 262          }
 263       }
 264     }
 265
 266     // Lastly, if subselection of the sequence was specified we keep cut out
 267     // and only keep that part
 268     // end_index = 0 means no end was specified, so cut to the end
 269     if (end_index == 0)
 270       end_index = sequence_raw.size();
 271
 272     // sequence filtering for upcasing agctn and convert non AGCTN to N
 273     if (end_index-start_index <= 0) {
 274       std::string msg("The selected sequence appears to be empty");
 275       throw sequence_empty_error(msg);
 276     }
 277     set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
 278   } else {
 279     std::string errormsg("There were no fasta sequences");
 280     throw sequence_empty_file_error(errormsg);
 281   }
 282 }
 283
 284 void Sequence::set_filtered_sequence(const std::string &in_seq,
 285                                      AlphabetRef alphabet_,
 286                                      size_type start,
 287                                      size_type count,
 288                                      SeqSpan::strand_type strand_)
 289 {
 290   if ( count == npos)
 291     count = in_seq.size() - start;
 292   std::string new_seq;
 293   new_seq.reserve(count);
 294
 295   // finally, the actual conversion loop
 296   const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
 297   std::string::const_iterator seq_i = in_seq.begin()+start;
 298   for(size_type i = 0; i != count; ++i, ++seq_i)
 299   {
 300     if (alpha_impl.exists(*seq_i)) {
 301       new_seq.append(1, toupper(*seq_i));
 302     } else {
 303       new_seq.append(1, 'N');
 304     }
 305   }
 306   SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
 307   seq = new_seq_ref;
 308 }
 309
 310 void
 311 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 312 {
 313   if (not fs::exists(file_path)) {
 314     throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
 315   }
 316   if (fs::is_directory(file_path)) {
 317     throw mussa_load_error(file_path.string() +
 318             " is a directory, please provide a file for annotations."
 319           );
 320   }
 321   fs::fstream data_stream(file_path, std::ios::in);
 322   if (!data_stream)
 323   {
 324     throw mussa_load_error("Error loading annotation file " + file_path.string());
 325   }
 326
 327   try {
 328     load_annot(data_stream, start_index, end_index);
 329   }  catch(annotation_load_error e) {
 330     std::ostringstream msg;
 331     msg << file_path.native_file_string()
 332         << " "
 333         << e.what();
 334     throw annotation_load_error(msg.str());
 335   }
 336   data_stream.close();
 337 }
 338
 339 void
 340 Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
 341 {
 342   // so i should probably be passing the parse function some iterators
 343   // but the annotations files are (currently) small, so i think i can
 344   // get away with loading the whole file into memory
 345   std::string data;
 346   char c;
 347   while(data_stream.good()) {
 348     data_stream.get(c);
 349     data.push_back(c);
 350   }
 351
 352   parse_annot(data, start_index, end_index);
 353 }
 354
 355 /* If this works, yikes, this is some brain hurting code.
 356  *
 357  * what's going on is that when pb_annot is instantiated it stores references
 358  * to begin, end, name, type, declared in the parse function, then
 359  * when operator() is called it grabs values from those references
 360  * and uses that to instantiate an annot object and append that to our
 361  * annotation list.
 362  *
 363  * This weirdness is because the spirit library requires that actions
 364  * conform to a specific prototype operator()(IteratorT, IteratorT)
 365  * which doesn't provide any useful opportunity for me to actually
 366  * grab the results of our parsing.
 367  *
 368  * so I instantiate this structure in order to have a place to grab
 369  * my data from.
 370  */
 371
 372 struct push_back_annot {
 373   Sequence* parent;
 374   SeqSpanRefListRef children;
 375   int& begin;
 376   int& end;
 377   std::string& name;
 378   std::string& type;
 379   int &parsed;
 380
 381   push_back_annot(Sequence* parent_seq,
 382                   SeqSpanRefListRef children_list,
 383                   int& begin_,
 384                   int& end_,
 385                   std::string& name_,
 386                   std::string& type_,
 387                   int &parsed_)
 388   : parent(parent_seq),
 389     children(children_list),
 390     begin(begin_),
 391     end(end_),
 392     name(name_),
 393     type(type_),
 394     parsed(parsed_)
 395   {
 396   }
 397
 398   void operator()(std::string::const_iterator,
 399                   std::string::const_iterator) const
 400   {
 401     children->push_back(parent->make_annotation(name, type, begin, end));
 402     ++parsed;
 403   };
 404 };
 405
 406 struct push_back_seq {
 407   std::list<Sequence>& seq_list;
 408   std::string& name;
 409   std::string& seq;
 410   int &parsed;
 411
 412   push_back_seq(std::list<Sequence>& seq_list_,
 413                 std::string& name_,
 414                 std::string& seq_,
 415                 int &parsed_)
 416   : seq_list(seq_list_),
 417     name(name_),
 418     seq(seq_),
 419     parsed(parsed_)
 420   {
 421   }
 422
 423   void operator()(std::string::const_iterator,
 424                   std::string::const_iterator) const
 425   {
 426     std::string::iterator seq_i = seq.begin();
 427     std::string::iterator seq_end = seq.end();
 428
 429     // this if block is a hack, for some reason spirit was
 430     // duplicating the last character if the file didn't end
 431     // with a new line.
 432     // this checks for the trailing newline, and if it is missing
 433     // removes the last character ( which should be the duplicated character.
 434     // check test_sequence.cpp:sequence_no_trailing_newline for test case
 435     // also see ticket:265 for more information
 436     if (seq.size() > 0) {
 437       std::string::value_type c = seq[seq.size()-1];
 438       if (not (c == '\015' or c == '\012')) {
 439         // doesn't end with a new line character
 440         seq_end--;
 441       }
 442     }
 443     // end hack
 444
 445     // filter out newlines from our sequence
 446     std::string new_seq;
 447     for(; seq_i != seq_end; ++seq_i)
 448     {
 449       if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
 450     }
 451     //std::cout << "adding seq: " << name << " " << new_seq << std::endl;
 452     Sequence s(new_seq);
 453     s.set_fasta_header(name);
 454     seq_list.push_back(s);
 455     ++parsed;
 456   };
 457 };
 458
 459 void
 460 Sequence::parse_annot(std::string data, int start_index, int end_index)
 461 {
 462   int start=0;
 463   int end=0;
 464   std::string name;
 465   std::string type;
 466   std::string seqstr;
 467   SeqSpanRefListRef parsed_annots(new SeqSpanRefList);
 468   std::list<Sequence> query_seqs;
 469   int parsed=0;
 470
 471   bool ok = spirit::parse(data.begin(), data.end(),
 472               (
 473                //begin grammar
 474                  !(
 475                     (
 476                       spirit::alpha_p >>
 477                       +(spirit::graph_p)
 478                     )[spirit::assign_a(species)] >>
 479                     +(spirit::space_p)
 480                   ) >>
 481                   *(
 482                      ( // ignore html tags
 483                        *(spirit::space_p) >>
 484                        spirit::ch_p('<') >>
 485                        +(~spirit::ch_p('>')) >>
 486                        spirit::ch_p('>') >>
 487                        *(spirit::space_p)
 488                      )
 489                    |
 490                     ( // parse an absolute location name
 491                      (spirit::uint_p[spirit::assign_a(start)] >>
 492                       +spirit::space_p >>
 493                       spirit::uint_p[spirit::assign_a(end)] >>
 494                       +spirit::space_p >>
 495                       (
 496                          spirit::alpha_p >>
 497                          *spirit::graph_p
 498                       )[spirit::assign_a(name)] >>
 499                       // optional type
 500                       !(
 501                           +spirit::space_p >>
 502                           (
 503                             spirit::alpha_p >>
 504                             *spirit::graph_p
 505                           )[spirit::assign_a(type)]
 506                       )
 507                       // to understand how this group gets set
 508                       // read the comment above struct push_back_annot
 509                      )[push_back_annot(this, parsed_annots, start, end, name, type, parsed)]
 510                    |
 511                     ((spirit::ch_p('>')|spirit::str_p("&gt;")) >>
 512                        (*(spirit::print_p))[spirit::assign_a(name)] >>
 513                        spirit::eol_p >>
 514                        (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seqstr)]
 515                      )[push_back_seq(query_seqs, name, seqstr, parsed)]
 516                     ) >>
 517                     *spirit::space_p
 518                    )
 519               //end grammar
 520               )).full;
 521   if (not ok) {
 522     std::stringstream msg;
 523     msg << "Error parsing annotation #" << parsed;
 524     throw annotation_load_error(msg.str());
 525   }
 526   // If everything loaded correctly add the sequences to our annotation list
 527   // add newly parsed annotations to our sequence
 528   std::copy(parsed_annots->begin(), parsed_annots->end(), std::back_inserter(*annotation_list));
 529   // go search for query sequences
 530   find_sequences(query_seqs.begin(), query_seqs.end());
 531 }
 532
 533 void Sequence::add_annotation(const SeqSpanRef a)
 534 {
 535   annotation_list->push_back(a);
 536 }
 537
 538 void Sequence::add_annotation(std::string name, std::string type, size_type start, size_type stop)
 539 {
 540   add_annotation(make_annotation(name, type, start, stop));
 541 }
 542
 543 SeqSpanRef
 544 Sequence::make_annotation(std::string name, std::string type, size_type start, size_type stop) const
 545 {
 546   // we want things to be in the positive direction
 547   if (stop < start) {
 548     size_type tmp = start;
 549     start = stop;
 550     stop = tmp;
 551   }
 552   size_type count = stop - start;
 553   SeqSpanRef new_annot(seq->subseq(start, count,  SeqSpan::UnknownStrand));
 554   AnnotationsRef metadata(new Annotations(name));
 555   metadata->set("type", type);
 556   new_annot->setAnnotations(metadata);
 557   return new_annot;
 558 }
 559
 560 const SeqSpanRefList& Sequence::annotations() const
 561 {
 562   return *annotation_list;
 563 }
 564
 565 void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
 566 {
 567   new_seq.motif_list = motif_list;
 568   new_seq.annotation_list.reset(new SeqSpanRefList);
 569
 570   for(SeqSpanRefList::const_iterator annot_i = annotation_list->begin();
 571       annot_i != annotation_list->end();
 572       ++annot_i)
 573   {
 574     size_type annot_begin= (*annot_i)->start();
 575     size_type annot_end = (*annot_i)->stop();
 576
 577     if (annot_begin < start+count) {
 578       if (annot_begin >= start) {
 579         annot_begin -= start;
 580       } else {
 581         annot_begin = 0;
 582       }
 583
 584       if (annot_end < start+count) {
 585         annot_end -= start;
 586       } else {
 587         annot_end = count;
 588       }
 589       SeqSpanRef new_annot(new_seq.seq->subseq(annot_begin, annot_end));
 590       new_annot->setAnnotations((*annot_i)->annotations());
 591       new_seq.annotation_list->push_back(new_annot);
 592     }
 593   }
 594 }
 595
 596 Sequence
 597 Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
 598 {
 599   // FIXME: should i really allow a subsequence of an empty sequence?
 600   if (!seq) {
 601     Sequence new_seq;
 602     return new_seq;
 603   }
 604
 605   Sequence new_seq(*this);
 606   new_seq.seq = seq->subseq(start, count, strand);
 607   if (seq->annotations()) {
 608     AnnotationsRef a(new Annotations(*(seq->annotations())));
 609     new_seq.seq->setAnnotations(a);
 610   }
 611   copy_children(new_seq, start, count);
 612
 613   return new_seq;
 614 }
 615
 616
 617 // FIXME: This needs to be moved into SeqSpan
 618 Sequence Sequence::rev_comp() const
 619 {
 620   // a reverse complement is the whole opposite strand
 621   return subseq(0, npos, SeqSpan::OppositeStrand);
 622 }
 623
 624 const Alphabet& Sequence::get_alphabet() const
 625 {
 626   return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
 627 }
 628
 629 void Sequence::set_fasta_header(std::string header_)
 630 {
 631   header = header_;
 632 }
 633
 634 void Sequence::set_species(const std::string& name)
 635 {
 636   species = name;
 637 }
 638
 639 std::string Sequence::get_species() const
 640 {
 641   return species;
 642 }
 643
 644
 645 std::string
 646 Sequence::get_fasta_header() const
 647 {
 648   return header;
 649 }
 650
 651 std::string
 652 Sequence::get_name() const
 653 {
 654   if (header.size() > 0)
 655     return header;
 656   else if (species.size() > 0)
 657     return species;
 658   else
 659     return "";
 660 }
 661
 662 void Sequence::set_sequence(const std::string& s, AlphabetRef a)
 663 {
 664   set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
 665 }
 666
 667 std::string Sequence::get_sequence() const
 668 {
 669   return seq->sequence();
 670 }
 671
 672 Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
 673 {
 674   return at(i);
 675 }
 676
 677 void
 678 Sequence::clear()
 679 {
 680   seq.reset();
 681   header.clear();
 682   species.clear();
 683   annotation_list.reset(new SeqSpanRefList);
 684   motif_list.reset(new MotifList);
 685 }
 686
 687 void
 688 Sequence::save(fs::fstream &save_file)
 689 {
 690   std::string type("type");
 691   std::string empty_str("");
 692   //fstream save_file;
 693   SeqSpanRefList::iterator annots_i;
 694   AnnotationsRef metadata;
 695
 696   save_file << "<Sequence>" << std::endl;
 697   save_file << *this << std::endl;
 698   save_file << "</Sequence>" << std::endl;
 699
 700   save_file << "<Annotations>" << std::endl;
 701   save_file << species << std::endl;
 702   for (annots_i = annotation_list->begin();
 703        annots_i != annotation_list->end();
 704        ++annots_i)
 705   {
 706     metadata = (*annots_i)->annotations();
 707     save_file << (*annots_i)->parentStart() << " " << (*annots_i)->parentStop() << " " ;
 708     save_file << metadata->name() << " "
 709               << metadata->getdefault(type, empty_str) << std::endl;
 710   }
 711   save_file << "</Annotations>" << std::endl;
 712   //save_file.close();
 713 }
 714
 715 //void
 716 //Sequence::load_museq(fs::path load_file_path, int seq_num)
 717 //{
 718 //  fs::fstream load_file;
 719 //  std::string file_data_line;
 720 //  int seq_counter;
 721 //  //annot an_annot;
 722 //  int annot_begin;
 723 //  int annot_end;
 724 //  std::string annot_name;
 725 //  std::string annot_type;
 726 //
 727 //  std::string::size_type space_split_i;
 728 //  std::string annot_value;
 729 //
 730 //  annotation_list.reset(new SeqSpanRefList);
 731 //
 732 //  load_file.open(load_file_path, std::ios::in);
 733 //
 734 //  seq_counter = 0;
 735 //  // search for the seq_num-th sequence
 736 //  while ( (!load_file.eof()) && (seq_counter < seq_num) )
 737 //  {
 738 //    getline(load_file,file_data_line);
 739 //    if (file_data_line == "<Sequence>")
 740 //      seq_counter++;
 741 //  }
 742 //  getline(load_file, file_data_line);
 743 //  // looks like the sequence is written as a single line
 744 //  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
 745 //  getline(load_file, file_data_line);
 746 //  getline(load_file, file_data_line);
 747 //  if (file_data_line == "<Annotations>")
 748 //  {
 749 //    getline(load_file, file_data_line);
 750 //    species = file_data_line;
 751 //    while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 752 //    {
 753 //      getline(load_file,file_data_line);
 754 //      if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 755 //      {
 756 //        // need to get 4 values...almost same code 4 times...
 757 //        // get annot start index
 758 //        space_split_i = file_data_line.find(" ");
 759 //        annot_value = file_data_line.substr(0,space_split_i);
 760 //        annot_begin = atoi (annot_value.c_str());
 761 //        file_data_line = file_data_line.substr(space_split_i+1);
 762 //        // get annot end index
 763 //        space_split_i = file_data_line.find(" ");
 764 //        annot_value = file_data_line.substr(0,space_split_i);
 765 //        annot_end = atoi (annot_value.c_str());
 766 //
 767 //        if (space_split_i == std::string::npos)  // no entry for type or name
 768 //        {
 769 //          std::cout << "seq, annots - no type or name\n";
 770 //          annot_name = "";
 771 //          annot_type = "";
 772 //        }
 773 //        else   // else get annot type
 774 //        {
 775 //          file_data_line = file_data_line.substr(space_split_i+1);
 776 //          space_split_i = file_data_line.find(" ");
 777 //          annot_value = file_data_line.substr(0,space_split_i);
 778 //          //an_annot.type = annot_value;
 779 //          annot_type = annot_value;
 780 //          if (space_split_i == std::string::npos)  // no entry for name
 781 //          {
 782 //            std::cout << "seq, annots - no name\n";
 783 //            annot_name = "";
 784 //          }
 785 //          else          // get annot name
 786 //          {
 787 //            file_data_line = file_data_line.substr(space_split_i+1);
 788 //            space_split_i = file_data_line.find(" ");
 789 //            annot_value = file_data_line.substr(0,space_split_i);
 790 //            // this seems like its wrong?
 791 //            annot_type = annot_value;
 792 //          }
 793 //        }
 794 //        add_annotation(annot_name, annot_type, annot_begin, annot_end);
 795 //      }
 796 //      //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 797 //      //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
 798 //    }
 799 //  }
 800 //  load_file.close();
 801 //}
 802
 803 SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
 804 {
 805   boost::shared_ptr<Sequence> seq(new Sequence);
 806   std::string file_data_line;
 807   int seq_counter;
 808   //annot an_annot;
 809   int annot_begin;
 810   int annot_end;
 811   std::string annot_name;
 812   std::string annot_type;
 813
 814   std::string::size_type space_split_i;
 815   std::string annot_value;
 816
 817   //seq->annotation_list.reset(new SeqSpanRefList);
 818
 819   seq_counter = 0;
 820   // search for the next sequence
 821   int seq_num = 1;
 822   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 823   {
 824     getline(load_file,file_data_line);
 825     if (file_data_line == "<Sequence>")
 826       seq_counter++;
 827   }
 828
 829   // Could not find next sequence
 830   if (load_file.eof())
 831   {
 832     seq.reset();
 833     return seq;
 834   }
 835
 836   getline(load_file, file_data_line);
 837   // looks like the sequence is written as a single line
 838   seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
 839   getline(load_file, file_data_line);
 840   getline(load_file, file_data_line);
 841   if (file_data_line == "<Annotations>")
 842   {
 843     getline(load_file, file_data_line);
 844     seq->set_species(file_data_line);
 845     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 846     {
 847       getline(load_file,file_data_line);
 848       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 849       {
 850         // need to get 4 values...almost same code 4 times...
 851         // get annot start index
 852         space_split_i = file_data_line.find(" ");
 853         annot_value = file_data_line.substr(0,space_split_i);
 854         annot_begin = atoi (annot_value.c_str());
 855         file_data_line = file_data_line.substr(space_split_i+1);
 856         // get annot end index
 857         space_split_i = file_data_line.find(" ");
 858         annot_value = file_data_line.substr(0,space_split_i);
 859         annot_end = atoi (annot_value.c_str());
 860
 861         if (space_split_i == std::string::npos)  // no entry for type or name
 862         {
 863           std::cout << "seq, annots - no type or name\n";
 864           annot_name = "";
 865           annot_type = "";
 866         }
 867         else   // else get annot type
 868         {
 869           file_data_line = file_data_line.substr(space_split_i+1);
 870           space_split_i = file_data_line.find(" ");
 871           annot_value = file_data_line.substr(0,space_split_i);
 872           //an_annot.type = annot_value;
 873           annot_type = annot_value;
 874           if (space_split_i == std::string::npos)  // no entry for name
 875           {
 876             std::cout << "seq, annots - no name\n";
 877             annot_name = "";
 878           }
 879           else          // get annot name
 880           {
 881             file_data_line = file_data_line.substr(space_split_i+1);
 882             space_split_i = file_data_line.find(" ");
 883             annot_value = file_data_line.substr(0,space_split_i);
 884             // this seems like its wrong?
 885             annot_type = annot_value;
 886           }
 887         }
 888         seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
 889       }
 890       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 891       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
 892     }
 893   }
 894   //load_file.close();
 895   return seq;
 896 }
 897
 898
 899 void Sequence::add_motif(const Sequence& a_motif)
 900 {
 901   std::vector<int> motif_starts = find_motif(a_motif);
 902
 903   for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
 904       motif_start_i != motif_starts.end();
 905       ++motif_start_i)
 906   {
 907     motif_list->push_back(motif(*motif_start_i, a_motif.get_sequence()));
 908   }
 909 }
 910
 911 void Sequence::clear_motifs()
 912 {
 913   if (motif_list)
 914     motif_list->clear();
 915   else
 916     motif_list.reset(new MotifList);
 917 }
 918
 919 const Sequence::MotifList& Sequence::motifs() const
 920 {
 921   return *motif_list;
 922 }
 923
 924 std::vector<int>
 925 Sequence::find_motif(const Sequence& a_motif) const
 926 {
 927   std::vector<int> motif_match_starts;
 928   Sequence norm_motif_rc;
 929
 930   motif_match_starts.clear();
 931   // std::cout << "motif is: " << norm_motif << std::endl;
 932
 933   if (a_motif.size() > 0)
 934   {
 935     //std::cout << "Sequence: none blank motif\n";
 936     motif_scan(a_motif, &motif_match_starts);
 937
 938     norm_motif_rc = a_motif.rev_comp();;
 939     // make sure not to do search again if it is a palindrome
 940     if (norm_motif_rc != a_motif) {
 941       motif_scan(norm_motif_rc, &motif_match_starts);
 942     }
 943   }
 944   return motif_match_starts;
 945 }
 946
 947 void
 948 Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
 949 {
 950   // if there's no sequence we can't scan for it?
 951   // should this throw an exception?
 952   if (!seq) return;
 953
 954   std::string::size_type seq_i = 0;
 955   Sequence::size_type motif_i = 0;
 956   Sequence::size_type motif_len = a_motif.length();
 957   Sequence::value_type motif_char;
 958   Sequence::value_type seq_char;
 959
 960   while (seq_i < size())
 961   {
 962     // this is pretty much a straight translation of Nora's python code
 963     // to match iupac letter codes
 964     motif_char = toupper(a_motif[motif_i]);
 965     seq_char = toupper(seq->at(seq_i));
 966     if (motif_char =='N')
 967       motif_i++;
 968     else if (motif_char == seq_char)
 969       motif_i++;
 970     else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
 971       motif_i++;
 972     else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
 973       motif_i++;
 974     else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
 975       motif_i++;
 976     else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
 977       motif_i++;
 978     else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
 979       motif_i++;
 980     else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
 981       motif_i++;
 982     else if ((motif_char =='V') &&
 983              ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
 984       motif_i++;
 985     else if ((motif_char =='H') &&
 986              ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
 987       motif_i++;
 988     else if ((motif_char =='D') &&
 989              ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
 990       motif_i++;
 991     else if ((motif_char =='B') &&
 992              ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
 993       motif_i++;
 994     else
 995     {
 996       // if a motif doesn't match, erase our current trial and try again
 997       seq_i -= motif_i;
 998       motif_i = 0;
 999     }
1000
1001     // end Nora stuff, now we see if a match is found this pass
1002     if (motif_i == motif_len)
1003     {
1004       motif_match_starts->push_back(seq_i - motif_len + 1);
1005       motif_i = 0;
1006     }
1007
1008     seq_i++;
1009   }
1010   //std::cout << std::endl;
1011 }
1012
1013 void Sequence::add_string_annotation(std::string a_seq,
1014                                      std::string name)
1015 {
1016   std::vector<int> seq_starts = find_motif(a_seq);
1017
1018   for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
1019       seq_start_i != seq_starts.end();
1020       ++seq_start_i)
1021   {
1022     add_annotation(name, "", *seq_start_i, *seq_start_i+a_seq.size());
1023   }
1024 }
1025
1026 void Sequence::find_sequences(std::list<Sequence>::iterator start,
1027                               std::list<Sequence>::iterator end)
1028 {
1029   while (start != end) {
1030     add_string_annotation(start->get_sequence(), start->get_fasta_header());
1031     ++start;
1032   }
1033 }
1034
1035
1036 std::ostream& operator<<(std::ostream& out, const Sequence& s)
1037 {
1038   if (s.seq) {
1039     for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
1040       out << *s_i;
1041     }
1042   }
1043   return out;
1044 }
1045
1046 bool operator<(const Sequence& x, const Sequence& y)
1047 {
1048   Sequence::const_iterator x_i = x.begin();
1049   Sequence::const_iterator y_i = y.begin();
1050   // for sequences there's some computation associated with computing .end
1051   // so lets cache it.
1052   Sequence::const_iterator xend = x.end();
1053   Sequence::const_iterator yend = y.end();
1054   while(1) {
1055     if( x_i == xend and y_i == yend ) {
1056       return false;
1057     } else if ( x_i == xend ) {
1058       return true;
1059     } else if ( y_i == yend ) {
1060       return false;
1061     } else if ( (*x_i) < (*y_i)) {
1062       return true;
1063     } else if ( (*x_i) > (*y_i) ) {
1064       return false;
1065     } else {
1066       ++x_i;
1067       ++y_i;
1068     }
1069   }
1070 }
1071
1072 template <typename Iter1, typename Iter2>
1073 static
1074 bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
1075 {
1076   Iter1 aseq_i = abegin;
1077   Iter2 bseq_i = bbegin;
1078   if (aend-abegin == bend-bbegin) {
1079     // since the length of the two sequences is equal, we only need to
1080     // test one.
1081     for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
1082       if (toupper(*aseq_i) != toupper(*bseq_i)) {
1083         return false;
1084       }
1085     }
1086     return true;
1087   } else {
1088     return false;
1089   }
1090 }
1091
1092 bool operator==(const Sequence& x, const Sequence& y)
1093 {
1094   if (x.seq and y.seq) {
1095     // both x and y are defined
1096     if (SeqSpan::isFamily(x.seq, y.seq)) {
1097       // both are part of the same SeqSpan tree
1098       return *(x.seq) == *(y.seq);
1099     } else {
1100       // we'll have to do a real comparison
1101       return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
1102                x.begin(), x.end(),
1103                y.begin(), y.end()
1104              );
1105     }
1106   } else {
1107     // true if they're both empty (with either a null SeqSpanRef or
1108     // a zero length string
1109     return (x.size() == y.size());
1110   }
1111 }
1112
1113 bool operator!=(const Sequence& x, const Sequence& y)
1114 {
1115   return not operator==(x, y);
1116 }
1117