alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25 #include <boost/filesystem/operations.hpp>
  26 namespace fs = boost::filesystem;
  27
  28 #include <boost/spirit/core.hpp>
  29 #include <boost/spirit/actor/push_back_actor.hpp>
  30 #include <boost/spirit/iterator/file_iterator.hpp>
  31 #include <boost/spirit/utility/chset.hpp>
  32 namespace spirit = boost::spirit;
  33
  34 #include "alg/sequence.hpp"
  35 #include "mussa_exceptions.hpp"
  36
  37 #include <string>
  38 #include <stdexcept>
  39 #include <iostream>
  40 #include <sstream>
  41 #include <set>
  42
  43 annot::annot()
  44  : begin(0),
  45    end(0),
  46    type(""),
  47    name("")
  48 {
  49 }
  50
  51 annot::annot(int begin, int end, std::string type, std::string name)
  52  : begin(begin),
  53    end(end),
  54    type(type),
  55    name(name)
  56 {
  57 }
  58
  59 annot::~annot()
  60 {
  61 }
  62
  63 bool operator==(const annot& left, const annot& right)
  64 {
  65   return ((left.begin== right.begin) and
  66           (left.end == right.end) and
  67           (left.type == right.type) and
  68           (left.name == right.name));
  69 }
  70
  71 motif::motif(int begin, std::string motif)
  72  : annot(begin, begin+motif.size(), "motif", motif),
  73    sequence(motif)
  74 {
  75 }
  76
  77 motif::~motif()
  78 {
  79 }
  80
  81
  82 Sequence::Sequence(AlphabetRef alphabet)
  83   : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand)),
  84     motif_list(new MotifList)
  85 {
  86 }
  87
  88 Sequence::~Sequence()
  89 {
  90 }
  91
  92 Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
  93   : header(""),
  94     species(""),
  95     motif_list(new MotifList)
  96 {
  97   set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
  98 }
  99
 100 Sequence::Sequence(const std::string& seq,
 101                    AlphabetRef alphabet_,
 102                    SeqSpan::strand_type strand_)
 103   : header(""),
 104     species(""),
 105     motif_list(new MotifList)
 106 {
 107   set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
 108 }
 109
 110 Sequence::Sequence(const Sequence& o)
 111   : seq(o.seq),
 112     header(o.header),
 113     species(o.species),
 114     annots(o.annots),
 115     motif_list(o.motif_list)
 116 {
 117 }
 118
 119 Sequence::Sequence(const Sequence* o)
 120   : seq(o->seq),
 121     header(o->header),
 122     species(o->species),
 123     annots(o->annots),
 124     motif_list(o->motif_list)
 125 {
 126 }
 127
 128 Sequence::Sequence(const SequenceRef o)
 129   : seq(new SeqSpan(o->seq)),
 130     header(o->header),
 131     species(o->species),
 132     annots(o->annots),
 133     motif_list(o->motif_list)
 134 {
 135 }
 136
 137 Sequence::Sequence(const SeqSpanRef& seq_ref)
 138   : seq(seq_ref),
 139     header(""),
 140     species(""),
 141     motif_list(new MotifList)
 142 {
 143 }
 144
 145 Sequence &Sequence::operator=(const Sequence& s)
 146 {
 147   if (this != &s) {
 148     seq = s.seq;
 149     header = s.header;
 150     species = s.species;
 151     annots = s.annots;
 152     motif_list = s.motif_list;
 153   }
 154   return *this;
 155 }
 156
 157 static void multiplatform_getline(std::istream& in, std::string& line)
 158 {
 159   line.clear();
 160   char c;
 161   in.get(c);
 162   while(in.good() and !(c == '\012' or c == '\015') ) {
 163     line.push_back(c);
 164     in.get(c);
 165   }
 166   // if we have cr-lf eat it
 167   c = in.peek();
 168   if (c=='\012' or c == '\015') {
 169     in.get();
 170   }
 171 }
 172
 173 void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
 174 {
 175   load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 176 }
 177
 178 //! load a fasta file into a sequence
 179 void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
 180                           int seq_num, int start_index, int end_index)
 181 {
 182   fs::fstream data_file;
 183   data_file.open(file_path, std::ios::in);
 184
 185   if (!data_file.good())
 186   {
 187     throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
 188   } else {
 189     try {
 190       load_fasta(data_file, a, seq_num, start_index, end_index);
 191     } catch(sequence_empty_error e) {
 192       // there doesn't appear to be any sequence
 193       // catch and rethrow to include the filename
 194       std::stringstream msg;
 195       msg << "The selected sequence in "
 196           << file_path.native_file_string()
 197           << " appears to be empty";
 198       throw sequence_empty_error(msg.str());
 199     } catch(sequence_empty_file_error e) {
 200       std::stringstream errormsg;
 201       errormsg << file_path.native_file_string()
 202                << " did not have any fasta sequences" << std::endl;
 203       throw sequence_empty_file_error(errormsg.str());
 204     } catch(sequence_invalid_load_error e) {
 205       std::ostringstream msg;
 206       msg << file_path.native_file_string();
 207       msg << " " << e.what();
 208       throw sequence_invalid_load_error(msg.str());
 209     }
 210   }
 211 }
 212
 213 void Sequence::load_fasta(std::istream& file,
 214                           int seq_num, int start_index, int end_index)
 215 {
 216   load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 217 }
 218
 219 void
 220 Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
 221                      int seq_num,
 222                      int start_index, int end_index)
 223 {
 224   std::string file_data_line;
 225   int header_counter = 0;
 226   size_t line_counter = 0;
 227   bool read_seq = true;
 228   std::string rev_comp;
 229   std::string sequence_raw;
 230   std::string seq_tmp;      // holds sequence during basic filtering
 231   const Alphabet &alpha = Alphabet::get_alphabet(a);
 232
 233   if (seq_num == 0) {
 234     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 235   }
 236
 237   // search for the header of the fasta sequence we want
 238   while ( (!data_file.eof()) && (header_counter < seq_num) )
 239   {
 240     multiplatform_getline(data_file, file_data_line);
 241     ++line_counter;
 242     if (file_data_line.substr(0,1) == ">")
 243       header_counter++;
 244   }
 245
 246   if (header_counter > 0) {
 247     header = file_data_line.substr(1);
 248
 249     sequence_raw = "";
 250
 251     while ( !data_file.eof() && read_seq ) {
 252       multiplatform_getline(data_file,file_data_line);
 253       ++line_counter;
 254       if (file_data_line.substr(0,1) == ">")
 255         read_seq = false;
 256       else {
 257         for (std::string::const_iterator line_i = file_data_line.begin();
 258              line_i != file_data_line.end();
 259              ++line_i)
 260          {
 261            if(alpha.exists(*line_i)) {
 262              sequence_raw += *line_i;
 263            } else {
 264             std::ostringstream msg;
 265             msg << "Unrecognized characters in fasta sequence at line ";
 266             msg << line_counter;
 267             throw sequence_invalid_load_error(msg.str());
 268            }
 269          }
 270       }
 271     }
 272
 273     // Lastly, if subselection of the sequence was specified we keep cut out
 274     // and only keep that part
 275     // end_index = 0 means no end was specified, so cut to the end
 276     if (end_index == 0)
 277       end_index = sequence_raw.size();
 278
 279     // sequence filtering for upcasing agctn and convert non AGCTN to N
 280     if (end_index-start_index <= 0) {
 281       std::string msg("The selected sequence appears to be empty");
 282       throw sequence_empty_error(msg);
 283     }
 284     set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
 285   } else {
 286     std::string errormsg("There were no fasta sequences");
 287     throw sequence_empty_file_error(errormsg);
 288   }
 289 }
 290
 291 void Sequence::set_filtered_sequence(const std::string &in_seq,
 292                                      AlphabetRef alphabet_,
 293                                      size_type start,
 294                                      size_type count,
 295                                      SeqSpan::strand_type strand_)
 296 {
 297   if ( count == npos)
 298     count = in_seq.size() - start;
 299   std::string new_seq;
 300   new_seq.reserve(count);
 301
 302   // finally, the actual conversion loop
 303   const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
 304   std::string::const_iterator seq_i = in_seq.begin()+start;
 305   for(size_type i = 0; i != count; ++i, ++seq_i)
 306   {
 307     if (alpha_impl.exists(*seq_i)) {
 308       new_seq.append(1, toupper(*seq_i));
 309     } else {
 310       new_seq.append(1, 'N');
 311     }
 312   }
 313   SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
 314   seq = new_seq_ref;
 315 }
 316
 317 void
 318 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 319 {
 320   if (not fs::exists(file_path)) {
 321     throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
 322   }
 323   if (fs::is_directory(file_path)) {
 324     throw mussa_load_error(file_path.string() +
 325             " is a directory, please provide a file for annotations."
 326           );
 327   }
 328   fs::fstream data_stream(file_path, std::ios::in);
 329   if (!data_stream)
 330   {
 331     throw mussa_load_error("Error loading annotation file " + file_path.string());
 332   }
 333
 334   // so i should probably be passing the parse function some iterators
 335   // but the annotations files are (currently) small, so i think i can
 336   // get away with loading the whole file into memory
 337   std::string data;
 338   char c;
 339   while(data_stream.good()) {
 340     data_stream.get(c);
 341     data.push_back(c);
 342   }
 343   data_stream.close();
 344
 345   try {
 346     parse_annot(data, start_index, end_index);
 347   } catch(annotation_load_error e) {
 348     std::ostringstream msg;
 349     msg << file_path.native_file_string()
 350         << " "
 351         << e.what();
 352     throw annotation_load_error(msg.str());
 353   }
 354 }
 355
 356 /* If this works, yikes, this is some brain hurting code.
 357  *
 358  * what's going on is that when pb_annot is instantiated it stores references
 359  * to begin, end, name, type, declared in the parse function, then
 360  * when operator() is called it grabs values from those references
 361  * and uses that to instantiate an annot object and append that to our
 362  * annotation list.
 363  *
 364  * This weirdness is because the spirit library requires that actions
 365  * conform to a specific prototype operator()(IteratorT, IteratorT)
 366  * which doesn't provide any useful opportunity for me to actually
 367  * grab the results of our parsing.
 368  *
 369  * so I instantiate this structure in order to have a place to grab
 370  * my data from.
 371  */
 372
 373 struct push_back_annot {
 374   std::list<annot>& annot_list;
 375   int& begin;
 376   int& end;
 377   std::string& name;
 378   std::string& type;
 379   int &parsed;
 380
 381   push_back_annot(std::list<annot>& annot_list_,
 382                   int& begin_,
 383                   int& end_,
 384                   std::string& name_,
 385                   std::string& type_,
 386                   int &parsed_)
 387   : annot_list(annot_list_),
 388     begin(begin_),
 389     end(end_),
 390     name(name_),
 391     type(type_),
 392     parsed(parsed_)
 393   {
 394   }
 395
 396   void operator()(std::string::const_iterator,
 397                   std::string::const_iterator) const
 398   {
 399     //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
 400     annot_list.push_back(annot(begin, end, name, type));
 401     ++parsed;
 402   };
 403 };
 404
 405 struct push_back_seq {
 406   std::list<Sequence>& seq_list;
 407   std::string& name;
 408   std::string& seq;
 409   int &parsed;
 410
 411   push_back_seq(std::list<Sequence>& seq_list_,
 412                 std::string& name_,
 413                 std::string& seq_,
 414                 int &parsed_)
 415   : seq_list(seq_list_),
 416     name(name_),
 417     seq(seq_),
 418     parsed(parsed_)
 419   {
 420   }
 421
 422   void operator()(std::string::const_iterator,
 423                   std::string::const_iterator) const
 424   {
 425     // filter out newlines from our sequence
 426     std::string new_seq;
 427     for(std::string::const_iterator seq_i = seq.begin();
 428         seq_i != seq.end();
 429         ++seq_i)
 430     {
 431       if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
 432     }
 433     //std::cout << "adding seq: " << name << " " << new_seq << std::endl;
 434
 435     Sequence s(new_seq);
 436     s.set_fasta_header(name);
 437     seq_list.push_back(s);
 438     ++parsed;
 439   };
 440 };
 441
 442 void
 443 Sequence::parse_annot(std::string data, int start_index, int end_index)
 444 {
 445   int start=0;
 446   int end=0;
 447   std::string name;
 448   std::string type;
 449   std::string seq;
 450   std::list<annot> parsed_annots;
 451   std::list<Sequence> query_seqs;
 452   int parsed=0;
 453
 454   bool ok = spirit::parse(data.begin(), data.end(),
 455               (
 456                //begin grammar
 457                  !(
 458                     (
 459                       spirit::alpha_p >>
 460                       +(spirit::graph_p)
 461                     )[spirit::assign_a(species)] >>
 462                     +(spirit::space_p)
 463                   ) >>
 464                   *(
 465                      ( // ignore html tags
 466                        *(spirit::space_p) >>
 467                        spirit::ch_p('<') >>
 468                        +(~spirit::ch_p('>')) >>
 469                        spirit::ch_p('>') >>
 470                        *(spirit::space_p)
 471                      )
 472                    |
 473                     ( // parse an absolute location name
 474                      (spirit::uint_p[spirit::assign_a(start)] >>
 475                       +spirit::space_p >>
 476                       spirit::uint_p[spirit::assign_a(end)] >>
 477                       +spirit::space_p >>
 478                       (
 479                          spirit::alpha_p >>
 480                          *spirit::graph_p
 481                       )[spirit::assign_a(name)] >>
 482                       // optional type
 483                       !(
 484                           +spirit::space_p >>
 485                           (
 486                             spirit::alpha_p >>
 487                             *spirit::graph_p
 488                           )[spirit::assign_a(type)]
 489                       )
 490                       // to understand how this group gets set
 491                       // read the comment above struct push_back_annot
 492                      )[push_back_annot(parsed_annots, start, end, type, name, parsed)]
 493                    |
 494                     ((spirit::ch_p('>')|spirit::str_p("&gt;")) >>
 495                        (*(spirit::print_p))[spirit::assign_a(name)] >>
 496                        spirit::eol_p >>
 497                        (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seq)]
 498                      )[push_back_seq(query_seqs, name, seq, parsed)]
 499                     ) >>
 500                     *spirit::space_p
 501                    )
 502               //end grammar
 503               )).full;
 504   if (not ok) {
 505     std::stringstream msg;
 506     msg << "Error parsing annotation #" << parsed;
 507     throw annotation_load_error(msg.str());
 508   }
 509   // add newly parsed annotations to our sequence
 510   std::copy(parsed_annots.begin(), parsed_annots.end(), std::back_inserter(annots));
 511   // go seearch for query sequences
 512   find_sequences(query_seqs.begin(), query_seqs.end());
 513 }
 514
 515 void Sequence::add_annotation(const annot& a)
 516 {
 517   annots.push_back(a);
 518 }
 519
 520 const std::list<annot>& Sequence::annotations() const
 521 {
 522   return annots;
 523 }
 524
 525 void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
 526 {
 527   new_seq.motif_list = motif_list;
 528   new_seq.annots.clear();
 529
 530   for(std::list<annot>::const_iterator annot_i = annots.begin();
 531       annot_i != annots.end();
 532       ++annot_i)
 533   {
 534     size_type annot_begin= annot_i->begin;
 535     size_type annot_end = annot_i->end;
 536
 537     if (annot_begin < start+count) {
 538       if (annot_begin >= start) {
 539         annot_begin -= start;
 540       } else {
 541         annot_begin = 0;
 542       }
 543
 544       if (annot_end < start+count) {
 545         annot_end -= start;
 546       } else {
 547         annot_end = count;
 548       }
 549
 550       annot new_annot(annot_begin, annot_end, annot_i->type, annot_i->name);
 551       new_seq.annots.push_back(new_annot);
 552     }
 553   }
 554 }
 555
 556 Sequence
 557 Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
 558 {
 559   // FIXME: should i really allow a subsequence of an empty sequence?
 560   if (!seq) {
 561     Sequence new_seq;
 562     return new_seq;
 563   }
 564
 565   Sequence new_seq = *this;
 566   new_seq.seq = seq->subseq(start, count, strand);
 567   if (seq->annotations()) {
 568     AnnotationsRef a(new Annotations(*(seq->annotations())));
 569     new_seq.seq->setAnnotations(a);
 570   }
 571   copy_children(new_seq, start, count);
 572
 573   return new_seq;
 574 }
 575
 576
 577 // FIXME: This needs to be moved into SeqSpan
 578 Sequence Sequence::rev_comp() const
 579 {
 580   // a reverse complement is the whole opposite strand
 581   return subseq(0, npos, SeqSpan::OppositeStrand);
 582 }
 583
 584 const Alphabet& Sequence::get_alphabet() const
 585 {
 586   return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
 587 }
 588
 589 void Sequence::set_fasta_header(std::string header_)
 590 {
 591   header = header_;
 592 }
 593
 594 void Sequence::set_species(const std::string& name)
 595 {
 596   species = name;
 597 }
 598
 599 std::string Sequence::get_species() const
 600 {
 601   return species;
 602 }
 603
 604
 605 std::string
 606 Sequence::get_fasta_header() const
 607 {
 608   return header;
 609 }
 610
 611 std::string
 612 Sequence::get_name() const
 613 {
 614   if (header.size() > 0)
 615     return header;
 616   else if (species.size() > 0)
 617     return species;
 618   else
 619     return "";
 620 }
 621
 622 void Sequence::set_sequence(const std::string& s, AlphabetRef a)
 623 {
 624   set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
 625 }
 626
 627 std::string Sequence::get_sequence() const
 628 {
 629   return seq->sequence();
 630 }
 631
 632 Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
 633 {
 634   return at(i);
 635 }
 636
 637 void
 638 Sequence::clear()
 639 {
 640   seq.reset();
 641   header.clear();
 642   species.clear();
 643   annots.clear();
 644   motif_list.reset(new MotifList);
 645 }
 646
 647 void
 648 Sequence::save(fs::fstream &save_file)
 649 {
 650   //fstream save_file;
 651   std::list<annot>::iterator annots_i;
 652
 653   // not sure why, or if i'm doing something wrong, but can't seem to pass
 654   // file pointers down to this method from the mussa control class
 655   // so each call to save a sequence appends to the file started by mussa_class
 656   //save_file.open(save_file_path.c_str(), std::ios::app);
 657
 658   save_file << "<Sequence>" << std::endl;
 659   save_file << *this << std::endl;
 660   save_file << "</Sequence>" << std::endl;
 661
 662   save_file << "<Annotations>" << std::endl;
 663   save_file << species << std::endl;
 664   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 665   {
 666     save_file << annots_i->begin << " " << annots_i->end << " " ;
 667     save_file << annots_i->name << " " << annots_i->type << std::endl;
 668   }
 669   save_file << "</Annotations>" << std::endl;
 670   //save_file.close();
 671 }
 672
 673 void
 674 Sequence::load_museq(fs::path load_file_path, int seq_num)
 675 {
 676   fs::fstream load_file;
 677   std::string file_data_line;
 678   int seq_counter;
 679   annot an_annot;
 680   std::string::size_type space_split_i;
 681   std::string annot_value;
 682
 683   annots.clear();
 684   load_file.open(load_file_path, std::ios::in);
 685
 686   seq_counter = 0;
 687   // search for the seq_num-th sequence
 688   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 689   {
 690     getline(load_file,file_data_line);
 691     if (file_data_line == "<Sequence>")
 692       seq_counter++;
 693   }
 694   getline(load_file, file_data_line);
 695   // looks like the sequence is written as a single line
 696   set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
 697   getline(load_file, file_data_line);
 698   getline(load_file, file_data_line);
 699   if (file_data_line == "<Annotations>")
 700   {
 701     getline(load_file, file_data_line);
 702     species = file_data_line;
 703     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 704     {
 705       getline(load_file,file_data_line);
 706       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 707       {
 708         // need to get 4 values...almost same code 4 times...
 709         // get annot start index
 710         space_split_i = file_data_line.find(" ");
 711         annot_value = file_data_line.substr(0,space_split_i);
 712         an_annot.begin = atoi (annot_value.c_str());
 713         file_data_line = file_data_line.substr(space_split_i+1);
 714         // get annot end index
 715         space_split_i = file_data_line.find(" ");
 716         annot_value = file_data_line.substr(0,space_split_i);
 717         an_annot.end = atoi (annot_value.c_str());
 718
 719         if (space_split_i == std::string::npos)  // no entry for type or name
 720         {
 721           std::cout << "seq, annots - no type or name\n";
 722           an_annot.type = "";
 723           an_annot.name = "";
 724         }
 725         else   // else get annot type
 726         {
 727           file_data_line = file_data_line.substr(space_split_i+1);
 728           space_split_i = file_data_line.find(" ");
 729           annot_value = file_data_line.substr(0,space_split_i);
 730           an_annot.type = annot_value;
 731           if (space_split_i == std::string::npos)  // no entry for name
 732           {
 733             std::cout << "seq, annots - no name\n";
 734             an_annot.name = "";
 735           }
 736           else          // get annot name
 737           {
 738             file_data_line = file_data_line.substr(space_split_i+1);
 739             space_split_i = file_data_line.find(" ");
 740             annot_value = file_data_line.substr(0,space_split_i);
 741             an_annot.type = annot_value;
 742           }
 743         }
 744         annots.push_back(an_annot);  // don't forget to actually add the annot
 745       }
 746       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 747       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
 748     }
 749   }
 750   load_file.close();
 751 }
 752
 753
 754 void Sequence::add_motif(const Sequence& a_motif)
 755 {
 756   std::vector<int> motif_starts = find_motif(a_motif);
 757
 758   for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
 759       motif_start_i != motif_starts.end();
 760       ++motif_start_i)
 761   {
 762     motif_list->push_back(motif(*motif_start_i, a_motif.get_sequence()));
 763   }
 764 }
 765
 766 void Sequence::clear_motifs()
 767 {
 768   if (motif_list)
 769     motif_list->clear();
 770   else
 771     motif_list.reset(new MotifList);
 772 }
 773
 774 const Sequence::MotifList& Sequence::motifs() const
 775 {
 776   return *motif_list;
 777 }
 778
 779 std::vector<int>
 780 Sequence::find_motif(const Sequence& a_motif) const
 781 {
 782   std::vector<int> motif_match_starts;
 783   Sequence norm_motif_rc;
 784
 785   motif_match_starts.clear();
 786   // std::cout << "motif is: " << norm_motif << std::endl;
 787
 788   if (a_motif.size() > 0)
 789   {
 790     //std::cout << "Sequence: none blank motif\n";
 791     motif_scan(a_motif, &motif_match_starts);
 792
 793     norm_motif_rc = a_motif.rev_comp();;
 794     // make sure not to do search again if it is a palindrome
 795     if (norm_motif_rc != a_motif) {
 796       motif_scan(norm_motif_rc, &motif_match_starts);
 797     }
 798   }
 799   return motif_match_starts;
 800 }
 801
 802 void
 803 Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
 804 {
 805   // if there's no sequence we can't scan for it?
 806   // should this throw an exception?
 807   if (!seq) return;
 808
 809   std::string::size_type seq_i = 0;
 810   Sequence::size_type motif_i = 0;
 811   Sequence::size_type motif_len = a_motif.length();
 812   Sequence::value_type motif_char;
 813   Sequence::value_type seq_char;
 814
 815   while (seq_i < size())
 816   {
 817     // this is pretty much a straight translation of Nora's python code
 818     // to match iupac letter codes
 819     motif_char = toupper(a_motif[motif_i]);
 820     seq_char = toupper(seq->at(seq_i));
 821     if (motif_char =='N')
 822       motif_i++;
 823     else if (motif_char == seq_char)
 824       motif_i++;
 825     else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
 826       motif_i++;
 827     else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
 828       motif_i++;
 829     else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
 830       motif_i++;
 831     else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
 832       motif_i++;
 833     else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
 834       motif_i++;
 835     else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
 836       motif_i++;
 837     else if ((motif_char =='V') &&
 838              ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
 839       motif_i++;
 840     else if ((motif_char =='H') &&
 841              ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
 842       motif_i++;
 843     else if ((motif_char =='D') &&
 844              ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
 845       motif_i++;
 846     else if ((motif_char =='B') &&
 847              ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
 848       motif_i++;
 849     else
 850     {
 851       // if a motif doesn't match, erase our current trial and try again
 852       seq_i -= motif_i;
 853       motif_i = 0;
 854     }
 855
 856     // end Nora stuff, now we see if a match is found this pass
 857     if (motif_i == motif_len)
 858     {
 859       annot new_motif;
 860       motif_match_starts->push_back(seq_i - motif_len + 1);
 861       motif_i = 0;
 862     }
 863
 864     seq_i++;
 865   }
 866   //std::cout << std::endl;
 867 }
 868
 869 void Sequence::add_string_annotation(std::string a_seq,
 870                                      std::string name)
 871 {
 872   std::vector<int> seq_starts = find_motif(a_seq);
 873
 874   //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
 875
 876   for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
 877       seq_start_i != seq_starts.end();
 878       ++seq_start_i)
 879   {
 880     annots.push_back(annot(*seq_start_i,
 881                            *seq_start_i+a_seq.size(),
 882                            "",
 883                            name));
 884   }
 885 }
 886
 887 void Sequence::find_sequences(std::list<Sequence>::iterator start,
 888                               std::list<Sequence>::iterator end)
 889 {
 890   while (start != end) {
 891     add_string_annotation(start->get_sequence(), start->get_fasta_header());
 892     ++start;
 893   }
 894 }
 895
 896
 897 std::ostream& operator<<(std::ostream& out, const Sequence& s)
 898 {
 899   if (s.seq) {
 900     for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
 901       out << *s_i;
 902     }
 903   }
 904   return out;
 905 }
 906
 907 bool operator<(const Sequence& x, const Sequence& y)
 908 {
 909   Sequence::const_iterator x_i = x.begin();
 910   Sequence::const_iterator y_i = y.begin();
 911   // for sequences there's some computation associated with computing .end
 912   // so lets cache it.
 913   Sequence::const_iterator xend = x.end();
 914   Sequence::const_iterator yend = y.end();
 915   while(1) {
 916     if( x_i == xend and y_i == yend ) {
 917       return false;
 918     } else if ( x_i == xend ) {
 919       return true;
 920     } else if ( y_i == yend ) {
 921       return false;
 922     } else if ( (*x_i) < (*y_i)) {
 923       return true;
 924     } else if ( (*x_i) > (*y_i) ) {
 925       return false;
 926     } else {
 927       ++x_i;
 928       ++y_i;
 929     }
 930   }
 931 }
 932
 933 template <typename Iter1, typename Iter2>
 934 static
 935 bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
 936 {
 937   Iter1 aseq_i = abegin;
 938   Iter2 bseq_i = bbegin;
 939   if (aend-abegin == bend-bbegin) {
 940     // since the length of the two sequences is equal, we only need to
 941     // test one.
 942     for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
 943       if (toupper(*aseq_i) != toupper(*bseq_i)) {
 944         return false;
 945       }
 946     }
 947     return true;
 948   } else {
 949     return false;
 950   }
 951 }
 952
 953 bool operator==(const Sequence& x, const Sequence& y)
 954 {
 955   if (x.seq and y.seq) {
 956     // both x and y are defined
 957     if (SeqSpan::isFamily(x.seq, y.seq)) {
 958       // both are part of the same SeqSpan tree
 959       return *(x.seq) == *(y.seq);
 960     } else {
 961       // we'll have to do a real comparison
 962       return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
 963                x.begin(), x.end(),
 964                y.begin(), y.end()
 965              );
 966     }
 967   } else {
 968     // true if they're both empty (with either a null SeqSpanRef or
 969     // a zero length string
 970     return (x.size() == y.size());
 971   }
 972 }
 973
 974 bool operator!=(const Sequence& x, const Sequence& y)
 975 {
 976   return not operator==(x, y);
 977 }
 978