alg/sequence.cpp

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24 #include <boost/filesystem/fstream.hpp>
  25 #include <boost/filesystem/operations.hpp>
  26 namespace fs = boost::filesystem;
  27
  28 #include <boost/spirit/core.hpp>
  29 #include <boost/spirit/actor/push_back_actor.hpp>
  30 #include <boost/spirit/iterator/file_iterator.hpp>
  31 #include <boost/spirit/utility/chset.hpp>
  32 namespace spirit = boost::spirit;
  33
  34 #include "alg/sequence.hpp"
  35 #include "mussa_exceptions.hpp"
  36
  37 #include <string>
  38 #include <stdexcept>
  39 #include <iostream>
  40 #include <sstream>
  41 #include <set>
  42
  43 annot::annot()
  44  : begin(0),
  45    end(0),
  46    type(""),
  47    name("")
  48 {
  49 }
  50
  51 annot::annot(int begin, int end, std::string type, std::string name)
  52  : begin(begin),
  53    end(end),
  54    type(type),
  55    name(name)
  56 {
  57 }
  58
  59 annot::~annot()
  60 {
  61 }
  62
  63 bool operator==(const annot& left, const annot& right)
  64 {
  65   return ((left.begin== right.begin) and
  66           (left.end == right.end) and
  67           (left.type == right.type) and
  68           (left.name == right.name));
  69 }
  70
  71 motif::motif(int begin, std::string motif)
  72  : annot(begin, begin+motif.size(), "motif", motif),
  73    sequence(motif)
  74 {
  75 }
  76
  77 motif::~motif()
  78 {
  79 }
  80
  81
  82 Sequence::Sequence(AlphabetRef alphabet)
  83   : seq(new SeqSpan("", alphabet, SeqSpan::PlusStrand))
  84 {
  85 }
  86
  87 Sequence::~Sequence()
  88 {
  89 }
  90
  91 Sequence::Sequence(const char *seq, AlphabetRef alphabet_, SeqSpan::strand_type strand_)
  92   : header(""),
  93     species("")
  94 {
  95   set_filtered_sequence(seq, alphabet_, 0, npos, strand_);
  96 }
  97
  98 Sequence::Sequence(const std::string& seq,
  99                    AlphabetRef alphabet_,
 100                    SeqSpan::strand_type strand_)
 101   : header(""),
 102     species("")
 103 {
 104   set_filtered_sequence(seq, alphabet_, 0, seq.size(), strand_);
 105 }
 106
 107 Sequence::Sequence(const Sequence& o)
 108   : seq(o.seq),
 109     header(o.header),
 110     species(o.species),
 111     annots(o.annots),
 112     motif_list(o.motif_list)
 113 {
 114 }
 115
 116 Sequence::Sequence(const Sequence* o)
 117   : seq(o->seq),
 118     header(o->header),
 119     species(o->species),
 120     annots(o->annots),
 121     motif_list(o->motif_list)
 122 {
 123 }
 124
 125 Sequence::Sequence(const SequenceRef o)
 126   : seq(new SeqSpan(o->seq)),
 127     header(o->header),
 128     species(o->species),
 129     annots(o->annots),
 130     motif_list(o->motif_list)
 131 {
 132 }
 133
 134 Sequence::Sequence(const SeqSpanRef& seq_ref)
 135   : seq(seq_ref),
 136     header(""),
 137     species("")
 138 {
 139 }
 140
 141 Sequence &Sequence::operator=(const Sequence& s)
 142 {
 143   if (this != &s) {
 144     seq = s.seq;
 145     header = s.header;
 146     species = s.species;
 147     annots = s.annots;
 148     motif_list = s.motif_list;
 149   }
 150   return *this;
 151 }
 152
 153 static void multiplatform_getline(std::istream& in, std::string& line)
 154 {
 155   line.clear();
 156   char c;
 157   in.get(c);
 158   while(in.good() and !(c == '\012' or c == '\015') ) {
 159     line.push_back(c);
 160     in.get(c);
 161   }
 162   // if we have cr-lf eat it
 163   c = in.peek();
 164   if (c=='\012' or c == '\015') {
 165     in.get();
 166   }
 167 }
 168
 169 void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
 170 {
 171   load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 172 }
 173
 174 //! load a fasta file into a sequence
 175 void Sequence::load_fasta(fs::path file_path, AlphabetRef a,
 176                           int seq_num, int start_index, int end_index)
 177 {
 178   fs::fstream data_file;
 179   data_file.open(file_path, std::ios::in);
 180
 181   if (!data_file.good())
 182   {
 183     throw mussa_load_error("Sequence File: "+file_path.string()+" not found");
 184   } else {
 185     try {
 186       load_fasta(data_file, a, seq_num, start_index, end_index);
 187     } catch(sequence_empty_error e) {
 188       // there doesn't appear to be any sequence
 189       // catch and rethrow to include the filename
 190       std::stringstream msg;
 191       msg << "The selected sequence in "
 192           << file_path.native_file_string()
 193           << " appears to be empty";
 194       throw sequence_empty_error(msg.str());
 195     } catch(sequence_empty_file_error e) {
 196       std::stringstream errormsg;
 197       errormsg << file_path.native_file_string()
 198                << " did not have any fasta sequences" << std::endl;
 199       throw sequence_empty_file_error(errormsg.str());
 200     } catch(sequence_invalid_load_error e) {
 201       std::ostringstream msg;
 202       msg << file_path.native_file_string();
 203       msg << " " << e.what();
 204       throw sequence_invalid_load_error(msg.str());
 205     }
 206   }
 207 }
 208
 209 void Sequence::load_fasta(std::istream& file,
 210                           int seq_num, int start_index, int end_index)
 211 {
 212   load_fasta(file, reduced_nucleic_alphabet, seq_num, start_index, end_index);
 213 }
 214
 215 void
 216 Sequence::load_fasta(std::istream& data_file, AlphabetRef a,
 217                      int seq_num,
 218                      int start_index, int end_index)
 219 {
 220   std::string file_data_line;
 221   int header_counter = 0;
 222   size_t line_counter = 0;
 223   bool read_seq = true;
 224   std::string rev_comp;
 225   std::string sequence_raw;
 226   std::string seq_tmp;      // holds sequence during basic filtering
 227   const Alphabet &alpha = Alphabet::get_alphabet(a);
 228
 229   if (seq_num == 0) {
 230     throw mussa_load_error("fasta sequence number is 1 based (can't be 0)");
 231   }
 232
 233   // search for the header of the fasta sequence we want
 234   while ( (!data_file.eof()) && (header_counter < seq_num) )
 235   {
 236     multiplatform_getline(data_file, file_data_line);
 237     ++line_counter;
 238     if (file_data_line.substr(0,1) == ">")
 239       header_counter++;
 240   }
 241
 242   if (header_counter > 0) {
 243     header = file_data_line.substr(1);
 244
 245     sequence_raw = "";
 246
 247     while ( !data_file.eof() && read_seq ) {
 248       multiplatform_getline(data_file,file_data_line);
 249       ++line_counter;
 250       if (file_data_line.substr(0,1) == ">")
 251         read_seq = false;
 252       else {
 253         for (std::string::const_iterator line_i = file_data_line.begin();
 254              line_i != file_data_line.end();
 255              ++line_i)
 256          {
 257            if(alpha.exists(*line_i)) {
 258              sequence_raw += *line_i;
 259            } else {
 260             std::ostringstream msg;
 261             msg << "Unrecognized characters in fasta sequence at line ";
 262             msg << line_counter;
 263             throw sequence_invalid_load_error(msg.str());
 264            }
 265          }
 266       }
 267     }
 268
 269     // Lastly, if subselection of the sequence was specified we keep cut out
 270     // and only keep that part
 271     // end_index = 0 means no end was specified, so cut to the end
 272     if (end_index == 0)
 273       end_index = sequence_raw.size();
 274
 275     // sequence filtering for upcasing agctn and convert non AGCTN to N
 276     if (end_index-start_index <= 0) {
 277       std::string msg("The selected sequence appears to be empty");
 278       throw sequence_empty_error(msg);
 279     }
 280     set_filtered_sequence(sequence_raw, a, start_index, end_index-start_index, SeqSpan::PlusStrand);
 281   } else {
 282     std::string errormsg("There were no fasta sequences");
 283     throw sequence_empty_file_error(errormsg);
 284   }
 285 }
 286
 287 void Sequence::set_filtered_sequence(const std::string &in_seq,
 288                                      AlphabetRef alphabet_,
 289                                      size_type start,
 290                                      size_type count,
 291                                      SeqSpan::strand_type strand_)
 292 {
 293   if ( count == npos)
 294     count = in_seq.size() - start;
 295   std::string new_seq;
 296   new_seq.reserve(count);
 297
 298   // finally, the actual conversion loop
 299   const Alphabet& alpha_impl = Alphabet::get_alphabet(alphabet_); // go get one of our actual alphabets
 300   std::string::const_iterator seq_i = in_seq.begin()+start;
 301   for(size_type i = 0; i != count; ++i, ++seq_i)
 302   {
 303     if (alpha_impl.exists(*seq_i)) {
 304       new_seq.append(1, toupper(*seq_i));
 305     } else {
 306       new_seq.append(1, 'N');
 307     }
 308   }
 309   SeqSpanRef new_seq_ref(new SeqSpan(new_seq, alphabet_, strand_));
 310   seq = new_seq_ref;
 311 }
 312
 313 void
 314 Sequence::load_annot(fs::path file_path, int start_index, int end_index)
 315 {
 316   if (not fs::exists(file_path)) {
 317     throw mussa_load_error("Annotation File " + file_path.string() + " was not found");
 318   }
 319   if (fs::is_directory(file_path)) {
 320     throw mussa_load_error(file_path.string() +
 321             " is a directory, please provide a file for annotations."
 322           );
 323   }
 324   fs::fstream data_stream(file_path, std::ios::in);
 325   if (!data_stream)
 326   {
 327     throw mussa_load_error("Error loading annotation file " + file_path.string());
 328   }
 329
 330   // so i should probably be passing the parse function some iterators
 331   // but the annotations files are (currently) small, so i think i can
 332   // get away with loading the whole file into memory
 333   std::string data;
 334   char c;
 335   while(data_stream.good()) {
 336     data_stream.get(c);
 337     data.push_back(c);
 338   }
 339   data_stream.close();
 340
 341   try {
 342     parse_annot(data, start_index, end_index);
 343   } catch(annotation_load_error e) {
 344     std::ostringstream msg;
 345     msg << file_path.native_file_string()
 346         << " "
 347         << e.what();
 348     throw annotation_load_error(msg.str());
 349   }
 350 }
 351
 352 /* If this works, yikes, this is some brain hurting code.
 353  *
 354  * what's going on is that when pb_annot is instantiated it stores references
 355  * to begin, end, name, type, declared in the parse function, then
 356  * when operator() is called it grabs values from those references
 357  * and uses that to instantiate an annot object and append that to our
 358  * annotation list.
 359  *
 360  * This weirdness is because the spirit library requires that actions
 361  * conform to a specific prototype operator()(IteratorT, IteratorT)
 362  * which doesn't provide any useful opportunity for me to actually
 363  * grab the results of our parsing.
 364  *
 365  * so I instantiate this structure in order to have a place to grab
 366  * my data from.
 367  */
 368
 369 struct push_back_annot {
 370   std::list<annot>& annot_list;
 371   int& begin;
 372   int& end;
 373   std::string& name;
 374   std::string& type;
 375   int &parsed;
 376
 377   push_back_annot(std::list<annot>& annot_list_,
 378                   int& begin_,
 379                   int& end_,
 380                   std::string& name_,
 381                   std::string& type_,
 382                   int &parsed_)
 383   : annot_list(annot_list_),
 384     begin(begin_),
 385     end(end_),
 386     name(name_),
 387     type(type_),
 388     parsed(parsed_)
 389   {
 390   }
 391
 392   void operator()(std::string::const_iterator,
 393                   std::string::const_iterator) const
 394   {
 395     //std::cout << "adding annot: " << begin << "|" << end << "|" << name << "|" << type << std::endl;
 396     annot_list.push_back(annot(begin, end, name, type));
 397     ++parsed;
 398   };
 399 };
 400
 401 struct push_back_seq {
 402   std::list<Sequence>& seq_list;
 403   std::string& name;
 404   std::string& seq;
 405   int &parsed;
 406
 407   push_back_seq(std::list<Sequence>& seq_list_,
 408                 std::string& name_,
 409                 std::string& seq_,
 410                 int &parsed_)
 411   : seq_list(seq_list_),
 412     name(name_),
 413     seq(seq_),
 414     parsed(parsed_)
 415   {
 416   }
 417
 418   void operator()(std::string::const_iterator,
 419                   std::string::const_iterator) const
 420   {
 421     // filter out newlines from our sequence
 422     std::string new_seq;
 423     for(std::string::const_iterator seq_i = seq.begin();
 424         seq_i != seq.end();
 425         ++seq_i)
 426     {
 427       if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
 428     }
 429     //std::cout << "adding seq: " << name << " " << new_seq << std::endl;
 430
 431     Sequence s(new_seq);
 432     s.set_fasta_header(name);
 433     seq_list.push_back(s);
 434     ++parsed;
 435   };
 436 };
 437
 438 void
 439 Sequence::parse_annot(std::string data, int start_index, int end_index)
 440 {
 441   int start=0;
 442   int end=0;
 443   std::string name;
 444   std::string type;
 445   std::string seq;
 446   std::list<annot> parsed_annots;
 447   std::list<Sequence> query_seqs;
 448   int parsed=0;
 449
 450   bool ok = spirit::parse(data.begin(), data.end(),
 451               (
 452                //begin grammar
 453                  !(
 454                     (
 455                       spirit::alpha_p >>
 456                       +(spirit::graph_p)
 457                     )[spirit::assign_a(species)] >>
 458                     +(spirit::space_p)
 459                   ) >>
 460                   *(
 461                      ( // ignore html tags
 462                        *(spirit::space_p) >>
 463                        spirit::ch_p('<') >>
 464                        +(~spirit::ch_p('>')) >>
 465                        spirit::ch_p('>') >>
 466                        *(spirit::space_p)
 467                      )
 468                    |
 469                     ( // parse an absolute location name
 470                      (spirit::uint_p[spirit::assign_a(start)] >>
 471                       +spirit::space_p >>
 472                       spirit::uint_p[spirit::assign_a(end)] >>
 473                       +spirit::space_p >>
 474                       (
 475                          spirit::alpha_p >>
 476                          *spirit::graph_p
 477                       )[spirit::assign_a(name)] >>
 478                       // optional type
 479                       !(
 480                           +spirit::space_p >>
 481                           (
 482                             spirit::alpha_p >>
 483                             *spirit::graph_p
 484                           )[spirit::assign_a(type)]
 485                       )
 486                       // to understand how this group gets set
 487                       // read the comment above struct push_back_annot
 488                      )[push_back_annot(parsed_annots, start, end, type, name, parsed)]
 489                    |
 490                     ((spirit::ch_p('>')|spirit::str_p("&gt;")) >>
 491                        (*(spirit::print_p))[spirit::assign_a(name)] >>
 492                        spirit::eol_p >>
 493                        (+(spirit::chset<>(Alphabet::nucleic_cstr)))[spirit::assign_a(seq)]
 494                      )[push_back_seq(query_seqs, name, seq, parsed)]
 495                     ) >>
 496                     *spirit::space_p
 497                    )
 498               //end grammar
 499               )).full;
 500   if (not ok) {
 501     std::stringstream msg;
 502     msg << "Error parsing annotation #" << parsed;
 503     throw annotation_load_error(msg.str());
 504   }
 505   // add newly parsed annotations to our sequence
 506   std::copy(parsed_annots.begin(), parsed_annots.end(), std::back_inserter(annots));
 507   // go seearch for query sequences
 508   find_sequences(query_seqs.begin(), query_seqs.end());
 509 }
 510
 511 void Sequence::add_annotation(const annot& a)
 512 {
 513   annots.push_back(a);
 514 }
 515
 516 const std::list<annot>& Sequence::annotations() const
 517 {
 518   return annots;
 519 }
 520
 521 void Sequence::copy_children(Sequence &new_seq, size_type start, size_type count) const
 522 {
 523   new_seq.motif_list = motif_list;
 524   new_seq.annots.clear();
 525
 526   for(std::list<annot>::const_iterator annot_i = annots.begin();
 527       annot_i != annots.end();
 528       ++annot_i)
 529   {
 530     size_type annot_begin= annot_i->begin;
 531     size_type annot_end = annot_i->end;
 532
 533     if (annot_begin < start+count) {
 534       if (annot_begin >= start) {
 535         annot_begin -= start;
 536       } else {
 537         annot_begin = 0;
 538       }
 539
 540       if (annot_end < start+count) {
 541         annot_end -= start;
 542       } else {
 543         annot_end = count;
 544       }
 545
 546       annot new_annot(annot_begin, annot_end, annot_i->type, annot_i->name);
 547       new_seq.annots.push_back(new_annot);
 548     }
 549   }
 550 }
 551
 552 Sequence
 553 Sequence::subseq(size_type start, size_type count, SeqSpan::strand_type strand) const
 554 {
 555   // FIXME: should i really allow a subsequence of an empty sequence?
 556   if (!seq) {
 557     Sequence new_seq;
 558     return new_seq;
 559   }
 560
 561   Sequence new_seq = *this;
 562   new_seq.seq = seq->subseq(start, count, strand);
 563   if (seq->annotations()) {
 564     AnnotationsRef a(new Annotations(*(seq->annotations())));
 565     new_seq.seq->setAnnotations(a);
 566   }
 567   copy_children(new_seq, start, count);
 568
 569   return new_seq;
 570 }
 571
 572
 573 // FIXME: This needs to be moved into SeqSpan
 574 Sequence Sequence::rev_comp() const
 575 {
 576   // a reverse complement is the whole opposite strand
 577   return subseq(0, npos, SeqSpan::OppositeStrand);
 578 }
 579
 580 const Alphabet& Sequence::get_alphabet() const
 581 {
 582   return (seq) ? seq->get_alphabet() : Alphabet::empty_alphabet();
 583 }
 584
 585 void Sequence::set_fasta_header(std::string header_)
 586 {
 587   header = header_;
 588 }
 589
 590 void Sequence::set_species(const std::string& name)
 591 {
 592   species = name;
 593 }
 594
 595 std::string Sequence::get_species() const
 596 {
 597   return species;
 598 }
 599
 600
 601 std::string
 602 Sequence::get_fasta_header() const
 603 {
 604   return header;
 605 }
 606
 607 std::string
 608 Sequence::get_name() const
 609 {
 610   if (header.size() > 0)
 611     return header;
 612   else if (species.size() > 0)
 613     return species;
 614   else
 615     return "";
 616 }
 617
 618 void Sequence::set_sequence(const std::string& s, AlphabetRef a)
 619 {
 620   set_filtered_sequence(s, a, 0, s.size(), SeqSpan::PlusStrand);
 621 }
 622
 623 std::string Sequence::get_sequence() const
 624 {
 625   return seq->sequence();
 626 }
 627
 628 Sequence::const_reference Sequence::operator[](Sequence::size_type i) const
 629 {
 630   return at(i);
 631 }
 632
 633 void
 634 Sequence::clear()
 635 {
 636   seq.reset();
 637   header.clear();
 638   species.clear();
 639   annots.clear();
 640   motif_list.clear();
 641 }
 642
 643 void
 644 Sequence::save(fs::fstream &save_file)
 645 {
 646   //fstream save_file;
 647   std::list<annot>::iterator annots_i;
 648
 649   // not sure why, or if i'm doing something wrong, but can't seem to pass
 650   // file pointers down to this method from the mussa control class
 651   // so each call to save a sequence appends to the file started by mussa_class
 652   //save_file.open(save_file_path.c_str(), std::ios::app);
 653
 654   save_file << "<Sequence>" << std::endl;
 655   save_file << *this << std::endl;
 656   save_file << "</Sequence>" << std::endl;
 657
 658   save_file << "<Annotations>" << std::endl;
 659   save_file << species << std::endl;
 660   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 661   {
 662     save_file << annots_i->begin << " " << annots_i->end << " " ;
 663     save_file << annots_i->name << " " << annots_i->type << std::endl;
 664   }
 665   save_file << "</Annotations>" << std::endl;
 666   //save_file.close();
 667 }
 668
 669 void
 670 Sequence::load_museq(fs::path load_file_path, int seq_num)
 671 {
 672   fs::fstream load_file;
 673   std::string file_data_line;
 674   int seq_counter;
 675   annot an_annot;
 676   std::string::size_type space_split_i;
 677   std::string annot_value;
 678
 679   annots.clear();
 680   load_file.open(load_file_path, std::ios::in);
 681
 682   seq_counter = 0;
 683   // search for the seq_num-th sequence
 684   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 685   {
 686     getline(load_file,file_data_line);
 687     if (file_data_line == "<Sequence>")
 688       seq_counter++;
 689   }
 690   getline(load_file, file_data_line);
 691   // looks like the sequence is written as a single line
 692   set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
 693   getline(load_file, file_data_line);
 694   getline(load_file, file_data_line);
 695   if (file_data_line == "<Annotations>")
 696   {
 697     getline(load_file, file_data_line);
 698     species = file_data_line;
 699     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 700     {
 701       getline(load_file,file_data_line);
 702       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 703       {
 704         // need to get 4 values...almost same code 4 times...
 705         // get annot start index
 706         space_split_i = file_data_line.find(" ");
 707         annot_value = file_data_line.substr(0,space_split_i);
 708         an_annot.begin = atoi (annot_value.c_str());
 709         file_data_line = file_data_line.substr(space_split_i+1);
 710         // get annot end index
 711         space_split_i = file_data_line.find(" ");
 712         annot_value = file_data_line.substr(0,space_split_i);
 713         an_annot.end = atoi (annot_value.c_str());
 714
 715         if (space_split_i == std::string::npos)  // no entry for type or name
 716         {
 717           std::cout << "seq, annots - no type or name\n";
 718           an_annot.type = "";
 719           an_annot.name = "";
 720         }
 721         else   // else get annot type
 722         {
 723           file_data_line = file_data_line.substr(space_split_i+1);
 724           space_split_i = file_data_line.find(" ");
 725           annot_value = file_data_line.substr(0,space_split_i);
 726           an_annot.type = annot_value;
 727           if (space_split_i == std::string::npos)  // no entry for name
 728           {
 729             std::cout << "seq, annots - no name\n";
 730             an_annot.name = "";
 731           }
 732           else          // get annot name
 733           {
 734             file_data_line = file_data_line.substr(space_split_i+1);
 735             space_split_i = file_data_line.find(" ");
 736             annot_value = file_data_line.substr(0,space_split_i);
 737             an_annot.type = annot_value;
 738           }
 739         }
 740         annots.push_back(an_annot);  // don't forget to actually add the annot
 741       }
 742       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 743       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
 744     }
 745   }
 746   load_file.close();
 747 }
 748
 749
 750 void Sequence::add_motif(const Sequence& a_motif)
 751 {
 752   std::vector<int> motif_starts = find_motif(a_motif);
 753
 754   for(std::vector<int>::iterator motif_start_i = motif_starts.begin();
 755       motif_start_i != motif_starts.end();
 756       ++motif_start_i)
 757   {
 758     motif_list.push_back(motif(*motif_start_i, a_motif.get_sequence()));
 759   }
 760 }
 761
 762 void Sequence::clear_motifs()
 763 {
 764   motif_list.clear();
 765 }
 766
 767 const std::list<motif>& Sequence::motifs() const
 768 {
 769   return motif_list;
 770 }
 771
 772 std::vector<int>
 773 Sequence::find_motif(const Sequence& a_motif) const
 774 {
 775   std::vector<int> motif_match_starts;
 776   Sequence norm_motif_rc;
 777
 778   motif_match_starts.clear();
 779   // std::cout << "motif is: " << norm_motif << std::endl;
 780
 781   if (a_motif.size() > 0)
 782   {
 783     //std::cout << "Sequence: none blank motif\n";
 784     motif_scan(a_motif, &motif_match_starts);
 785
 786     norm_motif_rc = a_motif.rev_comp();;
 787     // make sure not to do search again if it is a palindrome
 788     if (norm_motif_rc != a_motif) {
 789       motif_scan(norm_motif_rc, &motif_match_starts);
 790     }
 791   }
 792   return motif_match_starts;
 793 }
 794
 795 void
 796 Sequence::motif_scan(const Sequence& a_motif, std::vector<int> * motif_match_starts) const
 797 {
 798   // if there's no sequence we can't scan for it?
 799   // should this throw an exception?
 800   if (!seq) return;
 801
 802   std::string::size_type seq_i = 0;
 803   Sequence::size_type motif_i = 0;
 804   Sequence::size_type motif_len = a_motif.length();
 805   Sequence::value_type motif_char;
 806   Sequence::value_type seq_char;
 807
 808   while (seq_i < size())
 809   {
 810     // this is pretty much a straight translation of Nora's python code
 811     // to match iupac letter codes
 812     motif_char = toupper(a_motif[motif_i]);
 813     seq_char = toupper(seq->at(seq_i));
 814     if (motif_char =='N')
 815       motif_i++;
 816     else if (motif_char == seq_char)
 817       motif_i++;
 818     else if ((motif_char =='M') && ((seq_char=='A') || (seq_char=='C')))
 819       motif_i++;
 820     else if ((motif_char =='R') && ((seq_char=='A') || (seq_char=='G')))
 821       motif_i++;
 822     else if ((motif_char =='W') && ((seq_char=='A') || (seq_char=='T')))
 823       motif_i++;
 824     else if ((motif_char =='S') && ((seq_char=='C') || (seq_char=='G')))
 825       motif_i++;
 826     else if ((motif_char =='Y') && ((seq_char=='C') || (seq_char=='T')))
 827       motif_i++;
 828     else if ((motif_char =='K') && ((seq_char=='G') || (seq_char=='T')))
 829       motif_i++;
 830     else if ((motif_char =='V') &&
 831              ((seq_char=='A') || (seq_char=='C') || (seq_char=='G')))
 832       motif_i++;
 833     else if ((motif_char =='H') &&
 834              ((seq_char=='A') || (seq_char=='C') || (seq_char=='T')))
 835       motif_i++;
 836     else if ((motif_char =='D') &&
 837              ((seq_char=='A') || (seq_char=='G') || (seq_char=='T')))
 838       motif_i++;
 839     else if ((motif_char =='B') &&
 840              ((seq_char=='C') || (seq_char=='G') || (seq_char=='T')))
 841       motif_i++;
 842     else
 843     {
 844       // if a motif doesn't match, erase our current trial and try again
 845       seq_i -= motif_i;
 846       motif_i = 0;
 847     }
 848
 849     // end Nora stuff, now we see if a match is found this pass
 850     if (motif_i == motif_len)
 851     {
 852       annot new_motif;
 853       motif_match_starts->push_back(seq_i - motif_len + 1);
 854       motif_i = 0;
 855     }
 856
 857     seq_i++;
 858   }
 859   //std::cout << std::endl;
 860 }
 861
 862 void Sequence::add_string_annotation(std::string a_seq,
 863                                      std::string name)
 864 {
 865   std::vector<int> seq_starts = find_motif(a_seq);
 866
 867   //std::cout << "searching for " << a_seq << " found " << seq_starts.size() << std::endl;
 868
 869   for(std::vector<int>::iterator seq_start_i = seq_starts.begin();
 870       seq_start_i != seq_starts.end();
 871       ++seq_start_i)
 872   {
 873     annots.push_back(annot(*seq_start_i,
 874                            *seq_start_i+a_seq.size(),
 875                            "",
 876                            name));
 877   }
 878 }
 879
 880 void Sequence::find_sequences(std::list<Sequence>::iterator start,
 881                               std::list<Sequence>::iterator end)
 882 {
 883   while (start != end) {
 884     add_string_annotation(start->get_sequence(), start->get_fasta_header());
 885     ++start;
 886   }
 887 }
 888
 889
 890 std::ostream& operator<<(std::ostream& out, const Sequence& s)
 891 {
 892   if (s.seq) {
 893     for(Sequence::const_iterator s_i = s.begin(); s_i != s.end(); ++s_i) {
 894       out << *s_i;
 895     }
 896   }
 897   return out;
 898 }
 899
 900 bool operator<(const Sequence& x, const Sequence& y)
 901 {
 902   Sequence::const_iterator x_i = x.begin();
 903   Sequence::const_iterator y_i = y.begin();
 904   // for sequences there's some computation associated with computing .end
 905   // so lets cache it.
 906   Sequence::const_iterator xend = x.end();
 907   Sequence::const_iterator yend = y.end();
 908   while(1) {
 909     if( x_i == xend and y_i == yend ) {
 910       return false;
 911     } else if ( x_i == xend ) {
 912       return true;
 913     } else if ( y_i == yend ) {
 914       return false;
 915     } else if ( (*x_i) < (*y_i)) {
 916       return true;
 917     } else if ( (*x_i) > (*y_i) ) {
 918       return false;
 919     } else {
 920       ++x_i;
 921       ++y_i;
 922     }
 923   }
 924 }
 925
 926 template <typename Iter1, typename Iter2>
 927 static
 928 bool sequence_insensitive_equality(Iter1 abegin, Iter1 aend, Iter2 bbegin, Iter2 bend)
 929 {
 930   Iter1 aseq_i = abegin;
 931   Iter2 bseq_i = bbegin;
 932   if (aend-abegin == bend-bbegin) {
 933     // since the length of the two sequences is equal, we only need to
 934     // test one.
 935     for(; aseq_i != aend; ++aseq_i, ++bseq_i) {
 936       if (toupper(*aseq_i) != toupper(*bseq_i)) {
 937         return false;
 938       }
 939     }
 940     return true;
 941   } else {
 942     return false;
 943   }
 944 }
 945
 946 bool operator==(const Sequence& x, const Sequence& y)
 947 {
 948   if (x.seq and y.seq) {
 949     // both x and y are defined
 950     if (SeqSpan::isFamily(x.seq, y.seq)) {
 951       // both are part of the same SeqSpan tree
 952       return *(x.seq) == *(y.seq);
 953     } else {
 954       // we'll have to do a real comparison
 955       return sequence_insensitive_equality<SeqSpan::const_iterator, SeqSpan::const_iterator>(
 956                x.begin(), x.end(),
 957                y.begin(), y.end()
 958              );
 959     }
 960   } else {
 961     // true if they're both empty (with either a null SeqSpanRef or
 962     // a zero length string
 963     return (x.size() == y.size());
 964   }
 965 }
 966
 967 bool operator!=(const Sequence& x, const Sequence& y)
 968 {
 969   return not operator==(x, y);
 970 }
 971