refactor glsequence to be more testable

[mussa.git] / alg / sequence.cxx
diff --git a/alg/sequence.cxx b/alg/sequence.cxx

index 1b18d2adef3b7e1c9944fac3c5a3179983826bb6..8afe8f044a06fbdff68b1b700232b271256e51a6 100644 (file)
--- a/alg/sequence.cxx
+++ b/alg/sequence.cxx
@@ -34,8 +34,26 @@ Sequence::Sequence()
  {
  }
  
-Sequence::Sequence(string seq):sequence(filter_sequence(seq))
+Sequence::Sequence(string seq)  // construct a Sequence from raw sequence text
  {
+  set_filtered_sequence(seq);  // stores the filtered form of seq in `sequence` (replaces the old member-initializer that called filter_sequence)
+}
+
+Sequence &Sequence::operator=(const Sequence& s)  // copy-assignment from another Sequence
+{
+  if (this != &s) {  // self-assignment: nothing to copy
+    sequence = s.sequence;
+    header = s.header;
+    species = s.species;
+    annots = s.annots;  // NOTE(review): only these four members are copied -- confirm the class declares no others
+  }
+  return *this;  // returned by reference so assignments can be chained
+}
+
+Sequence &Sequence::operator=(const std::string& s)  // assign raw sequence text to this Sequence
+{
+  set_filtered_sequence(s);  // overwrites `sequence` with the filtered form of s; this operator assigns no other member
+  return *this;  // NOTE(review): set_filtered_sequence clears `sequence` before reading its argument, so s presumably must not alias this object's own sequence (e.g. a get_seq() result) -- confirm callers
  }
  
  //! load a fasta file into a sequence
@@ -61,7 +79,7 @@ Sequence::load_fasta(string file_path, int seq_num,
    data_file.open(file_path.c_str(), ios::in);
  
    if (!data_file)
-  {
+  {    
      throw mussa_load_error("Sequence File: " + file_path + " not found"); 
    }
    // if file opened okay, read it
@@ -96,20 +114,20 @@ Sequence::load_fasta(string file_path, int seq_num,
        end_index = sequence_raw.size();
  
      // sequence filtering for upcasing agctn and convert non AGCTN to N
-    sequence = filter_sequence(sequence_raw, start_index, end_index-start_index);
+    set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
    }
  }
  
-string Sequence::filter_sequence(const string &old_seq, 
-                                 string::size_type start, 
-                                 string::size_type count) const
+void Sequence::set_filtered_sequence(const string &old_seq, 
+                                     string::size_type start, 
+                                     string::size_type count)
  {
    char conversionTable[257];
-  string new_seq;
  
    if ( count == 0)
      count = old_seq.size() - start;
-  new_seq.reserve(count);
+  sequence.clear();
+  sequence.reserve(count);
  
    // Make a conversion table
  
@@ -135,9 +153,8 @@ string Sequence::filter_sequence(const string &old_seq,
    // finally, the actual conversion loop
    for(string::size_type seq_index = 0; seq_index < count; seq_index++)
    {
-    new_seq += conversionTable[ (int)old_seq[seq_index+start]];
+    sequence += conversionTable[ (int)old_seq[seq_index+start]];
    }
-  return new_seq;
  }
  
    // this doesn't work properly under gcc 3.x ... it can't recognize toupper
@@ -192,8 +209,8 @@ Sequence::load_annot(string file_path, int start_index, int end_index)
          an_annot.end = atoi (annot_value.c_str());
          file_data_line = file_data_line.substr(space_split_i+1);
  
-        cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
-                  << endl;
+        //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+             //     << endl;
  
          // get annot name
          space_split_i = file_data_line.find(" ");
@@ -244,6 +261,11 @@ Sequence::load_annot(string file_path, int start_index, int end_index)
    }
  }
  
+bool Sequence::empty() const  // true when the stored sequence has no characters
+{
+  return (size() == 0);  // size() reports sequence.size()
+}
+
  const std::list<annot> Sequence::annotations() const
  {
    return annots;
@@ -259,6 +281,26 @@ string::size_type Sequence::size() const
    return sequence.size();
  }
  
+Sequence::iterator Sequence::begin()  // mutable iterator to the first character of the stored sequence
+{
+  return sequence.begin();  // forwards to the underlying `sequence` member
+}
+
+Sequence::const_iterator Sequence::begin() const  // read-only iterator to the first character of the stored sequence
+{
+  return sequence.begin();  // forwards to the underlying `sequence` member
+}
+
+Sequence::iterator Sequence::end()  // mutable past-the-end iterator of the stored sequence
+{
+  return sequence.end();  // forwards to the underlying `sequence` member
+}
+
+Sequence::const_iterator Sequence::end() const  // read-only past-the-end iterator of the stored sequence
+{
+  return sequence.end();  // forwards to the underlying `sequence` member
+}
+
  
  const string&
  Sequence::get_seq() const
@@ -338,7 +380,7 @@ Sequence::sp_name() const
  void
  Sequence::set_seq(const string& a_seq)  // replace the stored sequence with the filtered form of a_seq
  {
-  sequence = filter_sequence(a_seq);
+  set_filtered_sequence(a_seq);  // NOTE(review): the helper clears `sequence` before reading its argument, so a_seq presumably must not alias this object's own sequence -- confirm callers
  }
  
  
@@ -431,35 +473,35 @@ Sequence::load_museq(string load_file_path, int seq_num)
          annot_value = file_data_line.substr(0,space_split_i);
          an_annot.end = atoi (annot_value.c_str());
  
-       if (space_split_i == string::npos)  // no entry for type or name
-       {
-         cout << "seq, annots - no type or name\n";
-         an_annot.type = "";
-         an_annot.name = "";
-       }
-       else   // else get annot type
-       {
-         file_data_line = file_data_line.substr(space_split_i+1);
-         space_split_i = file_data_line.find(" ");
-         annot_value = file_data_line.substr(0,space_split_i);
-         an_annot.type = annot_value;
-         if (space_split_i == string::npos)  // no entry for name
-         {
-           cout << "seq, annots - no name\n";
-           an_annot.name = "";
-         }
-         else          // get annot name
-         {
-           file_data_line = file_data_line.substr(space_split_i+1);
-           space_split_i = file_data_line.find(" ");
-           annot_value = file_data_line.substr(0,space_split_i);
-           an_annot.type = annot_value;
-         }
-       }
-       annots.push_back(an_annot);  // don't forget to actually add the annot
+        if (space_split_i == string::npos)  // no entry for type or name
+        {
+          cout << "seq, annots - no type or name\n";
+          an_annot.type = "";
+          an_annot.name = "";
+        }
+        else   // else get annot type
+        {
+          file_data_line = file_data_line.substr(space_split_i+1);
+          space_split_i = file_data_line.find(" ");
+          annot_value = file_data_line.substr(0,space_split_i);
+          an_annot.type = annot_value;
+          if (space_split_i == string::npos)  // no entry for name
+          {
+            cout << "seq, annots - no name\n";
+            an_annot.name = "";
+          }
+          else          // get annot name
+          {
+            file_data_line = file_data_line.substr(space_split_i+1);
+            space_split_i = file_data_line.find(" ");
+            annot_value = file_data_line.substr(0,space_split_i);
+            an_annot.type = annot_value;
+          }
+        }
+        annots.push_back(an_annot);  // don't forget to actually add the annot
        }
-      cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
-          << "-->" << an_annot.type << "::" << an_annot.name << endl;
+      //cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+      //     << "-->" << an_annot.type << "::" << an_annot.name << endl;
      }
    }
    load_file.close();
@@ -508,8 +550,8 @@ Sequence::rc_motif(string a_motif)
      rev_comp += conversionTable[table_i];
    }
  
-  cout << "seq: " << a_motif << endl;
-  cout << "rc:  " << rev_comp << endl;
+  //cout << "seq: " << a_motif << endl;
+  //cout << "rc:  " << rev_comp << endl;
  
    return rev_comp;
  }
@@ -559,7 +601,7 @@ Sequence::motif_validate(string a_motif)
        valid_motif += 'B';
     }
  
-  cout << "valid_motif is: " << valid_motif << endl;
+  //cout << "valid_motif is: " << valid_motif << endl;
   
    return valid_motif;
  }
@@ -574,14 +616,14 @@ Sequence::find_motif(string a_motif)
  
    motif_match_starts.clear();
  
-  cout << "motif is: " << a_motif << endl;
+  //cout << "motif is: " << a_motif << endl;
    a_motif = motif_validate(a_motif);
    //cout << "motif is: " << a_motif << endl;
  
  
    if (a_motif != "")
    {
-    cout << "Sequence: none blank motif\n";
+    //cout << "Sequence: none blank motif\n";
      motif_scan(a_motif, &motif_match_starts);
  
      a_motif_rc = rc_motif(a_motif);
@@ -615,8 +657,7 @@ Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
    seq_i = 0;
    while (seq_i < sequence.length())
    {
-    cout << seq_c[seq_i];
-    //if ((seq_i > 10885) && (seq_i < 10917))
+    //cout << seq_c[seq_i];
      //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
      // this is pretty much a straight translation of Nora's python code
      // to match iupac letter codes
@@ -678,14 +719,3 @@ Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
    cout << endl;
  }
  
-/*
-        // get annot start index
-        space_split_i = file_data_line.find(" ");
-        annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.name = annot_value;
-        file_data_line = file_data_line.substr(space_split_i+1);
-        // get annot start index
-        space_split_i = file_data_line.find(" ");
-        annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.type = annot_value;
-*/