refactor nway_other.cxx
authorDiane Trout <diane@caltech.edu>
Fri, 24 Feb 2006 07:17:16 +0000 (07:17 +0000)
committerDiane Trout <diane@caltech.edu>
Fri, 24 Feb 2006 07:17:16 +0000 (07:17 +0000)
In order to understand how the transitive and radial nway comparisons
worked I wanted to make the algorithms fit in a single window. I did
this by taking various types of initialization loops that were duplicated
between both algortihms and refactored them into seperate functions.
(mostly these common loops handled building paths that were between
our various iterators on each species lists of FLPs)

I thought it might be nice to turn the various functions I ripped out
into a class that manages the march of the iterators across the lists
of FLPs.

I also added some fairly basic unit tests for the nway (transitive) comparison.

alg/flp.cxx
alg/flp.hh
alg/nway_other.cxx
alg/nway_paths.cxx
alg/nway_paths.hh
test/test_flp.cxx
test/test_nway.cxx

index 536736151aef3b4730ac9a275a5580c35e4dd633..88a0d824a6aafa6be0c326df6c320b427aaf9049 100644 (file)
@@ -119,7 +119,7 @@ FLPs::matches(int index)
 }
 
 list<int>
-FLPs::thres_matches(int index, int thres)
+FLPs::thres_matches(int index, int thres) const
 {
   if (all_matches == 0) 
     throw runtime_error("please call FLPs.seqcomp first");
index 9861fbb597077540409f1d8ce5cb8d783f684e80..22b50360a09c07f568b593f15cf411f33be08835 100644 (file)
@@ -66,7 +66,7 @@ class FLPs
     //! Return all the matches for a particular window?
     std::list<int> matches(int index);
     //! Return all the matches for a particular window abouve threshold
-    std::list<int> thres_matches(int index, int thres);
+    std::list<int> thres_matches(int index, int thres) const;
     //! Return the number of windows stored in this FLPs
     /*! (this is should be the same as seq1_len-window_size+1
      */
index b15269c3fdc7941517cec937c5eaabffb18f3a1e..62f78581d1ca092fd1205718f92e35e6c5145743 100644 (file)
 
 using namespace std;
 
+//! dump the matches for a particular window
+void dump_matches_win(int win_i, int species_num, vector<list<int> > all_matches)
+{    
+  cout << "<win_debug win=" << win_i << ">" << endl;
+  for (int sp_i = 0; sp_i < species_num-1; sp_i++)
+  {
+    cout << "sp " << sp_i << ": ";
+    for(list<int>::iterator debug_i = all_matches[sp_i].begin();
+        debug_i != all_matches[sp_i].end();
+        ++debug_i)
+    {
+      cout << *debug_i << " ";
+    }
+    cout << endl;
+  }
+  cout << "</win_debug>"<< endl;
+}
+
+/*! (re-)initialize all_matches with the list of matches between
+ *  species (sequence) 0 and all the other sequences
+ *  \return true if there were matches against all the species, false otherwise
+ */
+bool
+make_all_seq_win_matches(const vector<vector<FLPs> >& all_comparisons,
+                         vector<list<int> >& all_matches,
+                         int window_index, int soft_threshold)
+{
+  list<int> new_nodes; 
+  bool some_matches=true;
+  all_matches.clear();
+  for(int sp_i=1; (sp_i < all_comparisons.size()) && (some_matches); sp_i++ )
+  {
+    new_nodes.clear();
+    // --thres
+    //new_nodes = all_comparisons[0][sp_i].matches(win_i);
+    new_nodes = all_comparisons[0][sp_i].thres_matches(window_index, 
+                                                       soft_threshold);
+    if (new_nodes.empty())
+      some_matches = false;
+    else
+      all_matches.push_back(new_nodes);
+  }
+  // we only record matches from seq 0 to another seq, so we should have seq-1
+  // matches if there were enough matches.
+  // if there weren't enough we should have even fewer matches
+  // (this complicated boolean hides all the logic in one assert call,
+  //  hopefully allowing the assert to be optimized away)
+  assert(( some_matches && (all_matches.size() == (all_comparisons.size()-1)))
+       ||(!some_matches && (all_matches.size() <  (all_comparisons.size()-1))));
+  return some_matches;
+}
+
+/*! reset all the speces begin and end iterators to the start and end
+ *  of the match pairs stored in all_matches
+ */
+void reset_species_iterators(vector<list<int> >& all_matches,
+                             vector<list<int>::iterator>& sp_itor_begin,
+                             vector<list<int>::iterator>& sp_itor_end)
+{
+  sp_itor_begin.clear();
+  sp_itor_end.clear();
+  // set each species list of matches to beginning
+  for (int sp_i = 0; sp_i < all_matches.size(); sp_i++)
+  {
+    sp_itor_begin.push_back(all_matches[sp_i].begin());
+    sp_itor_end.push_back(all_matches[sp_i].end());
+  }
+}
+
+/*! Set the path vector to be the list of window positions from
+ *  the current locations of our species beginning iterators
+ */
+void set_path_to_cur_sp_itor_track(
+    vector<int>& path, 
+    int base_window_index,
+    const vector<list<int>::iterator>& sp_itor_begin)
+{  
+  // add path that each species iterator is pointing to
+  path.clear();
+  path.push_back(base_window_index);
+
+  for (int sp_i = 0; sp_i < sp_itor_begin.size(); sp_i++)
+  {
+    path.push_back(*(sp_itor_begin[sp_i]));
+  }
+} 
+
+/*! advance the species iterator track
+ *  if we were able to advance the track return true
+ *  if we weren't able to advance, we've looked at every track so return false
+ */
+bool advance_sp_itor_track(vector<list<int>::iterator>& sp_itor_begin,
+                           const vector<list<int>::iterator>& sp_itor_end,
+                           vector<list<int> >& all_matches)
+{
+  bool not_advanced = true;
+  int sp_depth = all_matches.size() - 1;  // this roves up & down species list
+  while ( (not_advanced) && (sp_depth != -1) )
+  {
+    //cout << "foe\n";
+    //cout << sp_depth << ".." << *(sp_itor_begin[sp_depth]) << endl;
+    (sp_itor_begin[sp_depth])++;   // advance iter at current depth
+    //cout << sp_depth << ".." << *(sp_itor_begin[sp_depth]) << endl;
+    
+    // if we reached the end of the list, reset iter & go up one depth
+    if (sp_itor_begin[sp_depth] == sp_itor_end[sp_depth])
+    {
+      //cout << "fum\n";
+      sp_itor_begin[sp_depth] = all_matches[sp_depth].begin();
+      sp_depth--;
+      //cout << "depth = " << sp_depth << endl;
+    }
+    else
+      not_advanced = false;
+  }
+}
+
 void
 Nway_Paths::radiate_path_search(vector<vector<FLPs> > all_comparisons)
 {
   vector<int> path;
-  int win_i, sp_i, sp_depth, window_num;
-  bool some_matches, still_paths, not_advanced;
-  list<int> new_nodes;
+  int win_i, sp_i, window_num;
+  bool still_paths, not_advanced;
   vector<list<int> > all_matches;
-  vector<list<int>::iterator> sp_nodes, sp_nodes_end;
-  list<int>::iterator debug_i;
+  vector<list<int>::iterator> sp_itor_begin(all_comparisons.size());
+  vector<list<int>::iterator> sp_itor_end(all_comparisons.size());
 
   pathz.clear();
   window_num = all_comparisons[0][1].size();
   cout << window_num << endl;    // just a sanity check
 
-  sp_nodes.reserve(species_num);
-  sp_nodes_end.reserve(species_num);
-
   // loop thru all windows in first species
   for (win_i = 0; win_i < window_num; win_i++)
   {
-    // first we see if the first seq has matches to all other seqs at this win
-    some_matches = true;
-    sp_i = 1;
-    all_matches.clear();
-    while ( (sp_i < species_num) && (some_matches) )
+    // if 1st seq does have match to all the others, then make all possible 
+    // paths out of all these matches (in all_matches)
+    if(make_all_seq_win_matches(all_comparisons, all_matches, win_i, soft_thres))
     {
-      new_nodes.clear();
-      new_nodes = all_comparisons[0][sp_i].matches(win_i);
-      if (new_nodes.empty())
-       some_matches = false;
-
-      all_matches.push_back(new_nodes);
-      sp_i++;
-    }
-    //cout << "fee\n";
-    /*
-    if (some_matches)
-    {
-      cout << win_i << " plrf" << endl;
-      for (sp_i = 0; sp_i < species_num-1; sp_i++)
-      {
-       debug_i = all_matches[sp_i].begin();
-       while (debug_i != all_matches[sp_i].end())
-       {
-         cout << *debug_i << " ";
-         debug_i++;
-       }
-       cout << "plrfff"<< endl;
-      }
-    }
-    */
-    // if 1st seq does match to all others, make all possible paths
-    // out of all these matches
-    if (some_matches)
-    {
-      sp_nodes.clear();
-      sp_nodes_end.clear();
-      // set each species list of matches to beginning
-      for (sp_i = 0; sp_i < species_num-1; sp_i++)
-      {
-       sp_nodes.push_back(all_matches[sp_i].begin());
-       sp_nodes_end.push_back(all_matches[sp_i].end());
-      }
+      //debug dump_matches_win(win_i, species_num, all_matches);
+      reset_species_iterators(all_matches, sp_itor_begin, sp_itor_end);
 
       still_paths = true;
-      sp_depth = species_num - 2;       // this roves up & down species list
       //cout << "fie\n";
       while (still_paths) 
       {
-       // add path that each species iterator is pointing to
-       path.clear();
-       path.push_back(win_i);
-       
-       //cout << win_i;
-       for (sp_i = 0; sp_i < species_num-1; sp_i++)
-       {
-         //cout  << ", " << *(sp_nodes[sp_i]);
-         path.push_back(*(sp_nodes[sp_i]));
-       }
-       //cout << endl;
+        // add path that each species iterator is pointing to
+        set_path_to_cur_sp_itor_track(path, win_i, sp_itor_begin);
        
-       pathz.push_back(path);
-
-       // now advance the right iterator
-       not_advanced = true;
-       while ( (not_advanced) && (sp_depth != -1) )
-       {
-         //cout << "foe\n";
-          //cout << sp_depth << ".." << *(sp_nodes[sp_depth]) << endl;
-          (sp_nodes[sp_depth])++;   // advance iter at current depth
-          //cout << sp_depth << ".." << *(sp_nodes[sp_depth]) << endl;
-
-         // if we reached the end of the list, reset iter & go up one depth
-         if (sp_nodes[sp_depth] == sp_nodes_end[sp_depth])
-         {
-           //cout << "fum\n";
-           sp_nodes[sp_depth] = all_matches[sp_depth].begin();
-           sp_depth--;
-           //cout << "depth = " << sp_depth << endl;
-         }
-         else
-           not_advanced = false;
-       }
-
-       if (sp_depth == -1)   // jumped up to first species, all paths searched
-         still_paths = false;
-       else                 // otherwise just reset to max depth and continue
-         sp_depth = species_num - 2;
+        pathz.push_back(path);
+
+        // now advance the right iterator
+        still_paths = advance_sp_itor_track(sp_itor_begin, 
+                                            sp_itor_end, 
+                                            all_matches);
       }
     }
   }
 }
 
+bool is_transitive_path(const vector<int>& path, 
+                        const vector<vector<FLPs> >& all_comparisons,
+                        const int soft_threshold)
+{
+  int cur_node;
+  list<int> trans_check_nodes;
 
-
+  for (int sp_depth=1; sp_depth != path.size()-1; ++sp_depth)
+  {
+    cur_node = path[sp_depth];
+    for(int i = sp_depth+1; i != path.size()-1; i++)
+    {
+      // --thres
+      //trans_check_nodes = all_comparisons[sp_depth][i].matches(cur_node);
+      trans_check_nodes =
+        all_comparisons[sp_depth][i].thres_matches(cur_node, soft_threshold);
+      
+      if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
+                                            trans_check_nodes.end(),
+                                            path[i]) ) 
+         &&(trans_check_nodes.end() == find(trans_check_nodes.begin(),
+                                            trans_check_nodes.end(),
+                                            path[i] * -1) ) )
+        return false;
+    }
+  }
+  return true;
+}
 
 void
 Nway_Paths::trans_path_search(vector<vector<FLPs> > all_comparisons)
 {
+  assert (species_num == all_comparisons.size());
   vector<int> path;
-  int win_i, sp_i, sp_depth, window_num, i, cur_node;
-  bool some_matches, still_paths, not_advanced, trans_good;
-  list<int> new_nodes, trans_check_nodes;
+  int win_i, sp_i, sp_depth, window_num;
+  bool still_paths, not_advanced;
   vector<list<int> > all_matches;
-  vector<list<int>::iterator> sp_nodes, sp_nodes_end;
-  list<int>::iterator debug_i;
+  vector<list<int>::iterator> sp_itor_begin(all_comparisons.size()); 
+  vector<list<int>::iterator> sp_itor_end(all_comparisons.size()); 
 
   cout << "trans: softhres = " << soft_thres;
   cout << ", window = " << win_size << ", ";
@@ -148,129 +219,30 @@ Nway_Paths::trans_path_search(vector<vector<FLPs> > all_comparisons)
   window_num = all_comparisons[0][1].size();
   cout << "window number = " << window_num << endl;   // just a sanity check
   cout << "trans: species_num = " << species_num << endl;
-  sp_nodes.reserve(species_num);
-  cout << "trans: fie\n";
-  sp_nodes_end.reserve(species_num);
-  cout << "trans: foe\n";
   // loop thru all windows in first species
   for (win_i = 0; win_i < window_num; win_i++)
   {
-    // first we see if the first seq has matches to all other seqs at this win
-    some_matches = true;
-    sp_i = 1;
-    all_matches.clear();
-    while ( (sp_i < species_num) && (some_matches) )
-    {
-      new_nodes.clear();
-      // --thres
-      //new_nodes = all_comparisons[0][sp_i].matches(win_i);
-      new_nodes = all_comparisons[0][sp_i].thres_matches(win_i, soft_thres);
-      if (new_nodes.empty())
-       some_matches = false;
-
-      all_matches.push_back(new_nodes);
-      sp_i++;
-    }
-    //cout << "fee\n";
-    /*
-    if (some_matches)
-    {
-      cout << win_i << " plrf" << endl;
-      for (sp_i = 0; sp_i < species_num-1; sp_i++)
-      {
-       debug_i = all_matches[sp_i].begin();
-       while (debug_i != all_matches[sp_i].end())
-       {
-         cout << *debug_i << " ";
-         debug_i++;
-       }
-       cout << "plrfff"<< endl;
-      }
-    }
-    */
-    // if 1st seq does match to all others, make all possible paths
-    // out of all these matches
-    if (some_matches)
+    // if 1st seq has a match to all the others for this window,
+    // then make all possible paths out of all these matches (in all_matches)
+    if(make_all_seq_win_matches(all_comparisons, all_matches, win_i, soft_thres))
     {
-      sp_nodes.clear();
-      sp_nodes_end.clear();
-      // set each species list of matches to beginning
-      for (sp_i = 0; sp_i < species_num-1; sp_i++)
-      {
-       sp_nodes.push_back(all_matches[sp_i].begin());
-       sp_nodes_end.push_back(all_matches[sp_i].end());
-      }
-
+      //debug? //dump_matches_win(win_i, species_num, all_matches);
+      reset_species_iterators(all_matches, sp_itor_begin, sp_itor_end);
       still_paths = true;
-      //cout << "fie\n";
       while (still_paths) 
       {
-       // add path that each species iterator is pointing to
-       path.clear();
-       path.push_back(win_i);
-       
-       //cout << win_i;
-       for (sp_i = 0; sp_i < species_num-1; sp_i++)
-       {
-         //cout  << ", " << *(sp_nodes[sp_i]);
-         path.push_back(*(sp_nodes[sp_i]));
-       }
-       //cout << endl;
-       // check transitivity
-       sp_depth = 1;
-       trans_good = true;
-       while ( (sp_depth != species_num-1) && (trans_good) )
-       {
-         cur_node = path[sp_depth];
-         for(i = sp_depth+1; i < species_num; i++)
-         {
-           // --thres
-           //trans_check_nodes = all_comparisons[sp_depth][i].matches(cur_node);
-           trans_check_nodes =
-             all_comparisons[sp_depth][i].thres_matches(cur_node, soft_thres);
-
-           if ( (trans_check_nodes.end() == find(trans_check_nodes.begin(),
-                                                 trans_check_nodes.end(),
-                                                 path[i]) ) &&
-                (trans_check_nodes.end() == find(trans_check_nodes.begin(),
-                                                 trans_check_nodes.end(),
-                                                 path[i] * -1) ) )
-             trans_good = false;
-         }
-         
-         sp_depth++;
-       }
-
-       if (trans_good)
-         pathz.push_back(path);
-
-       // now advance the right iterator
-       not_advanced = true;
-       sp_depth = species_num - 2;       // this roves up & down species list
-       while ( (not_advanced) && (sp_depth != -1) )
-       {
-         //cout << "foe\n";
-          //cout << sp_depth << ".." << *(sp_nodes[sp_depth]) << endl;
-          (sp_nodes[sp_depth])++;   // advance iter at current depth
-          //cout << sp_depth << ".." << *(sp_nodes[sp_depth]) << endl;
-
-         // if we reached the end of the list, reset iter & go up one depth
-         if (sp_nodes[sp_depth] == sp_nodes_end[sp_depth])
-         {
-           //cout << "fum\n";
-           sp_nodes[sp_depth] = all_matches[sp_depth].begin();
-           sp_depth--;
-           //cout << "depth = " << sp_depth << endl;
-         }
-         else
-           not_advanced = false;
-       }
-
-       if (sp_depth == -1)   // jumped up to first species, all paths searched
-         still_paths = false;
-       else                 // otherwise just reset to max depth and continue
-         sp_depth = species_num - 2;
+        // initialize a path to however far each species iterator has been 
+        // advanced.
+        set_path_to_cur_sp_itor_track(path, win_i, sp_itor_begin);
+
+        // if the path is transitive, save the path
+        if (is_transitive_path(path, all_comparisons, soft_thres))
+          pathz.push_back(path);
+
+        // now advance the right iterator
+        still_paths = advance_sp_itor_track(sp_itor_begin, 
+                                            sp_itor_end, 
+                                            all_matches);
       }
     }
   }
index 9bd1fdc114933fffd689a8125b1aaa263f402904..50e85d9c7622c77c487ac0b9899a3b40529e1d66 100644 (file)
@@ -65,10 +65,6 @@ Nway_Paths::simple_refine()
 
   while(pathz_i != pathz.end())
   {
-    //cout << "path number is: " << pathz.size() << endl;
-    //++count_loop;
-    //cout << "glorpagh!!!!!" << count_loop << endl;
-    
     // keep track of current path and advance to next path
     cur_path = *pathz_i;
     ++pathz_i;
@@ -105,12 +101,12 @@ Nway_Paths::simple_refine()
       new_path.insert(new_path.begin(),win_size + win_ext_len);
       for(i2 = 1; i2 <= species_num; i2++)
       {
-       if (new_path[i2] < 0)
-       {
-         //cout << "before: " << new_path[i2];
-         new_path[i2] +=  win_ext_len;
-         //cout << " after: " << new_path[i2] << endl;
-       }
+        if (new_path[i2] < 0)
+        {
+          //cout << "before: " << new_path[i2];
+          new_path[i2] +=  win_ext_len;
+          //cout << " after: " << new_path[i2] << endl;
+        }
       }
       refined_pathz.push_back(new_path);
       // reset stuff
@@ -444,21 +440,22 @@ Nway_Paths::find_paths(vector<vector<FLPs> > all_comparisons)
 void Nway_Paths::refine()
 {
 }
+*/
+
 
-void Nway_Paths::print()
+void Nway_Paths::print(list<vector<int> >& dump_path)
 {
-  list<list<int> >::iterator pathz_i;
+  list<vector<int> >::iterator pathz_i;
+  vector<int>::iterator path_i;
 
   cout << "printing list of lists\n";
-  for (pathz_i = pathz.begin(); pathz_i != pathz.end(); ++pathz_i)
+  for (pathz_i = dump_path.begin(); pathz_i != dump_path.end(); ++pathz_i)
   {
     for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
       cout << *path_i << " ";
     cout << endl;
   }
 }
-*/
-
 
 
 
index 124522412dcf9629ed5cdb3b34b73038ed3f7b01..5a055ca2cf01e564892dac5e02df9a539deeef00 100644 (file)
@@ -24,7 +24,7 @@ class Nway_Paths
 {
   friend class ConnView;
   friend class SeqView;
-  private:
+  protected:
     int species_num;
     int threshold;
     int win_size;
@@ -33,9 +33,6 @@ class Nway_Paths
     double ent_thres;
     std::vector<char *> c_sequences;
 
-    std::list<std::vector<int> > pathz;
-    std::list<std::vector<int> > refined_pathz;
-
   public:
     Nway_Paths();
     //! setup an nway comparison, initialize # of species, window size, 
@@ -63,6 +60,12 @@ class Nway_Paths
     void refine();
 
     void save_old(std::string save_file_path);
-    void print();
+    void print(std::list<std::vector<int> >&);
+
+    // these probably shouldn't be public, but lets start 
+    // simple
+    std::list<std::vector<int> > pathz;
+    std::list<std::vector<int> > refined_pathz;
+
 };
 #endif
index aa18f76d95047063f89fad78e8000d937652b932..fc03452ac112620de12ebdc71771d2d467056407 100644 (file)
@@ -5,7 +5,8 @@
 #include <boost/filesystem/operations.hpp>
 #include <boost/filesystem/path.hpp>
 
-#include "flp.hh"
+#include "alg/flp.hh"
+#include "alg/sequence.hh"
 #include <iostream>
 #include <list>
 
index aa6295bfc94236926fec0e82f2fbf4254bb44e66..321e40eb211506f72e87ce651fe7ecdd02d08d2b 100644 (file)
@@ -1,6 +1,7 @@
 #include <boost/test/auto_unit_test.hpp>
 
 #include <string>
+#include <iostream>
 
 #include "alg/mussa_class.hh"
 #include "alg/nway_paths.hh"
@@ -8,6 +9,49 @@
 
 using namespace std;
 
+//! there should be no matches
+BOOST_AUTO_TEST_CASE( nway_null )
+{
+  string s0("AAAANNNN");
+  string s1("GGGGNNNN");
+  string s2("TTTTNNNN");
+
+  Mussa analysis;
+  analysis.add_a_seq(s0);
+  analysis.add_a_seq(s1);
+  analysis.add_a_seq(s2);
+  analysis.analyze(4,3);
+  Nway_Paths npath = analysis.get_paths();
+  // there should be no paths for these sequences
+  for (std::list<std::vector<int> >::iterator pathz_i = npath.pathz.begin();
+       pathz_i != npath.pathz.end();
+       ++pathz_i)
+  {
+    BOOST_CHECK_EQUAL( pathz_i->size(), 0);
+  }
+}
+
 BOOST_AUTO_TEST_CASE( nway_test )
 {
+  string s0("ATATGCGC");
+  string s1("GGGGGGGC");
+  Sequence seq1(s1);
+  cout << "seq1 rev = " << seq1.rev_comp() << endl;
+
+  Mussa analysis;
+  analysis.add_a_seq(s0);
+  analysis.add_a_seq(s1);
+  analysis.analyze(4,3);
+  Nway_Paths npath = analysis.get_paths();
+  for (std::list<std::vector<int> >::iterator pathz_i = npath.pathz.begin();
+       pathz_i != npath.pathz.end();
+       ++pathz_i)
+  {
+    for( std::vector<int>::iterator path_i = pathz_i->begin();
+         path_i != pathz_i->end();
+         ++path_i)
+    {      
+      BOOST_CHECK( *path_i == 4 || *path_i == -4);
+    }
+  }
 }