[project @ 4]
[mussa.git] / mussa_nway.cc
index ae9b69a975a70927ba8ed2aab369e13c502c21ec..fd760b8364ff66c878b2bc91f77a74f41b8f183d 100644 (file)
@@ -16,9 +16,11 @@ Nway_Paths::Nway_Paths()
 }
 
 void
-Nway_Paths::setup(int sp_num)
+Nway_Paths::setup(int sp_num, int w, int t)
 {
   species_num = sp_num;
+  threshold = t;
+  win_size = w;
   pathz.clear();
 }
 
@@ -53,10 +55,13 @@ Nway_Paths::path_search(vector<vector<FLPs> > all_comparisons, vector<int> path,
 
     if (trans_check_good)
     {
+      // this makes sure path nodes are recorded with RC status relative to
+      // the base species
       if ( path[depth-1] >= 0)
         path.push_back(*new_nodes_i);
       else
         path.push_back(*new_nodes_i * -1);
+
       if (depth < species_num - 1)
         path_search(all_comparisons, path, depth + 1);
       else
@@ -106,6 +111,56 @@ Nway_Paths::find_paths_r(vector<vector<FLPs> > all_comparisons)
   }
 }
 
+// dumbly goes thru and combines path windows exactly adjacent (ie + 1 index)
+// doesn't deal with interleaved adjacency
+void
+Nway_Paths::simple_refine()
+{
+  // ext_path remembers the first window set in an extending path
+  vector<int> ext_path, cur_path, next_path, new_path;
+  list<vector<int> >::iterator pathz_i;
+  int i2, win_ext_len = 0;
+  bool extending;
+
+
+  refined_pathz.clear();
+
+  pathz_i = pathz.begin();
+  ext_path = *pathz_i; 
+  while(pathz_i != pathz.end())
+  {
+    // keep track of current path and advance to next path
+    cur_path = *pathz_i;
+    ++pathz_i;
+    next_path = *pathz_i;
+    // if node for each seq is equal to the next node+1 then for all
+    // sequences then we are extending
+    extending = true;
+    for(i2 = 0; i2 < species_num; i2++)
+    {
+      //cout << cur_path[i2] << " vs " << next_path[i2] << endl; 
+      if (cur_path[i2] + 1 != next_path[i2])
+        extending = false;
+    }
+    if (extending)
+    {
+      //cout << "Raaaawwwrrr!  I am extending\n";
+      win_ext_len++;
+    }
+    else
+    {
+      // add the extend window length as first element and add as refined
+      new_path = ext_path;
+      new_path.insert(new_path.begin(),win_size + win_ext_len);
+      refined_pathz.push_back(new_path);
+      // reset stuff
+      win_ext_len = 0;
+      ext_path = next_path;
+    }
+  }
+}
+
 
 void
 Nway_Paths::add_path(vector<int> loaded_path)
@@ -124,27 +179,122 @@ Nway_Paths::save(string save_file_path)
 
   save_file.open(save_file_path.c_str(), ios::out);
 
-  //save_file << "<Seqcomp type=" << ana_type << " win=" << window_size;
-  //save_file << " thres=" << hard_threshold << ">\n";
-
-  path_i = pathz.begin();
-  paths_end = pathz.end();
-  if (path_i == paths_end)
-    cout << "Arrrrrrgggghhhhhh\n";
-  while(path_i != paths_end)
+  save_file << "<Mussa type=flp seq_num=" << species_num;
+  save_file << " win=" << win_size;
+  // add a function para new_thres defaults to -1 to later deal with
+  // reanalysis with higher thres - if statement whether to record base thres
+  // or new thres (ie if -1, then base)
+  save_file << " thres=" << threshold << " >\n";
+
+  path_i = refined_pathz.begin();
+  paths_end = refined_pathz.end();
+  //path_i = pathz.begin();
+  //paths_end = pathz.end();
+  while (path_i != paths_end)
   {
     a_path = *path_i;
     //cout << a_path.size() << endl;
-    for(i = 0; i < species_num; ++i)
-      save_file << i << "," << a_path[i] << " ";
+    //first entry is the window length of the windows in the path
+    save_file << a_path[0] << ":";
+    for(i = 1; i <= species_num; ++i)
+    {
+      save_file << a_path[i];
+      if (i != species_num)
+        save_file << ",";
+    }
     save_file << endl;
     ++path_i;
   }
 
-//save_file << "</Mussa>\n";
+  save_file << "</Mussa>\n";
   save_file.close();
 }
 
+/*
+  if (path_i == paths_end)
+    cout << "Arrrrrrgggghhhhhh\n";
+*/
+
+int
+Nway_Paths::load(string load_file_path)
+{
+  fstream load_file;
+  string file_data_line, header_data, data, path_node, path_length;
+  int i, space_split_i, equal_split_i, comma_split_i, colon_split_i;
+  vector<int> loaded_path;
+
+
+  load_file.open(load_file_path.c_str(), ios::in);
+
+  // get header data
+  // grab mussa tag - discard for now...maybe check in future...
+  getline(load_file,file_data_line);
+  space_split_i = file_data_line.find(" ");
+  file_data_line = file_data_line.substr(space_split_i+1);
+  // grab type tag - need in future to distinguish between flp and vlp paths
+  space_split_i = file_data_line.find(" ");
+  file_data_line = file_data_line.substr(space_split_i+1);
+  // get species/seq number
+  space_split_i = file_data_line.find(" ");
+  header_data = file_data_line.substr(0,space_split_i); 
+  equal_split_i = header_data.find("=");
+  data = file_data_line.substr(equal_split_i+1); 
+  species_num = atoi (data.c_str());
+  file_data_line = file_data_line.substr(space_split_i+1);
+  // get window size
+  space_split_i = file_data_line.find(" ");
+  header_data = file_data_line.substr(0,space_split_i); 
+  equal_split_i = header_data.find("=");
+  data = file_data_line.substr(equal_split_i+1); 
+  win_size = atoi (data.c_str());
+  file_data_line = file_data_line.substr(space_split_i+1);
+  // get window size
+  space_split_i = file_data_line.find(" ");
+  header_data = file_data_line.substr(0,space_split_i); 
+  equal_split_i = header_data.find("=");
+  data = file_data_line.substr(equal_split_i+1); 
+  threshold = atoi (data.c_str());
+  file_data_line = file_data_line.substr(space_split_i+1);
+
+  cout << "seq_num=" << species_num << " win=" << win_size;
+  cout << " thres=" << threshold << endl;
+
+  // clear out the current data
+  refined_pathz.clear();
+
+  int temp;
+
+  getline(load_file,file_data_line);
+  while ( (!load_file.eof()) && (file_data_line != "</Mussa>") )
+  {
+    if (file_data_line != "")
+    {
+      loaded_path.clear();
+      colon_split_i = file_data_line.find(":");
+      path_length = file_data_line.substr(0,colon_split_i); 
+      loaded_path.push_back(atoi (path_length.c_str()));
+      file_data_line = file_data_line.substr(colon_split_i+1);
+      for(i = 0; i < species_num; i++)
+      {
+        comma_split_i = file_data_line.find(",");
+        path_node = file_data_line.substr(0, comma_split_i); 
+        temp = atoi (path_node.c_str());
+        loaded_path.push_back(temp);
+        file_data_line = file_data_line.substr(comma_split_i+1);
+      }
+      refined_pathz.push_back(loaded_path);
+    }
+    getline(load_file,file_data_line);
+  }
+
+
+  load_file.close();
+
+  return species_num;
+}
+
+
+
 void
 Nway_Paths::save_old(string save_file_path)
 {