[project @ 2]
authorDiane Trout <diane@caltech.edu>
Fri, 16 Sep 2005 22:41:59 +0000 (22:41 +0000)
committerDiane Trout <diane@caltech.edu>
Fri, 16 Sep 2005 22:41:59 +0000 (22:41 +0000)
Checked in mussa_pI.1

12 files changed:
Makefile [new file with mode: 0644]
flp.cc [new file with mode: 0644]
flp.hh [new file with mode: 0644]
flp_seqcomp.cc [new file with mode: 0644]
mussa.cc [new file with mode: 0644]
mussa_class.cc [new file with mode: 0644]
mussa_class.hh [new file with mode: 0644]
mussa_nway.cc [new file with mode: 0644]
mussa_nway.hh [new file with mode: 0644]
seqcomp.cc [new file with mode: 0644]
sequence.cc [new file with mode: 0644]
sequence.hh [new file with mode: 0644]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..2fcc233
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+#CFLAGS=-ftemplate-depth-20
+CFLAGS=-O -ftemplate-depth-20
+LDFLAGS=
+
+#all: seqcomp
+
+sequence.o : sequence.cc sequence.hh
+       g++ $(CFLAGS) -c sequence.cc
+
+flp.o : flp.cc flp.hh
+       g++ $(CFLAGS) -c flp.cc
+
+flp_seqcomp.o : flp_seqcomp.cc flp.hh
+       g++ $(CFLAGS) -c flp_seqcomp.cc
+
+seqcomp : seqcomp.cc flp.o flp_seqcomp.o sequence.o
+       g++ $(CFLAGS) -o seqcomp seqcomp.cc flp.o flp_seqcomp.o sequence.o
+
+mussa_nway.o : mussa_nway.cc mussa_nway.hh
+       g++ $(CFLAGS) -c mussa_nway.cc
+
+mussa_class.o : mussa_class.cc mussa_class.hh
+       g++ $(CFLAGS) -c mussa_class.cc
+
+mussa : mussa.cc mussa_class.o mussa_nway.o flp_seqcomp.o flp.o sequence.o
+       g++ $(CFLAGS) -o mussa mussa.cc mussa_class.o mussa_nway.o flp_seqcomp.o flp.o sequence.o
+
+
+#match_list_type.o : modules/match_list_type.c modules/match_list_type.h \
+#              modules/seqcomp_defs.h
+#      gcc $(CFLAGS) -c modules/match_list_type.c
+
+#match_list_head_type.o : modules/match_list_head_type.c \
+#              modules/match_list_head_type.h modules/seqcomp_defs.h
+#      gcc $(CFLAGS) -c modules/match_list_head_type.c
+
+#result_type.o : modules/result_type.c modules/result_type.h \
+#               modules/seqcomp_defs.h
+#      gcc $(CFLAGS) -c modules/result_type.c
+
+#compare.o : modules/compare.c modules/compare.h \
+#               modules/seqcomp_defs.h
+#      gcc $(CFLAGS) -c modules/compare.c
+
+#seqcomp : seqcomp.c modules/seqcomp_defs.h sequence_type.o match_list_type.o \
+#               match_list_head_type.o result_type.o compare.o
+#      gcc $(LDFLAGS) -o seqcomp seqcomp.c sequence_type.o match_list_type.o \
+#              match_list_head_type.o result_type.o compare.o
diff --git a/flp.cc b/flp.cc
new file mode 100644 (file)
index 0000000..300dd8a
--- /dev/null
+++ b/flp.cc
@@ -0,0 +1,165 @@
+//                        ----------------------------------------
+//                            ---------- flp.cc  -----------
+//                        ----------------------------------------
+
+#include "flp.hh"
+
+
+FLPs::FLPs()
+{
+}
+
+void
+FLPs::setup(string type, int win_size, int hard_thres, int len1, int len2)
+{
+  list<match> empty_match_list;
+  int i;
+
+  window_size = win_size;
+  hard_threshold = hard_thres;
+  seq1_length = len1;
+  seq2_length = len2;
+  ana_type = type;
+  seq1_win_num = seq1_length - win_size + 1;
+  seq2_win_num = seq2_length - win_size + 1;
+  all_matches.reserve(seq1_win_num);
+  
+  empty_match_list.clear();
+  for(i = 0; i < seq1_win_num; i++)
+    all_matches.push_back(empty_match_list);
+}
+
+int
+FLPs::win_num()
+{
+  return seq1_win_num;
+}
+
+/*
+bool
+FLPs::match_less(match *match1, match *match2)
+{
+  if (match1->score < match2->score)
+    return true;
+  else if ( (match1->score == match2->score) &&
+            (match1->index < match2->index) )
+    return true;
+  else
+    return false;
+}
+
+void
+FLPs::sort()
+{
+  int i;
+
+  for(i = 0; i < seq1_win_num; i++)
+    if (!all_matches[i].empty())
+      all_matches[i].sort(&FLPs::match_less);
+}
+*/
+
+list<int>
+FLPs::matches(int index)
+{
+  list<int> these_matches;
+  list<match>::iterator list_i, list_end;
+
+  index = abs(index);
+  list_i = all_matches[index].begin();
+  list_end = all_matches[index].end();
+  //if (list_i == list_end)
+  //cout << "its fuckin empty!!!!";
+  while (list_i != list_end)
+  {
+    these_matches.push_back(list_i->index);
+    //cout << list_i->index << " ";
+    ++list_i;
+  }
+  //cout << endl;
+
+  return these_matches;
+}
+
+
+void
+FLPs::file_save(string save_file_path)
+{
+  fstream save_file;
+  list<match>::iterator match_i, match_list_end;
+  int i;
+
+  save_file.open(save_file_path.c_str(), ios::out);
+
+  save_file << "<Seqcomp type=" << ana_type << " win=" << window_size;
+  save_file << " thres=" << hard_threshold << ">\n";
+
+  for(i = 0; i < seq1_win_num; i++)
+  {
+    if (!all_matches[i].empty())
+    {
+      match_i = all_matches[i].begin();
+      match_list_end = all_matches[i].end();
+      while (match_i != match_list_end)
+      {
+        save_file << match_i->index << "," << match_i->score << " ";
+        ++match_i;
+      }
+    }
+    save_file << endl;
+  }
+
+  save_file << "</Seqcomp>\n";
+
+  save_file.close();
+}
+
+void
+FLPs::file_load(string file_path)
+{
+  fstream data_file;
+  string file_data, file_data_line, index_data, score_data;
+  int type, window, hard_thres, index, score;
+  bool tag_open = false;
+
+
+  data_file.open(file_path.c_str(), ios::in);
+
+  getline(data_file,file_data_line);
+  // parse seqcomp open tag and parameters
+  // eg <Seqcomp type=mussa win=30 thres=21>
+  // if parse successful...
+  tag_open = true;
+
+  while ((!data_file.eof()) && tag_open)
+  {
+    getline(data_file,file_data_line);
+    /*
+    if ( = "</Seqcomp>")
+      tag_open = false;
+    // parse line of matches
+    else
+    {
+      needs stuff...um, code...
+      {
+      index = atoi(index_data);
+      core = atoi(score_data);
+      cout << index << "," << score << " ";
+      data_file >> index_data;
+      {
+    }
+    cout << "\n";
+    */
+  }
+
+  data_file.close();
+}
+
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe  ";
+      cout << "fum\n";
+*/
+
+
diff --git a/flp.hh b/flp.hh
new file mode 100644 (file)
index 0000000..dfbc7ef
--- /dev/null
+++ b/flp.hh
@@ -0,0 +1,39 @@
+//                        ----------------------------------------
+//                            ---------- flp.hh  -----------
+//                        ----------------------------------------
+
+#include "sequence.hh"
+
+
+// FLP = Fixed Length Pairs (Data)
+// vector of linked lists of the match type struct
+class FLPs
+{
+  private:
+    int window_size;
+    int hard_threshold;
+    int seq1_length, seq2_length, seq1_win_num, seq2_win_num;
+    string ana_type;
+
+
+    struct match
+    {
+      int index;
+      int score;
+    };
+
+    vector<list<match> > all_matches;
+
+
+  public:
+    FLPs();
+    void setup(string type, int win_size, int hard_thres, int len1, int len2);
+    inline void add(int seq1_i, int seq2_i, int a_score, int i2_offset);
+    void seqcomp(string seq1, string seq2, bool is_RC);
+  //bool FLPs::match_less(match *match1, match *match2);
+  //void FLPs::sort();
+    list<int> matches(int index);  //later maybe version with int threshold 
+    int win_num();
+    void file_save(string save_file_path);
+    void file_load(string file_path);
+};
diff --git a/flp_seqcomp.cc b/flp_seqcomp.cc
new file mode 100644 (file)
index 0000000..44efc2e
--- /dev/null
@@ -0,0 +1,130 @@
+//                        ----------------------------------------
+//                         ---------- flp_seqcomp.cc  -----------
+//                        ----------------------------------------
+
+#include "flp.hh"
+
+// Titus thinks the start of an RC window should be its "rear" in the
+// normal version of the sequence.  He's wrong, of course, and in the
+// future my followers will annihilate his in the Final Battle, but for
+// now I'll let him have his way.
+// note seq2_i is actually the index of the leaving bp, so need to +1
+
+inline void
+FLPs::add(int seq1_i, int seq2_i, int a_score, int i2_offset)
+{
+  match a_match;
+
+
+  if (a_score >= hard_threshold)
+  {
+    a_match.index = seq2_i + i2_offset;
+    a_match.score = a_score;
+
+    all_matches[seq1_i].push_back(a_match);
+  }
+}
+
+
+void
+FLPs::seqcomp(string sseq1, string sseq2, bool is_RC)
+{
+  int start_i, seq1_i, seq2_i, win_i;      // loop variables
+  int matches;                    // number of matches in to a window
+  int i2_offset;
+  char * seq1, * seq2;
+
+
+  seq1 = (char *) sseq1.c_str();
+  seq2 = (char *) sseq2.c_str();
+
+
+  if (is_RC)
+    i2_offset = window_size - seq2_length;     
+  else
+    i2_offset = 1;
+
+
+  // loop thru the start positions for sequence 1
+  for(start_i = 0; start_i < seq1_win_num; start_i++)
+  {
+    matches = 0;
+    // compare initial window
+    for(win_i = 0; win_i < window_size; win_i++)
+      if ((seq1[start_i+win_i] == seq2[win_i]) &&
+          (seq1[start_i+win_i] != 'N'))       // N's match nothing **
+        matches++;
+
+    //seq2_i is always 0 for initial win
+    seq2_i = 0;
+
+    // call inlined function that adds match if it is above threshold
+    add(start_i, seq2_i, matches, i2_offset);
+
+    // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
+    // compare the bps leaving and entering the window and modify matches count
+    seq1_i = start_i;
+    while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
+    {
+      // copmpare the bp leaving the window
+      if ((seq1[seq1_i] == seq2[seq2_i]) &&
+         (seq1[seq1_i] != 'N'))       // N's match nothing
+        matches = matches -1;
+
+      // compare the bp entering the window
+      if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
+         (seq1[seq1_i+window_size] != 'N'))       // N's match nothing
+        matches = matches + 1;
+
+      add(seq1_i + 1, seq2_i, matches, i2_offset);
+
+      seq1_i = seq1_i + 1;     // increment seq1_i to next window 
+      seq2_i = seq2_i + 1;     // increment seq2_i to next window
+    } // end of loop thru the current offset sequence
+  } // end of loop thru the start positions of seq1 sequence
+
+  // loop thru the start positions for sequence 1
+  for(start_i = 1; start_i < seq2_win_num; start_i++)
+  {
+    matches = 0;
+
+    // compare initial window
+    for(win_i = 0; win_i < window_size; win_i++)
+      if ((seq2[start_i+win_i] == seq1[win_i]) &&
+         (seq2[start_i+win_i] != 'N'))       // N's match nothing
+        matches++;
+
+    //seq2_i is always start_i for initial window
+    seq2_i = start_i;
+
+    add(0, seq2_i, matches, i2_offset);
+
+    // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
+    // compare the bps leaving and entering the window and modify matches count
+    seq1_i = 0;
+    while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
+    {
+      // copmpare the bp leaving the window
+      if ((seq1[seq1_i] == seq2[seq2_i]) &&
+         (seq1[seq1_i] != 'N'))       // N's match nothing
+        matches = matches - 1;
+
+      // compare the bp entering the window
+      if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
+         (seq1[seq1_i+window_size] != 'N'))       // N's match nothing
+        matches = matches + 1;
+
+      add(seq1_i + 1, seq2_i, matches, i2_offset);
+
+      seq1_i = seq1_i + 1;     // increment seq1_i to next window
+      seq2_i = seq2_i + 1;     // increment seq2_i to next window
+    } // end of loop thru the current offset sequence
+  } // end of loop thru the start positions of seq2 sequence
+}
+
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe\n";
+      cout << "fum\n";
+*/
diff --git a/mussa.cc b/mussa.cc
new file mode 100644 (file)
index 0000000..2682772
--- /dev/null
+++ b/mussa.cc
@@ -0,0 +1,71 @@
+#include "mussa_class.hh"
+#include "time.h"
+
+int main(int argc, char **argv) 
+{
+  Mussa overlord;
+  char * para_file;
+  time_t t1, t2, begin, end;
+  double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime;
+
+  begin = time(NULL);
+
+  // need more sophisticated arg reading structure
+  // support -w and -t override options...other stuff??
+  para_file = * ++argv;
+
+  t1 = time(NULL);
+  overlord.setup(para_file);
+  t2 = time(NULL);
+  setuptime = difftime(t2, t1);
+
+
+      cout << "fee\n";
+  t1 = time(NULL);
+  overlord.get_Seqs();
+  t2 = time(NULL);
+  seqloadtime = difftime(t2, t1);
+
+
+      cout << "fie\n";
+  t1 = time(NULL);
+  overlord.seqcomp();
+  t2 = time(NULL);
+  seqcomptime = difftime(t2, t1);
+
+
+      cout << "foe\n";
+  t1 = time(NULL);
+  overlord.nway();
+  t2 = time(NULL);
+  nwaytime = difftime(t2, t1);
+
+
+      cout << "fum\n";
+  t1 = time(NULL);
+  overlord.save();
+  t2 = time(NULL);
+  savetime = difftime(t2, t1);
+
+  end = time(NULL);
+  totaltime = difftime(end, begin);
+
+  cout << "setup\tseqload\tseqcomp\tnway\tsave\ttotal\n";
+  cout << setuptime << "\t"; 
+  cout << seqloadtime << "\t";
+  cout << seqcomptime << "\t";
+  cout << nwaytime << "\t";
+  cout << savetime << "\t";
+  cout << totaltime << "\n";
+
+}
+
+
+
+
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe\n";
+      cout << "fum\n";
+*/
diff --git a/mussa_class.cc b/mussa_class.cc
new file mode 100644 (file)
index 0000000..442af68
--- /dev/null
@@ -0,0 +1,191 @@
+//                        ----------------------------------------
+//                          ---------- mussa_class.cc -----------
+//                        ----------------------------------------
+
+#include "mussa_class.hh"
+
+
+Mussa::Mussa()
+{
+}
+
+void
+Mussa::setup(char * para_file_path)
+{
+  ifstream para_file;
+  string file_data_line;
+  string param, value, annot_file;
+  int split_index, fasta_index;
+  bool seq_params, did_seq;
+  int bogo;
+
+
+  win_append = false;
+  thres_append = false;
+
+  para_file.open(para_file_path, ios::in);
+
+  getline(para_file,file_data_line);
+  split_index = file_data_line.find(" ");
+  param = file_data_line.substr(0,split_index);
+  value = file_data_line.substr(split_index+1);
+    
+
+  while (!para_file.eof())
+  {
+    did_seq = false;
+    if (param == "ANA_NAME")
+      ana_name = value;
+    else if (param == "APPEND_WIN")
+      win_append = true;
+    else if (param == "APPEND_THRES")
+      thres_append = true;
+    else if (param == "SEQUENCE_NUM")
+      seq_num = atoi(value.c_str());
+    else if (param == "WINDOW")
+      window = atoi(value.c_str());
+    else if (param == "THRESHOLD")
+      threshold = atoi(value.c_str());
+    else if (param == "SEQUENCE")
+    {
+      seq_files.push_back(value);
+      fasta_index = 1;
+      annot_file = "";
+      seq_params = true;
+
+      while ((!para_file.eof()) && seq_params)
+      {
+        getline(para_file,file_data_line);
+        split_index = file_data_line.find(" ");
+        param = file_data_line.substr(0,split_index);
+        value = file_data_line.substr(split_index+1);
+
+        if (param == "FASTA_INDEX")
+          fasta_index = atoi(value.c_str());
+        else if (param == "ANNOTATION")
+          annot_file = value;
+        //ignore empty lines or that start with '#'
+        else if ((param == "") || (param == "#")) {}
+        else seq_params = false;
+      }
+
+
+      fasta_indices.push_back(fasta_index);
+      annot_files.push_back(annot_file);
+      did_seq = true;
+      
+    }
+    //ignore empty lines or that start with '#'
+    else if ((param == "") || (param == "#")) {}
+    else
+    {
+      cout << "Illegal/misplaced mussa parameter in file\n";
+      cout << param << "\n";
+    }
+
+    if (!did_seq)
+    {
+      getline(para_file,file_data_line);
+      split_index = file_data_line.find(" ");
+      param = file_data_line.substr(0,split_index);
+      value = file_data_line.substr(split_index+1);
+      did_seq = false;
+    }
+  }
+
+  para_file.close();
+
+  cout << "ana_name = " << ana_name << win_append << win_append << "\n";
+  cout << "window = " << window << " threshold = " << threshold << "\n";
+}
+
+
+void
+Mussa::get_Seqs()
+{
+  list<string>::iterator seq_files_i, annot_files_i;
+  list<int>::iterator fasta_indices_i;
+  Sequence aSeq;
+
+  seq_files_i = seq_files.begin();
+  fasta_indices_i = fasta_indices.begin();
+  annot_files_i = annot_files.begin();
+
+  while ( (seq_files_i != seq_files.end()) &&
+          (fasta_indices_i != fasta_indices.end()) &&
+          (annot_files_i != annot_files.end()) )
+  {
+    aSeq.load_fasta(*seq_files_i, *fasta_indices_i);
+    the_Seqs.push_back(aSeq);
+    cout << aSeq.hdr() << endl;
+    aSeq.clear();
+    ++seq_files_i;
+    ++fasta_indices_i;
+    ++annot_files_i;
+  }
+}
+
+
+void
+Mussa::seqcomp()
+{
+  int i, i2;     // loop vars over sequences to analyze
+  vector<int> seq_lens;
+  vector<FLPs> empty_FLP_vector;
+  FLPs dummy_comp;
+  string save_file_string;
+
+  empty_FLP_vector.clear();
+  for(i = 0; i < seq_num; i++)
+  {
+    all_comps.push_back(empty_FLP_vector); 
+    for(i2 = 0; i2 < seq_num; i2++)
+      all_comps[i].push_back(dummy_comp);
+  }
+
+  for(i = 0; i < seq_num; i++)
+    seq_lens.push_back(the_Seqs[i].len());
+
+  for(i = 0; i < seq_num; i++)
+    for(i2 = i+1; i2 < seq_num; i2++)
+    {
+      all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]);
+      all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false);
+      all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true);
+    }
+}
+
+
+void
+Mussa::nway()
+{
+  the_paths.setup(seq_num);
+  the_paths.find_paths_r(all_comps);
+}
+
+
+void
+Mussa::save()
+{
+  int i;
+
+  save_file.open(ana_name.c_str(), ios::out);
+
+  for(i = 0; i < seq_num; i++)
+    save_file << the_Seqs[i].seq() << endl;
+
+  save_file << window << endl;
+  save_file.close();
+  //note more complex eventually since ana_name may need to have
+  //window size, threshold and other stuff to modify it...
+  the_paths.save_old(ana_name);
+}
+
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe\n";
+      cout << "fum\n";
+*/
+
+
diff --git a/mussa_class.hh b/mussa_class.hh
new file mode 100644 (file)
index 0000000..a84c159
--- /dev/null
@@ -0,0 +1,31 @@
+//                        ----------------------------------------
+//                          ---------- mussa_class.hh -----------
+//                        ----------------------------------------
+
+#include "mussa_nway.hh"
+
+
+class Mussa
+{
+  private:
+    string ana_name;
+    int seq_num, window, threshold;
+    list<string> seq_files, annot_files;
+    list<int> fasta_indices;
+    bool win_append, thres_append;
+
+    vector<Sequence> the_Seqs;
+    vector<vector<FLPs> > all_comps;
+    Nway_Paths the_paths;
+
+  public:
+    Mussa();
+    void setup(char * para_file_path);
+    void get_Seqs();
+    void seqcomp();
+    void nway();
+    void save();
+
+    fstream save_file;
+};
+
diff --git a/mussa_nway.cc b/mussa_nway.cc
new file mode 100644 (file)
index 0000000..ecbbb0d
--- /dev/null
@@ -0,0 +1,231 @@
+//                        ----------------------------------------
+//                         ---------- mussa_nway.cc  -----------
+//                        ----------------------------------------
+
+#include "mussa_nway.hh"
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe\n";
+      cout << "fum\n";
+*/
+
+
+Nway_Paths::Nway_Paths()
+{
+}
+
+void
+Nway_Paths::setup(int sp_num)
+{
+  species_num = sp_num;
+}
+
+
+void
+Nway_Paths::path_search(vector<vector<FLPs> > all_comparisons, vector<int> path, int depth)
+{
+  list<int> new_nodes, trans_check_nodes;
+  list<int>::iterator new_nodes_i, new_nodes_end;
+  int i;
+  bool trans_check_good;
+
+  new_nodes = all_comparisons[depth - 1][depth].matches(path[depth-1]);
+  new_nodes_i = new_nodes.begin();
+  new_nodes_end = new_nodes.end();
+  while(new_nodes_i != new_nodes_end)
+  {
+    //cout << "    * species " << depth << " node: " << *new_nodes_i << endl;
+    // check transitivity with previous nodes in path
+    for(i = 0; i < depth - 1; i++)
+    {
+      trans_check_good = true;
+      trans_check_nodes = all_comparisons[i][depth].matches(path[i]);
+      if (trans_check_nodes.end() == find(trans_check_nodes.begin(),trans_check_nodes.end(), *new_nodes_i) )
+        trans_check_good = false;
+    }
+
+    if (trans_check_good)
+    {
+      path.push_back(*new_nodes_i);
+      if (depth < species_num - 1)
+        path_search(all_comparisons, path, depth + 1);
+      else
+        pathz.push_back(path);
+      path.pop_back();
+    } 
+    ++new_nodes_i;
+  }
+}
+//          cout << "    ****I have the power...\n";
+
+/* use this if I ever get the friggin seqcomp match lists to sort...
+      if (binary_search(trans_check_nodes.begin(), trans_check_nodes.end(), 
+                        *new_nodes_i))
+*/
+
+void
+Nway_Paths::find_paths_r(vector<vector<FLPs> > all_comparisons)
+{
+  vector<int> path;
+  int win_i, window_num;
+  list<int> new_nodes;
+  list<int>::iterator new_nodes_i, new_nodes_end;
+
+  pathz.clear();
+  window_num = all_comparisons[0][1].win_num();
+  cout << window_num << endl;
+  // loop thru all windows in first species
+  for (win_i = 0; win_i < window_num; win_i++)
+  {
+    path.clear();
+    path.push_back(win_i);
+    new_nodes = all_comparisons[0][1].matches(path[0]);
+    new_nodes_i = new_nodes.begin();
+    new_nodes_end = new_nodes.end();
+    //if (new_nodes_i != new_nodes_end)
+    //cout << "* species 0 node: " << win_i << endl;
+    //  cout << "foookin hell\n";
+    path.push_back(0);
+    while(new_nodes_i != new_nodes_end)
+    {
+      //cout << "  * species 1 node: " << *new_nodes_i << endl;
+      path[1] = *new_nodes_i;
+      path_search(all_comparisons, path, 2);
+      ++new_nodes_i;
+    }
+  }
+}
+
+void
+Nway_Paths::save(string save_file_path)
+{
+  fstream save_file;
+  list<vector<int> >::iterator path_i, paths_end;
+  vector<int> a_path;
+  int i;
+
+  save_file.open(save_file_path.c_str(), ios::out);
+
+  //save_file << "<Seqcomp type=" << ana_type << " win=" << window_size;
+  //save_file << " thres=" << hard_threshold << ">\n";
+
+  path_i = pathz.begin();
+  paths_end = pathz.end();
+  if (path_i == paths_end)
+    cout << "Arrrrrrgggghhhhhh\n";
+  while(path_i != paths_end)
+  {
+    a_path = *path_i;
+    //cout << a_path.size() << endl;
+    for(i = 0; i < species_num; ++i)
+      save_file << i << "," << a_path[i] << " ";
+    save_file << endl;
+    ++path_i;
+  }
+
+//save_file << "</Mussa>\n";
+  save_file.close();
+}
+
+void
+Nway_Paths::save_old(string save_file_path)
+{
+  fstream save_file;
+  list<vector<int> >::iterator path_i, paths_end;
+  vector<int> a_path;
+  int i;
+
+  save_file.open(save_file_path.c_str(), ios::app);
+
+  path_i = pathz.begin();
+  paths_end = pathz.end();
+  while(path_i != paths_end)
+  {
+    a_path = *path_i;
+    //cout << a_path.size() << endl;
+    for(i = 0; i < species_num; ++i)
+      save_file << i << "," << a_path[i] << " ";
+    save_file << endl;
+    ++path_i;
+  }
+  save_file.close();
+}
+
+
+/*
+void
+Nway_Paths::find_paths(vector<vector<FLPs> > all_comparisons)
+{
+  int win_i, sp_i;
+  vector<list<list<int> > > path_src_tree;
+  list<int> new_nodes;
+  list<int>::iterator node_i, node_end;
+  <list<list<int> > >::iterator branch_i, branch_end;  
+
+  pathz.clear();
+  path_src_tree.reserve(species_num - 1);
+
+
+  // loop thru all windows in first species
+  for (win_i = 0; win_i < window_num; win_i++)
+  {
+    // clear the path search tree
+    for(i = 0; i < species_num; i++)
+      path_src_tree[i].clear();
+
+    // top level kept empty even tho implicity has one entry of the first
+    // species at this window - why bother, adds a little speed
+
+    // get connection list for first species, creating a list of nodes
+    // of second species connected to the first species at this window
+    new_nodes = all_comparisons[0][1];
+    path_src_tree[1].push_back(new_nodes);
+
+    // loop thru rest of species for this window to see if any paths of matches
+    // go across all species
+    // if path search tree becomes empty, break out of loop, no reason to search further
+    sp_i = 1;
+    while ((sp_i < species_num) && (path tree not empty))
+    {
+      branch_i = path_src_tree[1].begin();
+      branch_end = path_src_tree[1].end();
+      while (branch_i != branch_end)
+      {
+        node_i = branch_i->begin();
+        node_end = branch_i->end();
+      }
+
+
+      // loop over all current nodes
+         // get connection list for each node
+         // loop over each previous node in list
+            // get those nodes connection list
+            // intersect previous node connections with current
+
+      ++sp_i;
+    }
+
+    // insert any of the paths found into the master list of paths
+
+    // add no paths if tmp_pathz is empty...
+  }
+}
+
+void Nway_Paths::refine()
+{
+}
+
+void Nway_Paths::print()
+{
+  list<list<int> >::iterator pathz_i;
+
+  cout << "printing list of lists\n";
+  for (pathz_i = pathz.begin(); pathz_i != pathz.end(); ++pathz_i)
+  {
+    for (path_i = pathz_i->begin(); path_i != pathz_i->end(); ++path_i)
+      cout << *path_i << " ";
+    cout << endl;
+  }
+}
+*/
diff --git a/mussa_nway.hh b/mussa_nway.hh
new file mode 100644 (file)
index 0000000..6b4d0a1
--- /dev/null
@@ -0,0 +1,28 @@
+//                        ----------------------------------------
+//                         ----------  mussa_nway.hh  -----------
+//                        ----------------------------------------
+
+#include "flp.hh"
+
+
+class Nway_Paths
+{
+  private:
+    int species_num;
+    int threshold;
+    int win_size;
+    list<vector<int> > pathz;
+
+
+  public:
+    Nway_Paths();
+    void setup(int sp_num);
+    void find_paths_r(vector<vector<FLPs> > all_comparisons);
+    void path_search(vector<vector<FLPs> > all_comparisons, vector<int> path, int depth);
+    void save(string save_file_path);
+    void save_old(string save_file_path);
+    void load(string load_file_path);
+    void find_paths(vector<vector<FLPs> > all_comparisons);
+    void refine();
+    void print();
+};
diff --git a/seqcomp.cc b/seqcomp.cc
new file mode 100644 (file)
index 0000000..d0a19ec
--- /dev/null
@@ -0,0 +1,84 @@
+#include "flp.hh"
+#include <time.h>
+#include <iomanip.h>
+
+int main(int argc, char **argv) 
+{
+  Sequence seqA, seqB;
+  FLPs analysis;
+  string seqA_file, seqB_file, output_file;
+  string ana_type;
+  int window, threshold;
+  int seqA_len, seqB_len;
+
+  time_t t1, t2, begin, end;
+  double setuptime, comp1time, comp2time, sorttime, savetime, totaltime;
+
+  begin = time(NULL);
+
+  cout << "fee fie foe fum" << endl;
+
+  ana_type = * ++argv;
+  seqA_file = * ++argv;
+  seqB_file = * ++argv;
+  window = atoi(* ++argv);
+  threshold = atoi(* ++argv);
+  output_file = * ++argv;
+
+  t1 = time(NULL);
+  seqA.load_fasta(seqA_file, 1);
+  seqA_len = seqA.len();
+  //cout << setw(60) << seqA.hdr() << "\n";
+
+  seqB.load_fasta(seqB_file, 1);
+  seqB_len = seqB.len();
+  //cout << seqB.hdr() << "\n";
+
+
+  cout << "Length: Seq A = " << seqA_len;
+  cout << "; Seq B = " << seqB_len << "\n";
+
+  analysis.setup(ana_type, window, threshold, seqA_len, seqB_len);
+
+  t2 = time(NULL);
+  setuptime = difftime(t2, t1);
+
+  t1 = time(NULL);
+  analysis.seqcomp(seqA.seq(), seqB.seq(), false);
+  t2 = time(NULL);
+  comp1time = difftime(t2, t1);
+
+  t1 = time(NULL);
+  analysis.seqcomp(seqA.seq(), seqB.rev_comp(), true);
+  t2 = time(NULL);
+  comp2time = difftime(t2, t1);
+  /*
+  t1 = time(NULL);
+  analysis.sort();
+  t2 = time(NULL);
+  sorttime = difftime(t2, t1);
+  */
+  t1 = time(NULL);
+  analysis.file_save(output_file);
+  t2 = time(NULL);
+  savetime = difftime(t2, t1);
+
+  end = time(NULL);
+  totaltime = difftime(end, begin);
+
+  cout << "setup\tcomp\trc_comp\tsave\ttotal\n";
+  cout << setuptime << "\t"; 
+  cout << comp1time << "\t";
+  cout << comp2time << "\t";
+  //cout << sorttime << "\t";
+  cout << savetime << "\t";
+  cout << totaltime << "\n";
+}
+
+/*
+      cout << "fee\n";
+      cout << "fie\n";
+      cout << "foe  ";
+      cout << "fum\n";
+*/
diff --git a/sequence.cc b/sequence.cc
new file mode 100644 (file)
index 0000000..c2a836d
--- /dev/null
@@ -0,0 +1,132 @@
+//                        ----------------------------------------
+//                           ---------- sequence.cc -----------
+//                        ----------------------------------------
+
+#include "sequence.hh"
+
+
+Sequence::Sequence()
+{
+}
+
+
+void
+Sequence::load_fasta(string file_path, int seq_num)
+{
+  fstream data_file;
+  string file_data_line;
+  int header_counter = 0;
+  bool read_seq = true;
+
+
+  data_file.open(file_path.c_str(), ios::in);
+
+
+  // search for the header of the fasta sequence we want
+  while ( (!data_file.eof()) && (header_counter < seq_num) )
+  {
+    getline(data_file,file_data_line);
+    if (file_data_line.substr(0,1) == ">")
+      header_counter++;
+  }
+
+  header = file_data_line.substr(1);
+
+  while ( !data_file.eof() && read_seq )
+  {
+    getline(data_file,file_data_line);
+    if (file_data_line.substr(0,1) == ">")
+      read_seq = false;
+    else sequence += file_data_line;
+  }
+
+  data_file.close();
+
+  transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
+}
+
+
+void
+Sequence::load_annot(string file_path)
+{
+}
+
+
+int 
+Sequence::len()
+{
+  return sequence.length();
+}
+
+
+string
+Sequence::seq()
+{
+  return sequence;
+}
+
+string
+Sequence::rev_comp()
+{
+  string rev_comp;
+  char conversionTable[257];
+  int seq_i, table_i, len;
+
+  len = sequence.length();
+  rev_comp.reserve(len);
+  // make a conversion table
+  // init all parts of conversion table to '~' character
+  // '~' I doubt will ever appear in a sequence file (jeez, I hope)
+  // and may the fleas of 1000 camels infest the genitals of any biologist (and
+  // seven generations of their progeny) who decides to make it mean
+  // something special!!!
+  // PS - double the curse for any smartass non-biologist who tries it as well
+  for(table_i=0; table_i < 256; table_i++)
+  {
+    conversionTable[table_i] = '~';
+  }
+  // add end of string character for printing out table for testing purposes
+  conversionTable[256] = '\0';
+
+  // add in the characters for the bases we want to convert
+  conversionTable['A'] = 'T';
+  conversionTable['T'] = 'A';
+  conversionTable['G'] = 'C';
+  conversionTable['C'] = 'G';
+  conversionTable['N'] = 'N';
+
+  // finally, the actual conversion loop
+  for(seq_i = len - 1; seq_i >= 0; seq_i--)
+  {
+    table_i = (int) sequence[seq_i];
+    rev_comp += conversionTable[table_i];
+  }
+
+  return rev_comp;
+}
+
+
+string 
+Sequence::hdr()
+{
+  return header;
+}
+
+/*
+string 
+Sequence::species()
+{
+  return species;
+}
+*/
+
+void
+Sequence::clear()
+{
+  sequence = "";
+  length = 0;
+  header = "";
+  species = "";
+  species_num = -4;
+  annots.clear();
+}
diff --git a/sequence.hh b/sequence.hh
new file mode 100644 (file)
index 0000000..b287c72
--- /dev/null
@@ -0,0 +1,46 @@
+//                        ----------------------------------------
+//                           ---------- sequence.hh -----------
+//                        ----------------------------------------
+
+
+#include <iostream>
+#include <list>
+#include <vector>
+#include <string>
+#include <fstream.h>
+#include <stdlib.h>
+#include <algorithm>
+
+// Sequence data class
+
+class Sequence
+{
+  private:
+    string sequence;
+    int length;
+    string header;
+    string species;
+    int species_num; 
+
+    struct annot
+    {
+      int start, end;
+      string name, type, color;
+    };
+
+    list<annot> annots;
+
+
+
+
+  public:
+    Sequence();
+    void load_fasta(string file_path, int seq_num);
+    void load_annot(string file_path);
+    string seq();
+    string rev_comp();
+    int len();
+    string hdr();
+  //  string species();
+    void clear();
+};