1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- flp_seqcomp.cc -----------
13 // ----------------------------------------
15 #include "mussa_exceptions.hpp"
16 #include "alg/flp.hpp"
17 #include "alg/sequence.hpp"
22 // Note one recording RC matches. This version of 'seqcomp' records RC matches
23 // as index of the window start in its nonRC form. However, past versions of
24 // the analysis recorded it as the index of the rear of the window, as the
25 // following legacy comment indicates. FR still uses this method
27 // Titus thinks the start of an RC window should be its "rear" in the
28 // normal version of the sequence. He's wrong, of course, and in the
29 // future my followers will annihilate his in the Final Battle, but for
30 // now I'll let him have his way.
32 // note seq2_i is actually the index of the leaving bp, so need to +1
35 FLPs::add(int seq1_i, int seq2_i, int a_score, int i2_offset)
39 if (a_score >= hard_threshold)
41 a_match.index = seq2_i + i2_offset;
42 a_match.score = a_score;
44 (*all_matches)[seq1_i].push_back(a_match);
50 FLPs::seqcomp(const Sequence& sseq1, const Sequence& sseq2, bool is_RC)
52 int start_i, seq1_i, seq2_i, win_i; // loop variables
53 int matches; // number of matches in to a window
55 const char *seq1 = sseq1.c_str();
56 const char *seq2 = sseq2.c_str();
58 int seq1_win_num = sseq1.size() - window_size + 1;
59 int seq2_win_num = sseq2.size() - window_size + 1;
60 alloc_matches(sseq1.size());
61 if (seq1_win_num != size()) {
63 msg << "Assert failed, seq1_win_num["
64 << seq1_win_num << "] != size[" << size() << "]" << endl;
65 throw mussa_error(msg.str());
69 i2_offset = window_size - sseq2.size();
73 // I'm not sure why we're -2 instead of -1, but I experimentally
74 // determined that I need to -2 to get seqcomp_i to == seqcomp_end
75 seqcomp_end = size() + seq2_win_num -2;
77 // this does the "upper diagonals" of the search
79 // loop thru the start positions for sequence 1
80 for(start_i = 0; start_i != size(); ++start_i, ++seqcomp_i)
83 // compare initial window
84 for(win_i = 0; win_i < window_size; win_i++)
85 if ((seq1[start_i+win_i] == seq2[win_i]) &&
86 (seq1[start_i+win_i] != 'N')) // N's match nothing **
89 //seq2_i is always 0 for initial win
92 // call inlined function that adds match if it is above threshold
93 add(start_i, seq2_i, matches, i2_offset);
95 // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
96 // compare the bps leaving and entering the window and modify matches count
98 while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
100 // copmpare the bp leaving the window
101 if ((seq1[seq1_i] == seq2[seq2_i]) && (seq1[seq1_i] != 'N')) {
103 matches = matches -1;
106 // compare the bp entering the window
107 if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
108 (seq1[seq1_i+window_size] != 'N')) { // N's match nothing
109 matches = matches + 1;
111 add(seq1_i + 1, seq2_i + 1, matches, i2_offset);
113 seq1_i = seq1_i + 1; // increment seq1_i to next window
114 seq2_i = seq2_i + 1; // increment seq2_i to next window
115 } // end of loop thru the current offset sequence
116 } // end of loop thru the start positions of seq1 sequence
118 // this does the "lower diagonals" of the search
120 // loop thru the start positions for sequence 1
121 for(start_i = 1; start_i != seq2_win_num; ++start_i, ++seqcomp_i)
125 // compare initial window
126 for(win_i = 0; win_i < window_size; win_i++)
127 if ((seq2[start_i+win_i] == seq1[win_i]) &&
128 (seq2[start_i+win_i] != 'N')) // N's match nothing
131 //seq2_i is always start_i for initial window
134 add(0, seq2_i, matches, i2_offset);
136 // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
137 // compare the bps leaving and entering the window and modify matches count
139 while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
141 // copmpare the bp leaving the window
142 if ((seq1[seq1_i] == seq2[seq2_i]) &&
143 (seq1[seq1_i] != 'N')) // N's match nothing
144 matches = matches - 1;
146 // compare the bp entering the window
147 if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
148 (seq1[seq1_i+window_size] != 'N')) // N's match nothing
149 matches = matches + 1;
151 add(seq1_i + 1, seq2_i + 1, matches, i2_offset);
153 seq1_i = seq1_i + 1; // increment seq1_i to next window
154 seq2_i = seq2_i + 1; // increment seq2_i to next window
155 } // end of loop thru the current offset sequence
156 } // end of loop thru the start positions of seq2 sequence
158 if (seqcomp_i != seqcomp_end) {
159 clog << "seqcomp_i " << seqcomp_i << " " << seqcomp_end << endl;
160 //throw mussa_error("internal error with seqcomp seqcomp_i != seqcomp_end");
162 seqcomp_i = seqcomp_end = FLPs::seqcomp_not_running;