1 // This file is part of the Mussa source distribution.
2 // http://mussa.caltech.edu/
3 // Contact author: Tristan De Buysscher, tristan@caltech.edu
5 // This program and all associated source code files are Copyright (C) 2005
6 // the California Institute of Technology, Pasadena, CA, 91125 USA. It is
7 // under the GNU Public License; please see the included LICENSE.txt
8 // file for more information, or contact Tristan directly.
11 // ----------------------------------------
12 // ---------- flp_seqcomp.cc -----------
13 // ----------------------------------------
15 #include "mussa_exceptions.hpp"
16 #include "alg/flp.hpp"
17 #include "alg/sequence.hpp"
22 // Note one recording RC matches. This version of 'seqcomp' records RC matches
23 // as index of the window start in its nonRC form. However, past versions of
24 // the analysis recorded it as the index of the rear of the window, as the
25 // following legacy comment indicates. FR still uses this method
27 // Titus thinks the start of an RC window should be its "rear" in the
28 // normal version of the sequence. He's wrong, of course, and in the
29 // future my followers will annihilate his in the Final Battle, but for
30 // now I'll let him have his way.
32 // note seq2_i is actually the index of the leaving bp, so need to +1
35 FLPs::add(int seq1_i, int seq2_i, int a_score, int i2_offset)
39 if (a_score >= hard_threshold)
41 a_match.index = seq2_i + i2_offset;
42 a_match.score = a_score;
44 (*all_matches)[seq1_i].push_back(a_match);
48 void FLPs::validate_sequence(const Sequence& seq) const
50 if (seq.size() < window_size) {
52 msg << "Sequence " << seq.get_name() << " of length " << seq.size()
53 << " must be longer than window size " << window_size;
54 throw seqcomp_error(msg.str());
59 FLPs::seqcomp(const Sequence& sseq1, const Sequence& sseq2, bool is_RC)
61 int start_i, seq1_i, seq2_i, win_i; // loop variables
62 int matches; // number of matches in to a window
64 std::string sseq1_str(sseq1.get_sequence());
65 std::string sseq2_str(sseq2.get_sequence());
66 const char *seq1 = sseq1_str.c_str();
67 const char *seq2 = sseq2_str.c_str();
69 validate_sequence(sseq1);
70 validate_sequence(sseq2);
72 size_type seq1_win_num = sseq1.size() - window_size + 1;
73 size_type seq2_win_num = sseq2.size() - window_size + 1;
74 alloc_matches(sseq1.size());
75 if (seq1_win_num != size()) {
77 msg << "Assert failed, seq1_win_num["
78 << seq1_win_num << "] != size[" << size() << "]" << endl;
79 throw mussa_error(msg.str());
83 i2_offset = window_size - sseq2.size();
87 // I'm not sure why we're -2 instead of -1, but I experimentally
88 // determined that I need to -2 to get seqcomp_i to == seqcomp_end
89 seqcomp_end = size() + seq2_win_num -2;
91 // this does the "upper diagonals" of the search
93 // loop thru the start positions for sequence 1
94 for(start_i = 0; start_i != size(); ++start_i, ++seqcomp_i)
97 // compare initial window
98 for(win_i = 0; win_i < window_size; win_i++)
99 if ((seq1[start_i+win_i] == seq2[win_i]) &&
100 (seq1[start_i+win_i] != 'N')) // N's match nothing **
103 //seq2_i is always 0 for initial win
106 // call inlined function that adds match if it is above threshold
107 add(start_i, seq2_i, matches, i2_offset);
109 // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
110 // compare the bps leaving and entering the window and modify matches count
112 while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
114 // copmpare the bp leaving the window
115 if ((seq1[seq1_i] == seq2[seq2_i]) && (seq1[seq1_i] != 'N')) {
117 matches = matches -1;
120 // compare the bp entering the window
121 if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
122 (seq1[seq1_i+window_size] != 'N')) { // N's match nothing
123 matches = matches + 1;
125 add(seq1_i + 1, seq2_i + 1, matches, i2_offset);
127 seq1_i = seq1_i + 1; // increment seq1_i to next window
128 seq2_i = seq2_i + 1; // increment seq2_i to next window
129 } // end of loop thru the current offset sequence
130 } // end of loop thru the start positions of seq1 sequence
132 // this does the "lower diagonals" of the search
134 // loop thru the start positions for sequence 1
135 for(start_i = 1; start_i != seq2_win_num; ++start_i, ++seqcomp_i)
139 // compare initial window
140 for(win_i = 0; win_i < window_size; win_i++)
141 if ((seq2[start_i+win_i] == seq1[win_i]) &&
142 (seq2[start_i+win_i] != 'N')) // N's match nothing
145 //seq2_i is always start_i for initial window
148 add(0, seq2_i, matches, i2_offset);
150 // run through rest of seq1 and seq2 with current seq1 offset (ie start_i)
151 // compare the bps leaving and entering the window and modify matches count
153 while( (seq1_i < seq1_win_num-1) && (seq2_i < seq2_win_num-1) )
155 // copmpare the bp leaving the window
156 if ((seq1[seq1_i] == seq2[seq2_i]) &&
157 (seq1[seq1_i] != 'N')) // N's match nothing
158 matches = matches - 1;
160 // compare the bp entering the window
161 if ((seq1[seq1_i+window_size] == seq2[seq2_i+window_size]) &&
162 (seq1[seq1_i+window_size] != 'N')) // N's match nothing
163 matches = matches + 1;
165 add(seq1_i + 1, seq2_i + 1, matches, i2_offset);
167 seq1_i = seq1_i + 1; // increment seq1_i to next window
168 seq2_i = seq2_i + 1; // increment seq2_i to next window
169 } // end of loop thru the current offset sequence
170 } // end of loop thru the start positions of seq2 sequence
172 if (seqcomp_i != seqcomp_end) {
173 clog << "seqcomp_i " << seqcomp_i << " " << seqcomp_end << endl;
174 //throw mussa_error("internal error with seqcomp seqcomp_i != seqcomp_end");
176 seqcomp_i = seqcomp_end = FLPs::seqcomp_not_running;