4 * Created on: October 15, 2009
12 #include "color_dec.h"
20 -1, 1, 1, 2, 1, 2, 2, 3,
21 1, 2, 2, 3, 2, 3, 3, 4
23 static int firsts[] = {
24 -1, 0, 1, 0, 2, 0, 1, 0,
25 3, 0, 1, 0, 2, 0, 1, 0
29 * Given a nucleotide mask, pick one matching nucleotide at random.
31 static int randFromMask(int mask) {
33 if(alts[mask] == 1) return firsts[mask];
36 int r = rand() % alts[mask];
38 assert_lt(r, alts[mask]);
39 for(int i = 0; i < 4; i++) {
40 if((mask & (1 << i)) != 0) {
45 cerr << "Shouldn't get here" << endl;
51 * Does a 2-bit-encoded base match any bit in a mask?
53 static inline bool matches(int i, int j) {
54 return ((1 << i) & j) != 0;
58 * Given the dynamic programming table, trace backwards from the last
59 * column and populate the 's' and 'cmm' strings accordingly. Whenever
60 * there are multiple equally good ways of backtracking, choose one at
63 static void backtrack(int table[4][6][1025], // filled-in table
64 const char *read, size_t readi, size_t readf,
65 const char *ref, size_t refi, size_t reff,
66 char *s, // final nucleotide string
67 char *cmm, // color mismatches
68 char *nmm, // nucleotide mismatches
69 int& cmms, // # color mismatches
70 int& nmms) // # nucleotide mismatches
72 const size_t len = reff-refi;
76 // Determine best base in final column of table
77 for(int i = 0; i < 4; i++) {
78 // Install minimum and backtrack info
79 int m = table[i][4][len-1];
87 // i <- position of rightmost nucleotide
89 // to <- rightmost nucleotide
90 int to = randFromMask(bests);
92 bests = table[to][5][i]; // get next best mask
93 s[i--] = to; // install best nucleotide
94 if(i < 0) break; // done
97 to = randFromMask(bests); // select
99 // Determine what reference nucleotides were matched against
100 for(size_t i = 0; i < len; i++) {
101 if(matches(s[i], ref[refi+i])) {
102 assert_eq(1, alts[(int)ref[refi+i]]);
103 // Just plain matched
106 // If ref is ambiguous here, does it matter which one we
107 // choose? I don't think so.
108 assert_eq(1, alts[(int)ref[refi+i]]);
114 for(size_t i = 0; i < len-1; i++) {
115 int c1 = (int)read[readi+i]; // actual
116 int c2 = dinuc2color[(int)s[i]][(int)s[i+1]]; // decoded
117 assert_leq(c1, 4); assert_geq(c1, 0);
118 if(c1 != c2 || c1 == 4) {
120 assert_lt(c2, 4); assert_geq(c2, 0);
121 cmm[i] = "0123."[c2];
131 * Decode the colorspace read 'read' as aligned against the reference
132 * string 'ref', assuming that it's a hit.
135 const char *read, // ASCII colors, '0', '1', '2', '3', '.'
136 const char *qual, // ASCII quals, Phred+33 encoded
137 size_t readi, // offset of first character within 'read' to consider
138 size_t readf, // offset of last char (exclusive) in 'read' to consider
139 const char *ref, // reference sequence, as masks
140 size_t refi, // offset of first character within 'ref' to consider
141 size_t reff, // offset of last char (exclusive) in 'ref' to consider
142 int snpPhred, // penalty incurred by a SNP
143 char *ns, // decoded nucleotides are appended here
144 char *cmm, // where the color mismatches are in the string
145 char *nmm, // where nucleotide mismatches are in the string
146 int& cmms, // number of color mismatches
147 int& nmms) // number of nucleotide mismatches
149 assert_lt(refi, reff);
150 assert_lt(readi, readf);
151 assert_eq(reff-refi-1, readf-readi);
154 // Dynamic programming table; good for colorspace reads up to 1024
157 int table[4][6][1025];
158 // 0 -> A, 1 -> C, 2 -> G, 3 -> T, 4 -> min(A, C, G, T),
159 // 5 -> backtrack mask, 6 -> min mismatches
161 // The first column of the table just considers the first
162 // nucleotide and whether it matches the ref nucleotide.
163 for(int to = 0; to < 4; to++) {
164 if(matches(to, ref[refi])) {
165 // The assigned subject nucleotide matches the reference;
172 table[to][5][0] = 15;
174 // The assigned subject nucleotide does not match the
175 // reference nucleotide, so we add a SNP penalty
176 table[to][0][0] = snpPhred;
177 table[to][1][0] = snpPhred;
178 table[to][2][0] = snpPhred;
179 table[to][3][0] = snpPhred;
180 table[to][4][0] = snpPhred;
181 table[to][5][0] = 15;
185 // Successive columns examine successive alignment positions
186 int omin = INT_MAX, t = 0;
187 int lastOmin = INT_MAX;
188 for(size_t c = readi; c < readf; c++) {
189 const int readc = (int)read[c];
190 assert_leq(readc, 4);
191 assert_geq(readc, 0);
194 // t <- index of column in dynamic programming table
196 const int refc = ref[refi + t];
197 int from[] = { table[0][4][t-1], table[1][4][t-1],
198 table[2][4][t-1], table[3][4][t-1] };
199 // For each downstream nucleotide
200 for(int to = 0; to < 4; to++) {
201 // For each upstream nucleotide
203 const int goodfrom = nuccol2nuc[to][readc];
205 // Reward the preceding position
206 if(goodfrom < 4) from[goodfrom] -= q;
212 } else if(from[1] == min) {
213 table[to][5][t] |= 2;
218 } else if(from[2] == min) {
219 table[to][5][t] |= 4;
224 } else if(from[3] == min) {
225 table[to][5][t] |= 8;
228 if(!matches(to, refc)) {
231 table[to][4][t] = min;
232 if(min < omin) omin = min;
233 if(goodfrom < 4) from[goodfrom] += q;
238 assert_eq(t, (int)(reff - refi));
239 // Install the best backward path into ns, cmm, nmm
241 read, readi, readi + t - 1,
243 ns, cmm, nmm, cmms, nmms);
246 #ifdef MAIN_COLOR_DEC
251 static const char *short_opts = "s:m:r:e:";
252 static struct option long_opts[] = {
253 {(char*)"snppen", required_argument, 0, 's'},
254 {(char*)"misspen", required_argument, 0, 'm'},
255 {(char*)"seed", required_argument, 0, 'r'},
256 {(char*)"maxpen", required_argument, 0, 'e'}
260 T parse(const char *s) {
267 int main(int argc, char **argv) {
268 int option_index = 0;
275 next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index);
276 switch (next_option) {
277 case 's': snppen = parse<int>(optarg); break;
278 case 'm': misspen = parse<int>(optarg); break;
279 case 'r': seed = parse<unsigned>(optarg); break;
280 case 'e': maxPenalty = parse<int>(optarg); break;
283 cerr << "Unknown option: " << (char)next_option << endl;
287 } while(next_option != -1);
289 if(argc - optind < 2) {
290 cerr << "Not enough options" << endl;
295 for(size_t i = 0; i < read.length(); i++) {
296 read[i] = asc2col[(int)read[i]];
297 assert_leq(read[i], 4);
298 assert_geq(read[i], 0);
300 ref = argv[optind+1];
301 for(size_t i = 0; i < ref.length(); i++) {
303 int alts[] = {4, 4, 4, 4};
304 decodeNuc(toupper(ref[i]), num, alts);
308 for(int j = 0; j < num; j++) {
309 ref[i] |= (1 << alts[j]);
314 quals.resize(read.length(), misspen);
316 int score = decode(read, quals, 0, read.length(),
317 ref, 0, ref.length(), maxPenalty,
318 snppen, ns, cmm, nmm);
319 cout << " Score: " << score << " (max: " << maxPenalty << ")" << endl;
321 for(size_t i = 0; i < cmm.length(); i++) {
322 cout << cmm[i] << " ";
326 for(size_t i = 0; i < read.length(); i++) {
327 printColor((int)read[i]);
332 for(size_t i = 0; i < ns.length(); i++) {
333 cout << "ACGTN"[(int)ns[i]] << " ";
337 for(size_t i = 0; i < ref.length(); i++) {
338 cout << mask2iupac[(int)ref[i]] << " ";
342 for(size_t i = 0; i < ref.length(); i++) {
343 cout << nmm[i] << " ";