Imported Upstream version 0.12.7
[bowtie.git] / SeqAn-1.1 / seqan / find / find_score.h
1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de 
4  ============================================================================
5   Copyright (C) 2007
6
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17  ============================================================================
18   $Id: find_score.h,v 1.1 2008/08/25 16:20:07 langmead Exp $
19  ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_FIND_SCORE_H
22 #define SEQAN_HEADER_FIND_SCORE_H
23
24 namespace SEQAN_NAMESPACE_MAIN
25 {
26
27 //////////////////////////////////////////////////////////////////////////////
28 // DPSearch
29 //////////////////////////////////////////////////////////////////////////////
30
31 template <typename TScore>
32 struct DPSearch;
33
34
35 /**
36 .Spec.DPSearch:
37 ..cat:Searching
38 ..general:Class.Pattern
39 ..summary:A dynamic programming algorithm for approximate string-matching with a user-definable scoring function.
40 ..signature:Pattern<TNeedle, DPSearch<TScore> >
41 ..param.TNeedle:The needle type.
42 ...type:Class.String
43 ..param.TScore:The scoring function.
44 ...type:Class.Score
45 ..remarks.text:The algorithm is based on the Sellers/Needleman-Wunsch dynamic progamming algorithm. 
46 The $Pattern$ object only contains the right-most column of the dynamic programming matrix.
47 ...note:At the moment, the algorithm only works on linear gap costs.
48 */
49
50 ///.Class.Pattern.param.TSpec.type:Class.Score
51
52
53 template <typename TNeedle, typename TScore>
54 class Pattern<TNeedle, DPSearch<TScore> >
55 {
56 public:
57         typedef typename Value<TScore>::Type TScoreValue;
58
59         Holder<TNeedle>         data_needle;
60         TScore                          data_score;
61         TScoreValue                     data_limit;
62         String<TScoreValue>     data_tab;
63
64 public: 
65         Pattern(): 
66                 data_limit(0)
67         { 
68 SEQAN_CHECKPOINT
69         }
70
71         Pattern(TNeedle & _needle, 
72                         TScore & _score_func, 
73                         TScoreValue _limit = 0): 
74                 data_score(_score_func),
75                 data_limit(_limit)
76         { 
77 SEQAN_CHECKPOINT
78                 setHost(*this, _needle);
79         }
80
81         Pattern(TNeedle & _needle,
82                         TScoreValue _limit = 0): 
83                 data_limit(_limit)
84         { 
85 SEQAN_CHECKPOINT
86                 setHost(*this, _needle);
87         }
88
89         Pattern(TScoreValue _limit): 
90                 data_limit(_limit)
91         { 
92 SEQAN_CHECKPOINT
93                 create(data_score);
94         }
95
96         Pattern(Pattern const & other): 
97                 data_needle( other.data_needle ),
98                 data_score( other.data_score ), 
99                 data_limit( other.data_limit ),
100                 data_tab( other.data_tab )
101         {
102 SEQAN_CHECKPOINT
103         }
104
105         inline Pattern & 
106         operator = (Pattern const & other) 
107         { 
108 SEQAN_CHECKPOINT
109                 this->data_needle = other.data_needle;
110                 this->data_score = other.data_score;
111                 this->data_limit = other.data_limit;
112                 this->data_tab = other.data_tab;
113
114                 return *this;
115         }
116 };
117
118
119 //////////////////////////////////////////////////////////////////////////////
120 // Host
121 //////////////////////////////////////////////////////////////////////////////
122
123 /* //see find_base.h
124
125 template <typename TNeedle, typename TScore>
126 struct Host< Pattern<TNeedle, DPSearch<TScore> > >
127 {
128         typedef TNeedle Type;
129 };
130
131 template <typename TNeedle, typename TScore>
132 struct Host< Pattern<TNeedle, DPSearch<TScore> > const>
133 {
134         typedef TNeedle const Type;
135 };
136 */
137
138 //////////////////////////////////////////////////////////////////////////////
139
140
141 template <typename TNeedle, typename TScore>
142 inline typename Host<Pattern<TNeedle, DPSearch<TScore> > >::Type & 
143 host(Pattern<TNeedle, DPSearch<TScore> > & me)
144 {
145 SEQAN_CHECKPOINT
146         return value(me.data_needle);
147 }
148
149 template <typename TNeedle, typename TScore>
150 inline typename Host<Pattern<TNeedle, DPSearch<TScore> > const>::Type & 
151 host(Pattern<TNeedle, DPSearch<TScore> >  const & me)
152 {
153 SEQAN_CHECKPOINT
154         return value(me.data_needle);
155 }
156
157
158 //____________________________________________________________________________
159
160 template <typename TNeedle, typename TScore, typename TNeedle2>
161 void 
162 setHost(Pattern<TNeedle, DPSearch<TScore> > & me, 
163                 TNeedle2 & ndl)
164 {
165         me.data_needle = ndl;
166         clear(me.data_tab);
167 }
168 template <typename TNeedle, typename TScore, typename TNeedle2>
169 void 
170 setHost(Pattern<TNeedle, DPSearch<TScore> > & me, 
171                 TNeedle2 const & ndl)
172 {
173         me.data_needle = ndl;
174         clear(me.data_tab);
175 }
176
177
178 //____________________________________________________________________________
179
180 /**.Function.scoringScheme
181 ..cat:Searching
182 ..summary:The @glos:scoring scheme@ used for finding or aligning.
183 ..signature:scoringScheme(obj)
184 ..param.obj:Object that holds a @glos:scoring scheme@
185 ...type:Spec.DPSearch
186 ..returns:The @glos:scoring scheme@ used in $obj$
187 ..see:glos:scoring scheme
188 */
189
190 template <typename TNeedle, typename TScore>
191 inline TScore const & 
192 scoringScheme(Pattern<TNeedle, DPSearch<TScore> > & me)
193 {
194 SEQAN_CHECKPOINT
195         return me.data_score;
196 }
197
198 //____________________________________________________________________________
199
200 /**.Function.setScoringScheme
201 ..cat:Searching
202 ..summary:Sets the @glos:scoring scheme@ used for finding or aligning.
203 ..signature:setScoringScheme(obj, score)
204 ..param.obj:Object that holds a @glos:scoring scheme@.
205 ...type:Spec.DPSearch
206 ..param.score:The new @glos:scoring scheme@ used by $obj$.
207 ..see:glos:scoring scheme
208 ..see:Function.scoringScheme
209 */
210
211 template <typename TNeedle, typename TScore, typename TScore2>
212 inline void
213 setScoringScheme(Pattern<TNeedle, DPSearch<TScore> > & me, 
214                                  TScore2 & score)
215 {
216 SEQAN_CHECKPOINT
217         me.data_score = score;
218         clear(me.data_tab);
219 }
220 template <typename TNeedle, typename TScore, typename TScore2>
221 inline void
222 setScoringScheme(Pattern<TNeedle, DPSearch<TScore> > & me, 
223                                  TScore2 const & score)
224 {
225 SEQAN_CHECKPOINT
226         me.data_score = score;
227         clear(me.data_tab);
228 }
229
230 //____________________________________________________________________________
231
232
233 /**.Function.scoreLimit
234 ..cat:Searching
235 ..summary:The minimal score a match must reach in approximate searching.
236 ..signature:scoreLimit(pattern)
237 ..param.pattern:A @Concept.Pattern|pattern@ that can be used for approximate searching.
238 ...type:Spec.DPSearch
239 ..returns:The current score limit of $pattern$.
240 */
241
242 template <typename TNeedle, typename TScore>
243 inline typename Value<TScore>::Type 
244 scoreLimit(Pattern<TNeedle, DPSearch<TScore> > const & me)
245 {
246 SEQAN_CHECKPOINT
247         return me.data_limit;
248 }
249
250 //____________________________________________________________________________
251
252 /**.Function.setScoreLimit
253 ..cat:Searching
254 ..summary:Sets the minimal score a match must reach in approximate searching.
255 ..signature:setScoreLimit(pattern, limit)
256 ..param.pattern:A @Concept.Pattern|pattern@ that can be used for approximate searching.
257 ...type:Spec.DPSearch
258 ..param.limit:The new score limit.
259 ..see:Function.scoreLimit
260 */
261
262 template <typename TNeedle, typename TScore, typename TScoreValue>
263 inline void 
264 setScoreLimit(Pattern<TNeedle, DPSearch<TScore> > & me, 
265                           TScoreValue _limit)
266 {
267 SEQAN_CHECKPOINT
268         me.data_limit = _limit;
269 }
270
271 //____________________________________________________________________________
272 // returns the score of the last hit position found (note:position = end of occurrence in haystack)
273
274 /**.Function.getScore
275 ..cat:Searching
276 ..summary:Score of the last found match in approximate searching.
277 ..signature:getScore(pattern)
278 ..param.pattern:A @Concept.Pattern|pattern@ that can be used for approximate searching.
279 ...type:Spec.DPSearch
280 ..returns:The score of the last match found using $pattern$.
281 ...remarks:If no match was found, the value is undefined.
282 ..see:Function.scoreLimit
283 ..see:Function.setScoreLimit
284 ..see:Function.find
285 */
286
287 template <typename TNeedle, typename TScore>
288 inline typename Value<TScore>::Type
289 getScore(Pattern<TNeedle, DPSearch<TScore> > & me)
290 {
291         return front(me.data_tab);
292 }
293
294 //////////////////////////////////////////////////////////////////////////////
295
296
297 template <typename TNeedle, typename TScore>
298 inline void _patternInit (Pattern<TNeedle, DPSearch<TScore> > & me) 
299 {
300         typedef Pattern<TNeedle, DPSearch<TScore> > TPattern;
301         typedef typename Value<TScore>::Type TScoreValue;
302
303         typedef typename Size<TPattern>::Type TSize;
304
305         typedef String<TScoreValue> TTab;
306         typedef typename Iterator<TTab, Standard>::Type TIterator;
307
308         TScoreValue score_gap = scoreGapExtend(scoringScheme(me));
309
310         TTab & string_tab = me.data_tab;
311
312         //allocate enough memory for one column of DP matrix
313         TSize need_length = length(needle(me));
314         SEQAN_ASSERT(need_length);
315
316         TSize got_length = resize(string_tab, need_length);
317         SEQAN_ASSERT(got_length >= need_length);
318
319 //      if (length(_dataNeedle(me)) < got_length) throw(0); //???TODO: Throw "not enough memory" exception
320
321         //init matrix
322         //note: The column is stored in reverse order
323         TIterator tab_end = begin(string_tab, Standard());
324         TIterator tab = end(string_tab, Standard());
325         
326         TScoreValue x = score_gap;
327
328         while (tab > tab_end)
329         {
330                 --tab;
331                 *tab = x;
332                 x += score_gap;
333         }
334 }
335
336
337
338 //////////////////////////////////////////////////////////////////////////////
339 // find, findNext
340 //////////////////////////////////////////////////////////////////////////////
341
342 //proportional gap cost: Needleman-Wunsch 
343
344 //???TODO: Ukkonen trick?
345 //???TODO: Finder for affine gap costs?
346
347 template <typename TFinder, typename TNeedle, typename TScore>
348 bool 
349 _find_score_simple_proportional(TFinder & finder, Pattern<TNeedle, DPSearch<TScore> > & me)
350 {
351         typedef typename Value<TScore>::Type TScoreValue;
352         typedef String<TScoreValue> TTab;
353         typedef typename Iterator<TTab, Standard>::Type TTabIterator;
354         typedef typename Iterator<TNeedle const, Standard>::Type TNeedleIterator;
355         typedef typename Value<typename Haystack<TFinder>::Type>::Type THaystackValue;
356
357         String<TScoreValue> & string_tab = me.data_tab;
358
359         TScoreValue score_gap = scoreGapExtend(scoringScheme(me));
360         TScoreValue score_match = scoreMatch(scoringScheme(me));
361         TScoreValue score_mismatch = scoreMismatch(scoringScheme(me));
362
363         //init table
364
365         if (empty(finder))
366         {
367                 clear(me.data_tab);
368                 _finderSetNonEmpty(finder);
369         }
370         else
371         {
372                 goNext(finder);
373         }
374
375         if (! length(me.data_tab))
376         {
377                 _patternInit(me);
378         }
379         //start searching
380
381         TTabIterator tab_begin = end(string_tab, Standard());
382
383         TNeedleIterator it_begin = begin(host(me), Standard());
384         TNeedleIterator it_end = end(host(me), Standard());
385
386         //for each character in haystack, do...
387         for (; !atEnd(finder); ++finder)
388         {
389                 //get character
390                 THaystackValue c = *finder;
391
392                 //init some variables
393                 TNeedleIterator it = it_begin;
394                 TScoreValue * tab = tab_begin;
395                 TScoreValue h = 0;
396                 TScoreValue v = 0;
397
398                 //fill the column
399                 while (it < it_end)
400                 {
401                         --tab; //note: the column is stored in "reverse order"
402
403                         TScoreValue m2 = (c == *it) ? h + score_match : h + score_mismatch;
404                         h = *tab;
405                         TScoreValue m1 = (h > v) ? h + score_gap : v + score_gap;
406
407                         v = (m1 > m2) ? m1 : m2;
408                         *tab = v;
409
410                         ++it;
411                 }
412
413                 if (*tab >= scoreLimit(me) )
414                 {//found a hit
415                         return true;
416                 }
417
418         }
419
420         //found nothing
421         return false;
422 }
423
424 //////////////////////////////////////////////////////////////////////////////
425
426 template <typename TFinder, typename TNeedle, typename TScore>
427 inline bool 
428 find(TFinder & finder, 
429          Pattern<TNeedle, DPSearch<TScore> > & me)
430 {
431         SEQAN_ASSERT(scoreGapOpen(scoringScheme(me)) == scoreGapExtend(scoringScheme(me))) //this finder is only defined for linear gap costs
432         return _find_score_simple_proportional(finder, me);
433 }
434
435 template <typename TFinder, typename TNeedle, typename TScore>
436 inline bool 
437 find(TFinder & finder, 
438          Pattern<TNeedle, DPSearch<TScore> > & me,
439          int const limit_)
440 {
441         SEQAN_ASSERT(scoreGapOpen(scoringScheme(me)) == scoreGapExtend(scoringScheme(me))) //this finder is only defined for linear gap costs
442         setScoreLimit(me, limit_);
443         return _find_score_simple_proportional(finder, me);
444 }
445
446
447 //////////////////////////////////////////////////////////////////////////////
448
449 }// namespace SEQAN_NAMESPACE_MAIN
450
451 #endif //#ifndef SEQAN_HEADER_...