Commit patch to not break on spaces.
[bowtie.git] / SeqAn-1.1 / seqan / find / find_horspool.h
1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de 
4  ============================================================================
5   Copyright (C) 2007
6
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17  ============================================================================
18   $Id: find_horspool.h,v 1.1 2008/08/25 16:20:06 langmead Exp $
19  ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_FIND_HORSPOOL_H
22 #define SEQAN_HEADER_FIND_HORSPOOL_H
23
24 namespace SEQAN_NAMESPACE_MAIN
25 {
26
27 //////////////////////////////////////////////////////////////////////////////
28 // Horspool
29 //////////////////////////////////////////////////////////////////////////////
30
31 /**
32 .Spec.Horspool:
33 ..summary: Exact string matching using Horspool's algorithm (1980).
34 ..general:Class.Pattern
35 ..cat:Searching
36 ..signature:Pattern<TNeedle, Horspool>
37 ..param.TNeedle:The needle type.
38 ...type:Class.String
39 */
40
41 ///.Class.Pattern.param.TSpec.type:Spec.Horspool
42
43 struct _Horspool;
44 typedef Tag<_Horspool> Horspool;
45
46 //////////////////////////////////////////////////////////////////////////////
47
48 template <typename TNeedle>
49 class Pattern<TNeedle, Horspool>
50 {
51 //____________________________________________________________________________
52
53 public:
54         typedef typename Size<TNeedle>::Type TSize;
55
56         Holder<TNeedle>         data_needle;
57         String<TSize>           data_map;
58
59 //____________________________________________________________________________
60
61 public:
62         Pattern() {}
63
64         Pattern(Pattern const & other_):
65                 data_map(other_.data_map) {}
66
67         template <typename TNeedle2>
68         Pattern(TNeedle2 const & ndl)
69         {
70                 setHost(*this, ndl);
71         }
72
73         Pattern const &
74         operator = (Pattern const & other_)
75         {
76                 data_map = other_.data_map;
77                 return *this;
78         }
79 //____________________________________________________________________________
80 };
81
82
83 template <typename TNeedle, typename TNeedle2>
84 void
85 setHost(Pattern<TNeedle, Horspool> & me, TNeedle2 const & ndl)
86 {
87         typedef typename Value<TNeedle>::Type TValue;
88         typedef typename Size<TNeedle>::Type TSize;
89
90         SEQAN_ASSERT(!empty(ndl));
91
92         TSize value_size = ValueSize<TValue>::VALUE;
93
94         //make room for map
95         resize(me.data_map, value_size);
96
97         //fill map
98         typename Value<String<TSize> >::Type jump_width = length(ndl); //das ist so umstaendlich wegen VC++ 2003
99         arrayFill(begin(me.data_map, Standard()), begin(me.data_map, Standard()) + value_size, jump_width);
100
101         typename Iterator<TNeedle2 const, Standard>::Type it;
102         it = begin(ndl, Standard());
103         while (jump_width > 1)
104         {
105                 --jump_width;
106                 unsigned int pos_ = *it; //conversion value type to unsigned int
107                 me.data_map[pos_] = jump_width;
108                 ++it;
109         }
110
111         me.data_needle = ndl;
112 }
113
114 template <typename TNeedle, typename TNeedle2>
115 void
116 setHost(Pattern<TNeedle, Horspool> & horsp, TNeedle2 & ndl)
117 {
118         setHost(horsp, reinterpret_cast<TNeedle2 const &>(ndl));
119 }
120
121 //____________________________________________________________________________
122
123
124 template <typename TNeedle>
125 inline void _patternInit (Pattern<TNeedle, Horspool> &) {}
126
127 //____________________________________________________________________________
128
129 template <typename TNeedle>
130 inline typename Host<Pattern<TNeedle, Horspool> >::Type & 
131 host(Pattern<TNeedle, Horspool> & me)
132 {
133 SEQAN_CHECKPOINT
134         return value(me.data_needle);
135 }
136
137 template <typename TNeedle>
138 inline typename Host<Pattern<TNeedle, Horspool> const>::Type & 
139 host(Pattern<TNeedle, Horspool> const & me)
140 {
141 SEQAN_CHECKPOINT
142         return value(me.data_needle);
143 }
144
145 //____________________________________________________________________________
146
147 template <typename TFinder, typename TNeedle2>
148 bool
149 find_horspool(TFinder & finder, 
150                           Pattern<TNeedle2, Horspool> & me,
151                           bool find_first)
152 {
153 SEQAN_CHECKPOINT
154         typedef typename Haystack<TFinder>::Type THaystack;
155         THaystack & hayst = haystack(finder);
156
157         typedef Pattern<TNeedle2, Horspool> TPattern;
158         typedef typename Needle<TPattern>::Type TNeedle;
159         TNeedle & ndl = needle(me);
160
161         typedef typename Size<TNeedle>::Type TNeedleSize;
162         TNeedleSize ndl_size = length(ndl);
163
164         typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
165         THaystackIterator haystack_end = end(hayst, Standard());
166         THaystackIterator it = begin(hayst, Standard());
167         it += position(finder) + ndl_size - 1; //it points to the last character
168         THaystackIterator it_next = it;
169
170         typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
171         TNeedleIterator nit; //needle iterator
172         TNeedleIterator nit_begin = begin(ndl, Standard());
173         TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
174
175         unsigned int char_i;
176
177         if (find_first)
178         {
179                 goto VALIDATE;
180         }
181
182 MOVE_FURTHER:
183         //move to next position
184         char_i = *it; //conversion to unsigned integer
185         it_next = it + me.data_map[char_i];
186         if (it_next >= haystack_end)
187         {//found nothing
188                 return false;
189         }
190
191         it = it_next;
192
193 VALIDATE:
194         //validate current position
195         for (nit = nit_end; nit >= nit_begin; --nit)
196         {
197                 if (*nit != *it_next)
198                 {//invalid!
199                         goto MOVE_FURTHER;
200                 }
201                 --it_next;
202         }
203
204         //valid! return hit
205         setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
206         return true;
207 }
208
209 //____________________________________________________________________________
210 // Sentinel variant (not used at the moment)
211 //TODO: if not enough space at the end of the haystack: call non-sentinel search
212 /*
213 template <typename TFinder, typename TNeedle2>
214 bool
215 find_horspool_sentinel(TFinder & finder, 
216                                            Pattern<TNeedle2, Horspool> & me,
217                                            bool find_first)
218 {
219 SEQAN_CHECKPOINT
220         typedef typename Haystack<TFinder>::Type THaystack;
221         THaystack & hayst = haystack(finder);
222         
223
224         typedef Pattern<TNeedle2, Horspool> TPattern;
225         typedef typename Needle<TPattern>::Type TNeedle;
226         TNeedle & ndl = needle(me);
227
228         //implant sentinel
229         typename Size<THaystack>::Type old_haystack_size = length(hayst);
230         if (find_first)
231         {
232                 typedef typename Position<TFinder>::Type TFinderPosition;
233                 TFinderPosition finder_pos = position(finder);
234
235                 append(hayst, ndl, Exact());
236                 if (length(hayst) != old_haystack_size + length(ndl))
237                 {//not enough place in haystack
238 //TODO!!!
239 printf("error!");
240 return false;
241                 }
242                 setPosition(finder, finder_pos);
243         }
244         else
245         {
246                 _setLength(hayst, old_haystack_size + length(ndl));
247         }
248
249         typedef typename Size<TNeedle>::Type TNeedleSize;
250         TNeedleSize ndl_size = length(ndl);
251
252         typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
253         THaystackIterator it = begin(hayst, Standard());
254         THaystackIterator haystack_end = it + old_haystack_size;
255         it += position(finder) + ndl_size - 1; //it points to the last character
256
257         typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
258         TNeedleIterator nit; //needle iterator
259         TNeedleIterator nit_begin = begin(ndl, Standard());
260         TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
261
262         typedef typename Value<TNeedle>::Type TNeedleValue;
263         TNeedleValue char_needle_last = *nit_end;
264         TNeedleValue char_haystack_last;
265
266         char_haystack_last = *it;
267
268         if (find_first)
269         {
270                 goto VALIDATE;
271         }
272
273         //main loop
274 MOVE_FURTHER:
275         it += me.data_map[_ord(char_haystack_last)];
276         char_haystack_last = *it;
277         if (char_haystack_last != char_needle_last) goto MOVE_FURTHER;
278
279
280         if (it >= haystack_end)
281         {//found nothing
282                 resize(hayst, old_haystack_size);
283                 return false;
284         }
285
286 VALIDATE:
287         //validate current position
288         THaystackIterator it_back = it;
289         for (nit = nit_end; nit >= nit_begin; --nit)
290         {
291                 if (*nit != *it_back)
292                 {//invalid!
293                         goto MOVE_FURTHER;
294                 }
295                 --it_back;
296         }
297
298         //valid! return hit
299         setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
300         resize(hayst, old_haystack_size);
301         return true;
302 }
303 */
304 //____________________________________________________________________________
305 //spec for file reader haystacks
306
307 template <typename TFormat, typename TFile, typename TSpec>
308 struct FileReader;
309
310 template <typename TValue, typename TFormat, typename TFile, typename FileReaderTSpec, typename TFinderSpec, typename TNeedle2>
311 bool
312 find_horspool(Finder<String<TValue, FileReader<TFormat, TFile, FileReaderTSpec> >, TFinderSpec > & finder, 
313                           Pattern<TNeedle2, Horspool> & me,
314                           bool find_first)
315 {
316 SEQAN_CHECKPOINT
317         typedef Finder<String<TValue, FileReader<TFormat, TFile, FileReaderTSpec> >, TFinderSpec > TFinder;
318         typedef typename Haystack<TFinder>::Type THaystack;
319         THaystack & hayst = haystack(finder);
320
321         typedef Pattern<TNeedle2, Horspool> TPattern;
322         typedef typename Needle<TPattern>::Type TNeedle;
323         TNeedle & ndl = needle(me);
324
325         typedef typename Size<TNeedle>::Type TNeedleSize;
326         TNeedleSize ndl_size = length(ndl);
327
328         typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
329         THaystackIterator it(hayst, position(finder) + ndl_size - 1); //it points to the last character
330
331         typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
332         TNeedleIterator nit; //needle iterator
333         TNeedleIterator nit_begin = begin(ndl, Standard());
334         TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
335
336         unsigned int char_i;
337
338         if (find_first)
339         {
340                 goto VALIDATE;
341         }
342
343 MOVE_FURTHER:
344         //move to next position
345         char_i = *it; //conversion to unsigned integer
346         it += me.data_map[char_i];
347         if (atEnd(it))
348         {//found nothing
349                 return false;
350         }
351
352 VALIDATE:
353         //validate current position
354         for (nit = nit_end; nit >= nit_begin; --nit)
355         {
356                 if (*nit != *it)
357                 {//invalid!
358                         it += (nit_end - nit);
359                         goto MOVE_FURTHER;
360                 }
361                 --it;
362         }
363
364         //valid! return hit
365         setPosition(finder, it - begin(hayst, Standard()) + 1);
366         return true;
367
368 }
369
370 //____________________________________________________________________________
371 /* groepl variante
372
373 template <typename TFinder, typename TNeedle2>
374 bool
375 find_horspool(TFinder & finder, 
376         Pattern<TNeedle2, Horspool> & me,
377         bool find_first)
378 {
379 SEQAN_CHECKPOINT
380         typedef typename Haystack<TFinder>::Type THaystack;
381         THaystack & hayst = haystack(finder);
382
383         typedef Pattern<TNeedle2, Horspool> TPattern;
384         typedef typename Needle<TPattern>::Type TNeedle;
385         TNeedle & ndl = needle(me);
386
387         typename Size<TNeedle>::Type ndl_size = length(ndl);
388
389         typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
390         THaystackIterator haystack_end = end(hayst, Standard());
391         THaystackIterator it = begin(hayst, Standard());
392         it += position(finder) + ndl_size - 1; //it points to the last character
393         THaystackIterator it2;
394
395         typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
396         TNeedleIterator nit; //needle iterator
397         TNeedleIterator nit_begin = begin(ndl, Standard());
398         TNeedleIterator nit_end = end(ndl, Standard()) - 2; //here the verification begins
399
400         typedef typename Value<TNeedle>::Type TNeedleValue;
401         TNeedleValue last_needle_char = value(ndl, ndl_size - 1);
402
403         unsigned int char_i;
404
405         if (!find_first)
406         {
407                 ++it;
408         }
409
410 MOVE_FURTHER:
411         //scan for the last character
412         while (true)
413         {
414                 if (it >= haystack_end)
415                 {//found nothing
416                         return false;
417                 }
418                 if (*it == last_needle_char) 
419                 {
420                         break;
421                 }
422                 ++it;
423         }
424
425 VALIDATE:
426         it2 = it;
427         //validate current position
428         for (nit = nit_end; nit >= nit_begin; --nit)
429         {
430                 --it2;
431                 if (*nit != *it2)
432                 {//invalid! skip!
433                         char_i = *it; //conversion to unsigned integer
434                         it += me.data_map[char_i];
435                         goto MOVE_FURTHER;
436                 }
437         }
438
439         //valid! return hit
440         setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
441         return true;
442 }
443 */
444
445 template <typename TFinder, typename TNeedle2>
446 bool
447 find(TFinder & finder, Pattern<TNeedle2, Horspool> & me)
448 {
449 SEQAN_CHECKPOINT
450         bool find_first = empty(finder);
451         if (find_first)
452         {
453                 _patternInit(me);
454                 _finderSetNonEmpty(finder);
455         }
456
457         SEQAN_ASSERT(length(needle(me)) > 0)
458
459         return find_horspool(finder, me, find_first);
460 }
461
462 //////////////////////////////////////////////////////////////////////////////
463 // Host
464 //////////////////////////////////////////////////////////////////////////////
465
466 template <typename TNeedle>
467 struct Host< Pattern<TNeedle, Horspool> >
468 {
469         typedef TNeedle Type;
470 };
471
472 template <typename TNeedle>
473 struct Host< Pattern<TNeedle, Horspool> const>
474 {
475         typedef TNeedle const Type;
476 };
477
478
479 //////////////////////////////////////////////////////////////////////////////
480
481 }// namespace SEQAN_NAMESPACE_MAIN
482
483 #endif //#ifndef SEQAN_HEADER_...