1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
4 ============================================================================
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 ============================================================================
18 $Id: find_horspool.h,v 1.1 2008/08/25 16:20:06 langmead Exp $
19 ==========================================================================*/
21 #ifndef SEQAN_HEADER_FIND_HORSPOOL_H
22 #define SEQAN_HEADER_FIND_HORSPOOL_H
24 namespace SEQAN_NAMESPACE_MAIN
27 //////////////////////////////////////////////////////////////////////////////
29 //////////////////////////////////////////////////////////////////////////////
33 ..summary: Exact string matching using Horspool's algorithm (1980).
34 ..general:Class.Pattern
36 ..signature:Pattern<TNeedle, Horspool>
37 ..param.TNeedle:The needle type.
41 ///.Class.Pattern.param.TSpec.type:Spec.Horspool
44 typedef Tag<_Horspool> Horspool;
46 //////////////////////////////////////////////////////////////////////////////
48 template <typename TNeedle>
49 class Pattern<TNeedle, Horspool>
51 //____________________________________________________________________________
54 typedef typename Size<TNeedle>::Type TSize;
56 Holder<TNeedle> data_needle;
57 String<TSize> data_map;
59 //____________________________________________________________________________
64 Pattern(Pattern const & other_):
65 data_map(other_.data_map) {}
67 template <typename TNeedle2>
68 Pattern(TNeedle2 const & ndl)
74 operator = (Pattern const & other_)
76 data_map = other_.data_map;
79 //____________________________________________________________________________
83 template <typename TNeedle, typename TNeedle2>
85 setHost(Pattern<TNeedle, Horspool> & me, TNeedle2 const & ndl)
87 typedef typename Value<TNeedle>::Type TValue;
88 typedef typename Size<TNeedle>::Type TSize;
90 SEQAN_ASSERT(!empty(ndl));
92 TSize value_size = ValueSize<TValue>::VALUE;
95 resize(me.data_map, value_size);
98 typename Value<String<TSize> >::Type jump_width = length(ndl); //das ist so umstaendlich wegen VC++ 2003
99 arrayFill(begin(me.data_map, Standard()), begin(me.data_map, Standard()) + value_size, jump_width);
101 typename Iterator<TNeedle2 const, Standard>::Type it;
102 it = begin(ndl, Standard());
103 while (jump_width > 1)
106 unsigned int pos_ = *it; //conversion value type to unsigned int
107 me.data_map[pos_] = jump_width;
111 me.data_needle = ndl;
114 template <typename TNeedle, typename TNeedle2>
116 setHost(Pattern<TNeedle, Horspool> & horsp, TNeedle2 & ndl)
118 setHost(horsp, reinterpret_cast<TNeedle2 const &>(ndl));
121 //____________________________________________________________________________
124 template <typename TNeedle>
125 inline void _patternInit (Pattern<TNeedle, Horspool> &) {}
127 //____________________________________________________________________________
129 template <typename TNeedle>
130 inline typename Host<Pattern<TNeedle, Horspool> >::Type &
131 host(Pattern<TNeedle, Horspool> & me)
134 return value(me.data_needle);
137 template <typename TNeedle>
138 inline typename Host<Pattern<TNeedle, Horspool> const>::Type &
139 host(Pattern<TNeedle, Horspool> const & me)
142 return value(me.data_needle);
145 //____________________________________________________________________________
147 template <typename TFinder, typename TNeedle2>
149 find_horspool(TFinder & finder,
150 Pattern<TNeedle2, Horspool> & me,
154 typedef typename Haystack<TFinder>::Type THaystack;
155 THaystack & hayst = haystack(finder);
157 typedef Pattern<TNeedle2, Horspool> TPattern;
158 typedef typename Needle<TPattern>::Type TNeedle;
159 TNeedle & ndl = needle(me);
161 typedef typename Size<TNeedle>::Type TNeedleSize;
162 TNeedleSize ndl_size = length(ndl);
164 typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
165 THaystackIterator haystack_end = end(hayst, Standard());
166 THaystackIterator it = begin(hayst, Standard());
167 it += position(finder) + ndl_size - 1; //it points to the last character
168 THaystackIterator it_next = it;
170 typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
171 TNeedleIterator nit; //needle iterator
172 TNeedleIterator nit_begin = begin(ndl, Standard());
173 TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
183 //move to next position
184 char_i = *it; //conversion to unsigned integer
185 it_next = it + me.data_map[char_i];
186 if (it_next >= haystack_end)
194 //validate current position
195 for (nit = nit_end; nit >= nit_begin; --nit)
197 if (*nit != *it_next)
205 setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
209 //____________________________________________________________________________
210 // Sentinel variant (not used at the moment)
211 //TODO: if not enough space at the end of the haystack: call non-sentinel search
213 template <typename TFinder, typename TNeedle2>
215 find_horspool_sentinel(TFinder & finder,
216 Pattern<TNeedle2, Horspool> & me,
220 typedef typename Haystack<TFinder>::Type THaystack;
221 THaystack & hayst = haystack(finder);
224 typedef Pattern<TNeedle2, Horspool> TPattern;
225 typedef typename Needle<TPattern>::Type TNeedle;
226 TNeedle & ndl = needle(me);
229 typename Size<THaystack>::Type old_haystack_size = length(hayst);
232 typedef typename Position<TFinder>::Type TFinderPosition;
233 TFinderPosition finder_pos = position(finder);
235 append(hayst, ndl, Exact());
236 if (length(hayst) != old_haystack_size + length(ndl))
237 {//not enough place in haystack
242 setPosition(finder, finder_pos);
246 _setLength(hayst, old_haystack_size + length(ndl));
249 typedef typename Size<TNeedle>::Type TNeedleSize;
250 TNeedleSize ndl_size = length(ndl);
252 typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
253 THaystackIterator it = begin(hayst, Standard());
254 THaystackIterator haystack_end = it + old_haystack_size;
255 it += position(finder) + ndl_size - 1; //it points to the last character
257 typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
258 TNeedleIterator nit; //needle iterator
259 TNeedleIterator nit_begin = begin(ndl, Standard());
260 TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
262 typedef typename Value<TNeedle>::Type TNeedleValue;
263 TNeedleValue char_needle_last = *nit_end;
264 TNeedleValue char_haystack_last;
266 char_haystack_last = *it;
275 it += me.data_map[_ord(char_haystack_last)];
276 char_haystack_last = *it;
277 if (char_haystack_last != char_needle_last) goto MOVE_FURTHER;
280 if (it >= haystack_end)
282 resize(hayst, old_haystack_size);
287 //validate current position
288 THaystackIterator it_back = it;
289 for (nit = nit_end; nit >= nit_begin; --nit)
291 if (*nit != *it_back)
299 setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
300 resize(hayst, old_haystack_size);
304 //____________________________________________________________________________
305 //spec for file reader haystacks
307 template <typename TFormat, typename TFile, typename TSpec>
310 template <typename TValue, typename TFormat, typename TFile, typename FileReaderTSpec, typename TFinderSpec, typename TNeedle2>
312 find_horspool(Finder<String<TValue, FileReader<TFormat, TFile, FileReaderTSpec> >, TFinderSpec > & finder,
313 Pattern<TNeedle2, Horspool> & me,
317 typedef Finder<String<TValue, FileReader<TFormat, TFile, FileReaderTSpec> >, TFinderSpec > TFinder;
318 typedef typename Haystack<TFinder>::Type THaystack;
319 THaystack & hayst = haystack(finder);
321 typedef Pattern<TNeedle2, Horspool> TPattern;
322 typedef typename Needle<TPattern>::Type TNeedle;
323 TNeedle & ndl = needle(me);
325 typedef typename Size<TNeedle>::Type TNeedleSize;
326 TNeedleSize ndl_size = length(ndl);
328 typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
329 THaystackIterator it(hayst, position(finder) + ndl_size - 1); //it points to the last character
331 typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
332 TNeedleIterator nit; //needle iterator
333 TNeedleIterator nit_begin = begin(ndl, Standard());
334 TNeedleIterator nit_end = end(ndl, Standard()) - 1; //here the verification begins
344 //move to next position
345 char_i = *it; //conversion to unsigned integer
346 it += me.data_map[char_i];
353 //validate current position
354 for (nit = nit_end; nit >= nit_begin; --nit)
358 it += (nit_end - nit);
365 setPosition(finder, it - begin(hayst, Standard()) + 1);
370 //____________________________________________________________________________
373 template <typename TFinder, typename TNeedle2>
375 find_horspool(TFinder & finder,
376 Pattern<TNeedle2, Horspool> & me,
380 typedef typename Haystack<TFinder>::Type THaystack;
381 THaystack & hayst = haystack(finder);
383 typedef Pattern<TNeedle2, Horspool> TPattern;
384 typedef typename Needle<TPattern>::Type TNeedle;
385 TNeedle & ndl = needle(me);
387 typename Size<TNeedle>::Type ndl_size = length(ndl);
389 typedef typename Iterator<THaystack, Standard>::Type THaystackIterator;
390 THaystackIterator haystack_end = end(hayst, Standard());
391 THaystackIterator it = begin(hayst, Standard());
392 it += position(finder) + ndl_size - 1; //it points to the last character
393 THaystackIterator it2;
395 typedef typename Iterator<TNeedle, Standard>::Type TNeedleIterator;
396 TNeedleIterator nit; //needle iterator
397 TNeedleIterator nit_begin = begin(ndl, Standard());
398 TNeedleIterator nit_end = end(ndl, Standard()) - 2; //here the verification begins
400 typedef typename Value<TNeedle>::Type TNeedleValue;
401 TNeedleValue last_needle_char = value(ndl, ndl_size - 1);
411 //scan for the last character
414 if (it >= haystack_end)
418 if (*it == last_needle_char)
427 //validate current position
428 for (nit = nit_end; nit >= nit_begin; --nit)
433 char_i = *it; //conversion to unsigned integer
434 it += me.data_map[char_i];
440 setPosition(finder, it - begin(hayst, Standard()) - ndl_size + 1);
445 template <typename TFinder, typename TNeedle2>
447 find(TFinder & finder, Pattern<TNeedle2, Horspool> & me)
450 bool find_first = empty(finder);
454 _finderSetNonEmpty(finder);
457 SEQAN_ASSERT(length(needle(me)) > 0)
459 return find_horspool(finder, me, find_first);
462 //////////////////////////////////////////////////////////////////////////////
464 //////////////////////////////////////////////////////////////////////////////
466 template <typename TNeedle>
467 struct Host< Pattern<TNeedle, Horspool> >
469 typedef TNeedle Type;
472 template <typename TNeedle>
473 struct Host< Pattern<TNeedle, Horspool> const>
475 typedef TNeedle const Type;
479 //////////////////////////////////////////////////////////////////////////////
481 }// namespace SEQAN_NAMESPACE_MAIN
483 #endif //#ifndef SEQAN_HEADER_...