1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
4 ============================================================================
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 ============================================================================
18 $Id: find_multiple_shiftand.h,v 1.1 2008/08/25 16:20:07 langmead Exp $
19 ==========================================================================*/
21 #ifndef SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H
22 #define SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H
24 namespace SEQAN_NAMESPACE_MAIN
27 //////////////////////////////////////////////////////////////////////////////
29 //////////////////////////////////////////////////////////////////////////////
32 .Spec.MultipleShiftAnd:
33 ..summary: Multiple exact string matching using bit parallelism. The total size of the patterns should fit into a computer word.
34 ..general:Class.Pattern
36 ..signature:Pattern<TNeedle, MultipleShiftAnd>
37 ..param.TNeedle:The needle type, a string of keywords.
39 ..remarks.text:The types of all keywords in the needle and the haystack have to match.
42 ///.Class.Pattern.param.TSpec.type:Spec.MultipleShiftAnd
44 struct _MultipleShiftAnd;
45 typedef Tag<_MultipleShiftAnd> MultipleShiftAnd;
47 //////////////////////////////////////////////////////////////////////////////
49 template <typename TNeedle>
50 class Pattern<TNeedle, MultipleShiftAnd> {
51 //____________________________________________________________________________
53 Pattern(Pattern const& other);
54 Pattern const& operator=(Pattern const & other);
56 //____________________________________________________________________________
58 typedef unsigned int TWord;
59 typedef typename Size<TNeedle>::Type TSize;
60 Holder<TNeedle> data_needle;
61 TWord* table; // Look up table for each character in the alphabet (called B in "Navarro")
62 TWord* prefSufMatch; // Set of all the prefixes of needle that match a suffix of haystack (called D in "Navarro")
63 TWord* di; // Initialization word
64 TWord* df; // Final test word
65 TWord alphabetSize; // e.g., char --> 256
66 TWord totalLength; // Lenght of concatenated keywords
67 TWord blockCount; // #unsigned ints required to store needle
68 std::deque<Pair<TSize, TSize> > data_keyword; // All keywords that produced a hit here
69 TSize data_keywordIndex; // Last keyword index
70 TSize data_needleLength; // Last needle length
72 //____________________________________________________________________________
81 template <typename TNeedle2>
82 Pattern(TNeedle2 const & ndl)
94 deallocate(this, table, alphabetSize * blockCount);
95 deallocate(this, prefSufMatch, blockCount);
96 deallocate(this, di, blockCount);
97 deallocate(this, df, blockCount);
100 //____________________________________________________________________________
103 //////////////////////////////////////////////////////////////////////////////
104 // Host Metafunctions
105 //////////////////////////////////////////////////////////////////////////////
107 template <typename TNeedle>
108 struct Host< Pattern<TNeedle, MultipleShiftAnd> >
110 typedef TNeedle Type;
113 template <typename TNeedle>
114 struct Host< Pattern<TNeedle, MultipleShiftAnd> const>
116 typedef TNeedle const Type;
119 //////////////////////////////////////////////////////////////////////////////
121 //////////////////////////////////////////////////////////////////////////////
123 template <typename TNeedle, typename TNeedle2>
124 void setHost (Pattern<TNeedle, MultipleShiftAnd> & me, TNeedle2 const & needle) {
126 typedef unsigned int TWord;
127 typedef typename Value<TNeedle>::Type TKeyword;
128 typedef typename Value<TKeyword>::Type TAlphabet;
130 deallocate(me, me.table, me.alphabetSize * me.blockCount);
131 deallocate(me, me.di, me.blockCount);
132 deallocate(me, me.df, me.blockCount);
135 typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle);
137 for(;!atEnd(it);goNext(it)) {
138 me.totalLength += length(*it);
140 me.alphabetSize = ValueSize<TAlphabet>::VALUE;
141 if (me.totalLength<1) me.blockCount=1;
142 else me.blockCount=((me.totalLength-1) / BitsPerValue<TWord>::VALUE)+1;
144 allocate (me, me.table, me.blockCount * me.alphabetSize);
145 arrayFill (me.table, me.table + me.blockCount * me.alphabetSize, 0);
147 allocate (me, me.di, me.blockCount);
148 arrayFill (me.di, me.di + me.blockCount, 0);
150 allocate (me, me.df, me.blockCount);
151 arrayFill (me.df, me.df + me.blockCount, 0);
155 for(;!atEnd(it);goNext(it)) {
156 me.di[j / BitsPerValue<TWord>::VALUE] |= (1<<(j%BitsPerValue<TWord>::VALUE));
157 for (TWord posInKeyword = 0; posInKeyword < length(*it); ++posInKeyword) {
158 // Determine character position in array table
159 TWord index = convert<TWord>(getValue(*it,posInKeyword));
160 me.table[me.blockCount*index + j / BitsPerValue<TWord>::VALUE] |= (1<<(j%BitsPerValue<TWord>::VALUE));
163 me.df[(j - 1) / BitsPerValue<TWord>::VALUE] |= (1<<((j-1)%BitsPerValue<TWord>::VALUE));
165 setValue(me.data_needle, needle);
169 std::cout << "Alphabet size: " << me.alphabetSize << ::std::endl;
170 std::cout << "Needle length: " << me.totalLength << ::std::endl;
171 std::cout << "Block count: " << me.blockCount << ::std::endl;
174 for(;!atEnd(it);goNext(it)) {
177 std::cout << ::std::endl;
178 std::cout << "Table: " << ::std::endl;
179 for(unsigned int i=0;i<me.alphabetSize;++i) {
180 //if ((i<97) || (i>122)) continue;
181 std::cout << TAlphabet(i) << ": ";
182 for(int j=0;j<me.blockCount;++j) {
183 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
184 std::cout << ((me.table[me.blockCount*i+j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
187 std::cout << ::std::endl;
189 std::cout << "DI and DF: " << ::std::endl;
191 for(int j=0;j<me.blockCount;++j) {
192 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
193 std::cout << ((me.di[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
196 std::cout << ::std::endl;
198 for(int j=0;j<me.blockCount;++j) {
199 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
200 std::cout << ((me.df[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
203 std::cout << ::std::endl;
207 template <typename TNeedle, typename TNeedle2>
208 void setHost (Pattern<TNeedle, MultipleShiftAnd> & me, TNeedle2 & needle)
210 setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
213 //____________________________________________________________________________
216 template <typename TNeedle>
217 inline void _patternInit (Pattern<TNeedle, MultipleShiftAnd> & me)
220 typedef unsigned int TWord;
222 if (me.prefSufMatch != 0) {
223 deallocate(me, me.prefSufMatch, me.blockCount);
225 allocate (me, me.prefSufMatch, me.blockCount);
226 arrayFill (me.prefSufMatch, me.prefSufMatch + me.blockCount, 0);
227 me.data_keyword.clear();
228 me.data_keywordIndex = 0;
231 //____________________________________________________________________________
233 template <typename TNeedle>
234 inline typename Host<Pattern<TNeedle, MultipleShiftAnd>const>::Type &
235 host(Pattern<TNeedle, MultipleShiftAnd> & me)
238 return value(me.data_needle);
241 template <typename TNeedle>
242 inline typename Host<Pattern<TNeedle, MultipleShiftAnd>const>::Type &
243 host(Pattern<TNeedle, MultipleShiftAnd> const & me)
246 return value(me.data_needle);
249 //____________________________________________________________________________
252 template <typename TNeedle>
253 inline typename Size<TNeedle>::Type
254 position(Pattern<TNeedle, MultipleShiftAnd> & me)
256 return me.data_keywordIndex;
260 template <typename TFinder, typename TNeedle>
261 bool _findShiftAnd_SmallNeedle(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
263 typedef unsigned int TWord;
264 typedef typename Size<TNeedle>::Type TSize;
265 while (!atEnd(finder)) {
266 TWord pos = convert<TWord>(*finder);
267 me.prefSufMatch[0] = ((me.prefSufMatch[0] << 1) | me.di[0]) & me.table[me.blockCount*pos];
272 for(int j=0;j<me.blockCount;++j) {
273 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
274 std::cout << ((me.prefSufMatch[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
277 std::cout << ::std::endl;
280 if ((me.prefSufMatch[0] & me.df[0]) != 0) {
281 // Check which pattern has matched
282 typename Iterator<TNeedle, Rooted>::Type it = begin(value(me.data_needle));
284 for(;!atEnd(it);goNext(it)) {
286 TWord test = (1<<((j-1)%BitsPerValue<TWord>::VALUE));
290 for(int j=0;j<me.blockCount;++j) {
291 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
292 std::cout << ((test & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
295 std::cout << ::std::endl;
297 if ((me.prefSufMatch[0] & test) != 0) {
298 me.data_keyword.push_back(Pair<TSize,TSize>(position(it),length(*it)));
301 me.data_keywordIndex = (me.data_keyword.front()).i1;
302 me.data_needleLength = (me.data_keyword.front()).i2;
303 me.data_keyword.pop_front();
304 finder -= (me.data_needleLength - 1);
312 template <typename TFinder, typename TNeedle>
313 bool _findShiftAnd_LargeNeedle(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
315 typedef typename Size<TNeedle>::Type TSize;
316 typedef unsigned int TWord;
317 while (!atEnd(finder)) {
318 TWord pos = convert<TWord>(*finder);
320 for(TWord block=0;block<me.blockCount;++block) {
321 bool newCarry = ((me.prefSufMatch[block] & (1<< (BitsPerValue<TWord>::VALUE - 1)))!=0);
322 me.prefSufMatch[block]<<=1;
323 me.prefSufMatch[block]|=carry;
326 for(TWord block=0;block<me.blockCount;++block) me.prefSufMatch[block] |= me.di[block];
327 for(TWord block=0;block<me.blockCount;++block) me.prefSufMatch[block] &= me.table[me.blockCount*pos+block];
332 for(int j=0;j<me.blockCount;++j) {
333 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
334 std::cout << ((me.prefSufMatch[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
337 std::cout << ::std::endl;
341 for(TWord block=0;block<me.blockCount;++block) {
342 if ((me.prefSufMatch[block] & me.df[block]) != 0) {
348 // Check which pattern has matched
349 typename Iterator<TNeedle, Rooted>::Type it = begin(value(me.data_needle));
351 for(;!atEnd(it);goNext(it)) {
354 allocate (me, test, me.blockCount);
355 arrayFill (test, test + me.blockCount, 0);
356 test[(j - 1) / BitsPerValue<TWord>::VALUE] |= (1<<((j-1)%BitsPerValue<TWord>::VALUE));
361 for(int i=0;i<me.blockCount;++i) {
362 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
363 std::cout << ((test[i] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
366 std::cout << ::std::endl;
369 if ((me.prefSufMatch[(j - 1) / BitsPerValue<TWord>::VALUE] & test[(j - 1) / BitsPerValue<TWord>::VALUE]) != 0) {
370 me.data_keyword.push_back(Pair<TSize,TSize>(position(it),length(*it)));
372 deallocate(me, test, me.blockCount);
374 me.data_keywordIndex = (me.data_keyword.front()).i1;
375 me.data_needleLength = (me.data_keyword.front()).i2;
376 me.data_keyword.pop_front();
377 finder -= (me.data_needleLength - 1);
385 template <typename TFinder, typename TNeedle>
386 inline bool find(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
389 // Check for left-over keywords
390 if ((!empty(finder)) &&
391 (!me.data_keyword.empty())) {
392 finder += me.data_needleLength - 1;
393 me.data_keywordIndex = (me.data_keyword.front()).i1;
394 me.data_needleLength = (me.data_keyword.front()).i2;
395 me.data_keyword.pop_front();
396 finder -= (me.data_needleLength - 1);
403 _finderSetNonEmpty(finder);
405 finder += me.data_needleLength;
407 // Fast algorithm for needles < machine word?
408 if (me.blockCount == 1) {
409 return _findShiftAnd_SmallNeedle(finder, me);
411 return _findShiftAnd_LargeNeedle(finder, me);
416 }// namespace SEQAN_NAMESPACE_MAIN
418 #endif //#ifndef SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H