Commit patch to not break on spaces.
[bowtie.git] / SeqAn-1.1 / seqan / find / find_multiple_shiftand.h
1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de 
4  ============================================================================
5   Copyright (C) 2007
6
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17  ============================================================================
18   $Id: find_multiple_shiftand.h,v 1.1 2008/08/25 16:20:07 langmead Exp $
19  ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H
22 #define SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H
23
24 namespace SEQAN_NAMESPACE_MAIN
25 {
26
27 //////////////////////////////////////////////////////////////////////////////
28 // Multiple ShiftAnd
29 //////////////////////////////////////////////////////////////////////////////
30
31 /**
32 .Spec.MultipleShiftAnd:
33 ..summary: Multiple exact string matching using bit parallelism. The total size of the patterns should fit into a computer word.
34 ..general:Class.Pattern
35 ..cat:Searching
36 ..signature:Pattern<TNeedle, MultipleShiftAnd>
37 ..param.TNeedle:The needle type, a string of keywords.
38 ...type:Class.String
39 ..remarks.text:The types of all keywords in the needle and the haystack have to match.
40 */
41
42 ///.Class.Pattern.param.TSpec.type:Spec.MultipleShiftAnd
43
44 struct _MultipleShiftAnd;
45 typedef Tag<_MultipleShiftAnd> MultipleShiftAnd;
46
47 //////////////////////////////////////////////////////////////////////////////
48
49 template <typename TNeedle>
50 class Pattern<TNeedle, MultipleShiftAnd> {
51 //____________________________________________________________________________
52 private:
53         Pattern(Pattern const& other);
54         Pattern const& operator=(Pattern const & other);
55
56 //____________________________________________________________________________
57 public:
58         typedef unsigned int TWord;
59         typedef typename Size<TNeedle>::Type TSize;
60         Holder<TNeedle> data_needle;
61         TWord* table;                   // Look up table for each character in the alphabet (called B in "Navarro")
62         TWord* prefSufMatch;    // Set of all the prefixes of needle that match a suffix of haystack (called D in "Navarro")
63         TWord* di;                              // Initialization word
64         TWord* df;                              // Final test word
65         TWord alphabetSize;             // e.g., char --> 256
66         TWord totalLength;              // Lenght of concatenated keywords
67         TWord blockCount;               // #unsigned ints required to store needle      
68         std::deque<Pair<TSize, TSize> > data_keyword;  // All keywords that produced a hit here
69         TSize data_keywordIndex;  // Last keyword index
70         TSize data_needleLength;  // Last needle length
71
72 //____________________________________________________________________________
73
74         Pattern() {
75                 table = 0;
76                 prefSufMatch=0;
77                 di = 0;
78                 df = 0;
79         }
80
81         template <typename TNeedle2>
82         Pattern(TNeedle2 const & ndl)
83         {
84                 table = 0;
85                 prefSufMatch=0;
86                 di = 0;
87                 df = 0;
88                 setHost(*this, ndl);
89         }
90
91         ~Pattern() {
92                 SEQAN_CHECKPOINT
93                 if (table != 0) {
94                         deallocate(this, table, alphabetSize * blockCount);
95                         deallocate(this, prefSufMatch, blockCount);
96                         deallocate(this, di, blockCount);
97                         deallocate(this, df, blockCount);
98                 }
99         }               
100 //____________________________________________________________________________
101 };
102
103 //////////////////////////////////////////////////////////////////////////////
104 // Host Metafunctions
105 //////////////////////////////////////////////////////////////////////////////
106
107 template <typename TNeedle>
108 struct Host< Pattern<TNeedle, MultipleShiftAnd> >
109 {
110         typedef TNeedle Type;
111 };
112
113 template <typename TNeedle>
114 struct Host< Pattern<TNeedle, MultipleShiftAnd> const>
115 {
116         typedef TNeedle const Type;
117 };
118
119 //////////////////////////////////////////////////////////////////////////////
120 // Functions
121 //////////////////////////////////////////////////////////////////////////////
122
123 template <typename TNeedle, typename TNeedle2>
124 void setHost (Pattern<TNeedle, MultipleShiftAnd> & me, TNeedle2 const & needle) {
125         SEQAN_CHECKPOINT
126         typedef unsigned int TWord;
127         typedef typename Value<TNeedle>::Type TKeyword;
128         typedef typename Value<TKeyword>::Type TAlphabet;
129         if (me.table != 0) {
130                 deallocate(me, me.table, me.alphabetSize * me.blockCount);
131                 deallocate(me, me.di, me.blockCount);
132                 deallocate(me, me.df, me.blockCount);
133         }
134
135         typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle);
136         me.totalLength = 0;
137         for(;!atEnd(it);goNext(it)) {
138                 me.totalLength += length(*it);
139         }
140         me.alphabetSize = ValueSize<TAlphabet>::VALUE;
141         if (me.totalLength<1) me.blockCount=1;
142         else me.blockCount=((me.totalLength-1) / BitsPerValue<TWord>::VALUE)+1;
143                         
144         allocate (me, me.table, me.blockCount * me.alphabetSize);
145         arrayFill (me.table, me.table + me.blockCount * me.alphabetSize, 0);
146
147         allocate (me, me.di, me.blockCount);
148         arrayFill (me.di, me.di + me.blockCount, 0);
149
150         allocate (me, me.df, me.blockCount);
151         arrayFill (me.df, me.df + me.blockCount, 0);
152
153         goBegin(it);
154         TWord j = 0;
155         for(;!atEnd(it);goNext(it)) {
156                 me.di[j / BitsPerValue<TWord>::VALUE] |= (1<<(j%BitsPerValue<TWord>::VALUE));
157                 for (TWord posInKeyword = 0; posInKeyword < length(*it); ++posInKeyword) {
158                         // Determine character position in array table
159                         TWord index = convert<TWord>(getValue(*it,posInKeyword));
160                         me.table[me.blockCount*index + j / BitsPerValue<TWord>::VALUE] |= (1<<(j%BitsPerValue<TWord>::VALUE));
161                         ++j;
162                 }
163                 me.df[(j - 1) / BitsPerValue<TWord>::VALUE] |= (1<<((j-1)%BitsPerValue<TWord>::VALUE));
164         }
165         setValue(me.data_needle, needle);
166
167         /*
168         // Debug code
169         std::cout << "Alphabet size: " << me.alphabetSize << ::std::endl;
170         std::cout << "Needle length: " << me.totalLength << ::std::endl;
171         std::cout << "Block count: " << me.blockCount << ::std::endl;
172         std::cout << "K: ";
173         goBegin(it);
174         for(;!atEnd(it);goNext(it)) {
175                 std::cout << *it;
176         }
177         std::cout << ::std::endl;
178         std::cout << "Table: " << ::std::endl;
179         for(unsigned int i=0;i<me.alphabetSize;++i) {
180                 //if ((i<97) || (i>122)) continue;
181                 std::cout << TAlphabet(i) << ": ";
182                 for(int j=0;j<me.blockCount;++j) {
183                         for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
184                                 std::cout << ((me.table[me.blockCount*i+j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
185                         }
186                 }
187                 std::cout << ::std::endl;
188         }
189         std::cout << "DI and DF: " << ::std::endl;
190         std::cout << "I: ";
191         for(int j=0;j<me.blockCount;++j) {
192                 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
193                         std::cout << ((me.di[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
194                 }
195         }
196         std::cout << ::std::endl;
197         std::cout << "F: ";
198         for(int j=0;j<me.blockCount;++j) {
199                 for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
200                         std::cout << ((me.df[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
201                 }
202         }
203         std::cout << ::std::endl;
204         */
205 }
206
207 template <typename TNeedle, typename TNeedle2>
208 void setHost (Pattern<TNeedle, MultipleShiftAnd> & me, TNeedle2 & needle)
209 {
210         setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
211 }
212
213 //____________________________________________________________________________
214
215
216 template <typename TNeedle>
217 inline void _patternInit (Pattern<TNeedle, MultipleShiftAnd> & me) 
218 {
219 SEQAN_CHECKPOINT
220         typedef unsigned int TWord;
221
222         if (me.prefSufMatch != 0) {
223                 deallocate(me, me.prefSufMatch, me.blockCount);
224         }
225         allocate (me, me.prefSufMatch, me.blockCount);
226         arrayFill (me.prefSufMatch, me.prefSufMatch + me.blockCount, 0);
227         me.data_keyword.clear();
228         me.data_keywordIndex = 0;
229 }
230
231 //____________________________________________________________________________
232
233 template <typename TNeedle>
234 inline typename Host<Pattern<TNeedle, MultipleShiftAnd>const>::Type & 
235 host(Pattern<TNeedle, MultipleShiftAnd> & me)
236 {
237 SEQAN_CHECKPOINT
238         return value(me.data_needle);
239 }
240
241 template <typename TNeedle>
242 inline typename Host<Pattern<TNeedle, MultipleShiftAnd>const>::Type & 
243 host(Pattern<TNeedle, MultipleShiftAnd> const & me)
244 {
245 SEQAN_CHECKPOINT
246         return value(me.data_needle);
247 }
248
249 //____________________________________________________________________________
250
251
252 template <typename TNeedle>
253 inline typename Size<TNeedle>::Type
254 position(Pattern<TNeedle, MultipleShiftAnd> & me)
255 {
256         return me.data_keywordIndex;
257 }
258
259
260 template <typename TFinder, typename TNeedle>
261 bool _findShiftAnd_SmallNeedle(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
262         SEQAN_CHECKPOINT
263         typedef unsigned int TWord;
264         typedef typename Size<TNeedle>::Type TSize;
265         while (!atEnd(finder)) {
266                 TWord pos = convert<TWord>(*finder);
267                 me.prefSufMatch[0] = ((me.prefSufMatch[0] << 1) | me.di[0]) & me.table[me.blockCount*pos];
268
269                 /*
270                 // Debug code
271                 std::cout << "   ";
272                 for(int j=0;j<me.blockCount;++j) {
273                         for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
274                                 std::cout << ((me.prefSufMatch[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
275                         }
276                 }
277                 std::cout << ::std::endl;
278                 */
279
280                 if ((me.prefSufMatch[0] & me.df[0]) != 0) {
281                         // Check which pattern has matched
282                         typename Iterator<TNeedle, Rooted>::Type it = begin(value(me.data_needle));
283                         TWord j = 0;
284                         for(;!atEnd(it);goNext(it)) {
285                                 j += length(*it);
286                                 TWord test = (1<<((j-1)%BitsPerValue<TWord>::VALUE));
287                                 /*
288                                 // Debug code
289                                 std::cout << "Tes";
290                                 for(int j=0;j<me.blockCount;++j) {
291                                         for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
292                                                 std::cout << ((test & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
293                                         }
294                                 }
295                                 std::cout << ::std::endl;
296                                 */
297                                 if ((me.prefSufMatch[0] & test) != 0) {
298                                         me.data_keyword.push_back(Pair<TSize,TSize>(position(it),length(*it)));
299                                 }
300                         }
301                         me.data_keywordIndex = (me.data_keyword.front()).i1;
302                         me.data_needleLength = (me.data_keyword.front()).i2;
303                         me.data_keyword.pop_front();
304                         finder -= (me.data_needleLength - 1);
305                         return true;
306                 }
307                 goNext(finder);
308         }
309         return false;
310 }
311
312 template <typename TFinder, typename TNeedle>
313 bool _findShiftAnd_LargeNeedle(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
314         SEQAN_CHECKPOINT
315         typedef typename Size<TNeedle>::Type TSize;
316         typedef unsigned int TWord;
317         while (!atEnd(finder)) {
318                 TWord pos = convert<TWord>(*finder);
319                 TWord carry = 1;
320                 for(TWord block=0;block<me.blockCount;++block) {
321                         bool newCarry = ((me.prefSufMatch[block] & (1<< (BitsPerValue<TWord>::VALUE - 1)))!=0); 
322                         me.prefSufMatch[block]<<=1;
323                         me.prefSufMatch[block]|=carry;
324                         carry = newCarry;
325                 }
326                 for(TWord block=0;block<me.blockCount;++block) me.prefSufMatch[block] |= me.di[block];
327                 for(TWord block=0;block<me.blockCount;++block) me.prefSufMatch[block] &= me.table[me.blockCount*pos+block];
328
329                 /*
330                 // Debug code
331                 std::cout << "   ";
332                 for(int j=0;j<me.blockCount;++j) {
333                         for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
334                                 std::cout << ((me.prefSufMatch[j] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
335                         }
336                 }
337                 std::cout << ::std::endl;
338                 */
339
340                 bool match = false;
341                 for(TWord block=0;block<me.blockCount;++block) {
342                         if ((me.prefSufMatch[block] & me.df[block]) != 0) {
343                                 match = true;
344                                 break;
345                         }
346                 }
347                 if (match) {
348                         // Check which pattern has matched
349                         typename Iterator<TNeedle, Rooted>::Type it = begin(value(me.data_needle));
350                         TWord j = 0;
351                         for(;!atEnd(it);goNext(it)) {
352                                 j += length(*it);
353                                 TWord* test;
354                                 allocate (me, test, me.blockCount);
355                                 arrayFill (test, test + me.blockCount, 0);
356                                 test[(j - 1) / BitsPerValue<TWord>::VALUE] |= (1<<((j-1)%BitsPerValue<TWord>::VALUE));
357
358                                 /*
359                                 // Debug code
360                                 std::cout << "Tes";
361                                 for(int i=0;i<me.blockCount;++i) {
362                                         for(int bit_pos=0;bit_pos<BitsPerValue<unsigned int>::VALUE;++bit_pos) {
363                                                 std::cout << ((test[i] & (1<<(bit_pos % BitsPerValue<unsigned int>::VALUE))) !=0);
364                                         }
365                                 }
366                                 std::cout << ::std::endl;
367                                 */
368
369                                 if ((me.prefSufMatch[(j - 1) / BitsPerValue<TWord>::VALUE] & test[(j - 1) / BitsPerValue<TWord>::VALUE]) != 0) {
370                                         me.data_keyword.push_back(Pair<TSize,TSize>(position(it),length(*it)));                 
371                                 }
372                                 deallocate(me, test, me.blockCount);
373                         }
374                         me.data_keywordIndex = (me.data_keyword.front()).i1;
375                         me.data_needleLength = (me.data_keyword.front()).i2;
376                         me.data_keyword.pop_front();
377                         finder -= (me.data_needleLength - 1);
378                         return true;
379                 }
380                 goNext(finder);
381         }
382         return false;
383 }
384
385 template <typename TFinder, typename TNeedle>
386 inline bool find(TFinder & finder, Pattern<TNeedle, MultipleShiftAnd> & me) {
387         SEQAN_CHECKPOINT
388
389         // Check for left-over keywords
390         if ((!empty(finder)) &&
391                 (!me.data_keyword.empty())) {
392                 finder += me.data_needleLength - 1;
393                 me.data_keywordIndex = (me.data_keyword.front()).i1;
394                 me.data_needleLength = (me.data_keyword.front()).i2;
395                 me.data_keyword.pop_front();
396                 finder -= (me.data_needleLength - 1);
397                 return true;
398         }
399
400
401         if (empty(finder)) {
402                 _patternInit(me);
403                 _finderSetNonEmpty(finder);
404         } else
405                 finder += me.data_needleLength;
406
407         // Fast algorithm for needles < machine word?
408         if (me.blockCount == 1) {
409                 return _findShiftAnd_SmallNeedle(finder, me);
410         } else {
411                 return _findShiftAnd_LargeNeedle(finder, me);
412         }
413         return false;
414 }
415
416 }// namespace SEQAN_NAMESPACE_MAIN
417
418 #endif //#ifndef SEQAN_HEADER_FIND_MULTIPLESHIFTAND_H