Imported Upstream version 0.12.7
[bowtie.git] / SeqAn-1.1 / seqan / find / find_set_horspool.h
1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de 
4  ============================================================================
5   Copyright (C) 2007
6
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17  ============================================================================
18   $Id: find_set_horspool.h,v 1.1 2008/08/25 16:20:07 langmead Exp $
19  ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H
22 #define SEQAN_HEADER_FIND_SETHORSPOOL_H
23
24 namespace SEQAN_NAMESPACE_MAIN
25 {
26
27 //////////////////////////////////////////////////////////////////////////////
28 // Set Horspool Algorithm
29 //////////////////////////////////////////////////////////////////////////////
30
31 /**
32 .Spec.SetHorspool:
33 ..summary: Multiple exact string matching using set horspool algorithm.
34 ..general:Class.Pattern
35 ..cat:Searching
36 ..signature:Pattern<TNeedle, SetHorspool>
37 ..param.TNeedle:The needle type, a string of keywords.
38 ...type:Class.String
39 ..remarks.text:The types of all keywords in the needle and the haystack have to match.
40 */
41
42 ///.Class.Pattern.param.TSpec.type:Spec.SetHorspool
43
44 struct _SetHorspool;
45 typedef Tag<_SetHorspool> SetHorspool;
46
47 //////////////////////////////////////////////////////////////////////////////
48
49 template <typename TNeedle>
50 class Pattern<TNeedle, SetHorspool> {
51 //____________________________________________________________________________
52 private:
53         Pattern(Pattern const& other);
54         Pattern const& operator=(Pattern const & other);
55
56 //____________________________________________________________________________
57 public:
58         typedef typename Size<TNeedle>::Type TSize;
59         typedef typename Value<TNeedle>::Type TValue;
60         typedef typename Value<TValue>::Type TAlphabet;
61         typedef Graph<Automaton<TAlphabet> > TGraph;
62         typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
63         
64         Holder<TNeedle> data_needle;
65         Graph<Automaton<TAlphabet> > data_reverseTrie;  // Search trie
66         String<String<TSize> > data_terminalStateMap;
67         String<TSize> data_dMap;        // Jump table
68         TSize data_lmin;
69         String<TSize> data_endPositions;        // All remaining keyword indices
70         TSize data_keywordIndex;                        // Current keyword that produced a hit
71         TSize data_needleLength;                        // Last length of needle to reposition finder
72         TVertexDescriptor data_lastState;   // Last state in the trie
73
74 //____________________________________________________________________________
75
76         Pattern() {
77         }
78
79         template <typename TNeedle2>
80         Pattern(TNeedle2 const & ndl)
81         {
82                 setHost(*this, ndl);
83         }
84
85         ~Pattern() {
86                 SEQAN_CHECKPOINT
87         }               
88 //____________________________________________________________________________
89 };
90
91 //////////////////////////////////////////////////////////////////////////////
92 // Host Metafunctions
93 //////////////////////////////////////////////////////////////////////////////
94
95 template <typename TNeedle>
96 struct Host< Pattern<TNeedle, SetHorspool> >
97 {
98         typedef TNeedle Type;
99 };
100
101 template <typename TNeedle>
102 struct Host< Pattern<TNeedle, SetHorspool> const>
103 {
104         typedef TNeedle const Type;
105 };
106
107 //////////////////////////////////////////////////////////////////////////////
108 // Functions
109 //////////////////////////////////////////////////////////////////////////////
110
111 template <typename TNeedle, typename TNeedle2>
112 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 const & needle) {
113         SEQAN_CHECKPOINT
114         typedef typename Value<TNeedle>::Type TKeyword;
115         typedef typename Size<TKeyword>::Type TSize;
116         typedef typename Value<TKeyword>::Type TAlphabet;
117
118         // clean-up
119         clear(me.data_reverseTrie);
120         clear(me.data_terminalStateMap);
121         clear(me.data_endPositions);
122         clear(me.data_dMap);
123         me.data_lmin=0;
124
125         // Create Trie
126         createTrieOnReverse(me.data_reverseTrie,me.data_terminalStateMap,needle);
127         assignRoot(me.data_reverseTrie,0);
128         setValue(me.data_needle, needle);
129
130         // Create jump map
131         TSize alphabet_size = ValueSize<TAlphabet>::VALUE;
132         resize(me.data_dMap, alphabet_size);
133         me.data_lmin = _getInfinity<TSize>();
134         typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle);
135         for(;!atEnd(it);goNext(it)) {
136                 TSize tmp = length(*it);
137                 if (tmp<me.data_lmin) me.data_lmin = tmp;
138         }
139         for(TSize i=0;i<alphabet_size;++i) {
140                 me.data_dMap[i]=me.data_lmin;
141         }
142         goBegin(it);
143         for(;!atEnd(it);goNext(it)) {
144                 for(TSize pos = 0;pos < length(*it) - 1; ++pos) {
145                         TSize ind = ordValue((TAlphabet)(*it)[pos]);    
146                         if ((length(*it)- 1 - pos) < me.data_dMap[ind]) {
147                                 me.data_dMap[ind] = (length(*it) - 1 - pos);
148                         }
149                 }
150         }
151
152         /*
153         fstream strm;
154         strm.open(TEST_PATH "my_trie.dot", ios_base::out | ios_base::trunc);
155         String<String<char> > nodeMap;
156         _createTrieNodeNames(me.data_reverseTrie, me.data_terminalStateMap, nodeMap);
157         String<String<char> > edgeMap;
158         _createEdgeNames(me.data_reverseTrie,edgeMap);
159         write(strm,me.data_reverseTrie,nodeMap,edgeMap,DotDrawing());
160         strm.close();
161         */
162 }
163
164 template <typename TNeedle, typename TNeedle2>
165 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 & needle)
166 {
167         setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
168 }
169
170 //____________________________________________________________________________
171
172
173 template <typename TNeedle>
174 inline void _patternInit (Pattern<TNeedle, SetHorspool> & me) 
175 {
176 SEQAN_CHECKPOINT
177         clear(me.data_endPositions);
178         me.data_keywordIndex = 0;
179         me.data_lastState = getRoot(me.data_reverseTrie);
180 }
181
182
183 //____________________________________________________________________________
184
185 template <typename TNeedle>
186 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type & 
187 host(Pattern<TNeedle, SetHorspool> & me)
188 {
189 SEQAN_CHECKPOINT
190         return value(me.data_needle);
191 }
192
193 template <typename TNeedle>
194 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type & 
195 host(Pattern<TNeedle, SetHorspool> const & me)
196 {
197 SEQAN_CHECKPOINT
198         return value(me.data_needle);
199 }
200
201 //____________________________________________________________________________
202
203
204 template <typename TNeedle>
205 inline typename Size<TNeedle>::Type
206 position(Pattern<TNeedle, SetHorspool> & me)
207 {
208         return me.data_keywordIndex;
209 }
210
211
212 template <typename TFinder, typename TNeedle>
213 inline bool find(TFinder & finder, Pattern<TNeedle, SetHorspool> & me) {
214         SEQAN_CHECKPOINT
215         typedef typename Value<TNeedle>::Type TKeyword;
216         typedef typename Size<TKeyword>::Type TSize;
217         typedef typename Value<TKeyword>::Type TAlphabet;
218         typedef Graph<Automaton<TAlphabet> > TGraph;
219         typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
220
221         TVertexDescriptor current = getRoot(me.data_reverseTrie); 
222
223         // Process left-over hits
224         if ((!empty(finder)) &&
225                 (!empty(me.data_endPositions))) {
226                 finder += me.data_needleLength;
227                 current = me.data_lastState;
228                 me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
229                 me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
230                 if (length(me.data_endPositions) > 1) resize(me.data_endPositions, (length(me.data_endPositions)-1));
231                 else clear(me.data_endPositions);
232                 me.data_lastState = current;
233                 finder -= me.data_needleLength;
234                 return true;
235         }
236
237         TVertexDescriptor nilVal = getNil<TVertexDescriptor>();
238         TSize j = 0;
239         if (empty(finder)) {
240                 _patternInit(me);
241                 _finderSetNonEmpty(finder);
242                 finder += me.data_lmin - 1;
243         } else {
244                 finder += me.data_needleLength;
245                 j = me.data_needleLength + 1;
246                 current = me.data_lastState;
247         }
248
249         TSize haystackLength = length(container(finder));
250         bool oldMatch = true;
251         // Do not change to !atEnd(finder) because of jump map!
252         while(position(finder) < haystackLength) {
253                 while ((position(finder)>=j) && 
254                                 (getSuccessor(me.data_reverseTrie, current, *(finder-j))!= nilVal))
255                 {
256                         me.data_endPositions = getProperty(me.data_terminalStateMap,current);
257                         if ((!oldMatch) && (!empty(me.data_endPositions))) break;
258                         current = getSuccessor(me.data_reverseTrie, current, *(finder-j));
259                         if (current == nilVal) break;
260                         ++j;
261                         oldMatch = false;
262                 }
263                 me.data_endPositions = getProperty(me.data_terminalStateMap,current);
264                 if ((!oldMatch) &&
265                         (!empty(me.data_endPositions)))
266                 {
267                         me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
268                         me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
269                         if (length(me.data_endPositions) > 1) resize(me.data_endPositions, length(me.data_endPositions)-1);
270                         else clear(me.data_endPositions);
271                         me.data_lastState = current;
272                         finder -= me.data_needleLength;
273                         return true;
274                 }
275                 oldMatch = false;
276                 TSize ind = ordValue(*finder);
277                 setPosition(finder, position(finder) + getValue(me.data_dMap, ind));
278                 j = 0;
279                 current = getRoot(me.data_reverseTrie);
280         }
281         return false;
282 }
283
284 }// namespace SEQAN_NAMESPACE_MAIN
285
286 #endif //#ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H