1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
4 ============================================================================
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 ============================================================================
18 $Id: find_set_horspool.h,v 1.1 2008/08/25 16:20:07 langmead Exp $
19 ==========================================================================*/
21 #ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H
22 #define SEQAN_HEADER_FIND_SETHORSPOOL_H
24 namespace SEQAN_NAMESPACE_MAIN
27 //////////////////////////////////////////////////////////////////////////////
28 // Set Horspool Algorithm
29 //////////////////////////////////////////////////////////////////////////////
33 ..summary: Multiple exact string matching using set horspool algorithm.
34 ..general:Class.Pattern
36 ..signature:Pattern<TNeedle, SetHorspool>
37 ..param.TNeedle:The needle type, a string of keywords.
39 ..remarks.text:The types of all keywords in the needle and the haystack have to match.
42 ///.Class.Pattern.param.TSpec.type:Spec.SetHorspool
45 typedef Tag<_SetHorspool> SetHorspool;
47 //////////////////////////////////////////////////////////////////////////////
49 template <typename TNeedle>
50 class Pattern<TNeedle, SetHorspool> {
51 //____________________________________________________________________________
53 Pattern(Pattern const& other);
54 Pattern const& operator=(Pattern const & other);
56 //____________________________________________________________________________
58 typedef typename Size<TNeedle>::Type TSize;
59 typedef typename Value<TNeedle>::Type TValue;
60 typedef typename Value<TValue>::Type TAlphabet;
61 typedef Graph<Automaton<TAlphabet> > TGraph;
62 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
64 Holder<TNeedle> data_needle;
65 Graph<Automaton<TAlphabet> > data_reverseTrie; // Search trie
66 String<String<TSize> > data_terminalStateMap;
67 String<TSize> data_dMap; // Jump table
69 String<TSize> data_endPositions; // All remaining keyword indices
70 TSize data_keywordIndex; // Current keyword that produced a hit
71 TSize data_needleLength; // Last length of needle to reposition finder
72 TVertexDescriptor data_lastState; // Last state in the trie
74 //____________________________________________________________________________
79 template <typename TNeedle2>
80 Pattern(TNeedle2 const & ndl)
88 //____________________________________________________________________________
91 //////////////////////////////////////////////////////////////////////////////
93 //////////////////////////////////////////////////////////////////////////////
95 template <typename TNeedle>
96 struct Host< Pattern<TNeedle, SetHorspool> >
101 template <typename TNeedle>
102 struct Host< Pattern<TNeedle, SetHorspool> const>
104 typedef TNeedle const Type;
107 //////////////////////////////////////////////////////////////////////////////
109 //////////////////////////////////////////////////////////////////////////////
111 template <typename TNeedle, typename TNeedle2>
112 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 const & needle) {
114 typedef typename Value<TNeedle>::Type TKeyword;
115 typedef typename Size<TKeyword>::Type TSize;
116 typedef typename Value<TKeyword>::Type TAlphabet;
119 clear(me.data_reverseTrie);
120 clear(me.data_terminalStateMap);
121 clear(me.data_endPositions);
126 createTrieOnReverse(me.data_reverseTrie,me.data_terminalStateMap,needle);
127 assignRoot(me.data_reverseTrie,0);
128 setValue(me.data_needle, needle);
131 TSize alphabet_size = ValueSize<TAlphabet>::VALUE;
132 resize(me.data_dMap, alphabet_size);
133 me.data_lmin = _getInfinity<TSize>();
134 typename Iterator<TNeedle2 const, Rooted>::Type it = begin(needle);
135 for(;!atEnd(it);goNext(it)) {
136 TSize tmp = length(*it);
137 if (tmp<me.data_lmin) me.data_lmin = tmp;
139 for(TSize i=0;i<alphabet_size;++i) {
140 me.data_dMap[i]=me.data_lmin;
143 for(;!atEnd(it);goNext(it)) {
144 for(TSize pos = 0;pos < length(*it) - 1; ++pos) {
145 TSize ind = ordValue((TAlphabet)(*it)[pos]);
146 if ((length(*it)- 1 - pos) < me.data_dMap[ind]) {
147 me.data_dMap[ind] = (length(*it) - 1 - pos);
154 strm.open(TEST_PATH "my_trie.dot", ios_base::out | ios_base::trunc);
155 String<String<char> > nodeMap;
156 _createTrieNodeNames(me.data_reverseTrie, me.data_terminalStateMap, nodeMap);
157 String<String<char> > edgeMap;
158 _createEdgeNames(me.data_reverseTrie,edgeMap);
159 write(strm,me.data_reverseTrie,nodeMap,edgeMap,DotDrawing());
164 template <typename TNeedle, typename TNeedle2>
165 void setHost (Pattern<TNeedle, SetHorspool> & me, TNeedle2 & needle)
167 setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
170 //____________________________________________________________________________
173 template <typename TNeedle>
174 inline void _patternInit (Pattern<TNeedle, SetHorspool> & me)
177 clear(me.data_endPositions);
178 me.data_keywordIndex = 0;
179 me.data_lastState = getRoot(me.data_reverseTrie);
183 //____________________________________________________________________________
185 template <typename TNeedle>
186 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type &
187 host(Pattern<TNeedle, SetHorspool> & me)
190 return value(me.data_needle);
193 template <typename TNeedle>
194 inline typename Host<Pattern<TNeedle, SetHorspool>const>::Type &
195 host(Pattern<TNeedle, SetHorspool> const & me)
198 return value(me.data_needle);
201 //____________________________________________________________________________
204 template <typename TNeedle>
205 inline typename Size<TNeedle>::Type
206 position(Pattern<TNeedle, SetHorspool> & me)
208 return me.data_keywordIndex;
212 template <typename TFinder, typename TNeedle>
213 inline bool find(TFinder & finder, Pattern<TNeedle, SetHorspool> & me) {
215 typedef typename Value<TNeedle>::Type TKeyword;
216 typedef typename Size<TKeyword>::Type TSize;
217 typedef typename Value<TKeyword>::Type TAlphabet;
218 typedef Graph<Automaton<TAlphabet> > TGraph;
219 typedef typename VertexDescriptor<TGraph>::Type TVertexDescriptor;
221 TVertexDescriptor current = getRoot(me.data_reverseTrie);
223 // Process left-over hits
224 if ((!empty(finder)) &&
225 (!empty(me.data_endPositions))) {
226 finder += me.data_needleLength;
227 current = me.data_lastState;
228 me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
229 me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
230 if (length(me.data_endPositions) > 1) resize(me.data_endPositions, (length(me.data_endPositions)-1));
231 else clear(me.data_endPositions);
232 me.data_lastState = current;
233 finder -= me.data_needleLength;
237 TVertexDescriptor nilVal = getNil<TVertexDescriptor>();
241 _finderSetNonEmpty(finder);
242 finder += me.data_lmin - 1;
244 finder += me.data_needleLength;
245 j = me.data_needleLength + 1;
246 current = me.data_lastState;
249 TSize haystackLength = length(container(finder));
250 bool oldMatch = true;
251 // Do not change to !atEnd(finder) because of jump map!
252 while(position(finder) < haystackLength) {
253 while ((position(finder)>=j) &&
254 (getSuccessor(me.data_reverseTrie, current, *(finder-j))!= nilVal))
256 me.data_endPositions = getProperty(me.data_terminalStateMap,current);
257 if ((!oldMatch) && (!empty(me.data_endPositions))) break;
258 current = getSuccessor(me.data_reverseTrie, current, *(finder-j));
259 if (current == nilVal) break;
263 me.data_endPositions = getProperty(me.data_terminalStateMap,current);
265 (!empty(me.data_endPositions)))
267 me.data_keywordIndex = me.data_endPositions[length(me.data_endPositions)-1];
268 me.data_needleLength = length(getValue(host(me), me.data_keywordIndex))-1;
269 if (length(me.data_endPositions) > 1) resize(me.data_endPositions, length(me.data_endPositions)-1);
270 else clear(me.data_endPositions);
271 me.data_lastState = current;
272 finder -= me.data_needleLength;
276 TSize ind = ordValue(*finder);
277 setPosition(finder, position(finder) + getValue(me.data_dMap, ind));
279 current = getRoot(me.data_reverseTrie);
284 }// namespace SEQAN_NAMESPACE_MAIN
286 #endif //#ifndef SEQAN_HEADER_FIND_SETHORSPOOL_H