1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
4 ============================================================================
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 ============================================================================
18 $Id: shape_base.h,v 1.1 2008/08/25 16:20:05 langmead Exp $
19 ==========================================================================*/
21 #ifndef SEQAN_HEADER_SHAPE_BASE_H
22 #define SEQAN_HEADER_SHAPE_BASE_H
24 namespace SEQAN_NAMESPACE_MAIN
29 typedef FixedShape<0> SimpleShape;
31 template <typename TSpec>
32 struct FixedGappedShape {};
33 typedef FixedGappedShape<Default> GappedShape;
39 ..summary:Stores hash value and shape for an ungapped or gapped q-gram.
40 ..signature:Shape<TValue, TSpec>
41 ..param.TValue:The @Metafunction.Value@ type of the string the shape is applied to (e.g. $Dna$).
42 ..param.TSpec:The specializing type.
43 ...default:@Spec.SimpleShape@, for ungapped q-grams.
44 ..remarks:The @Metafunction.ValueSize@ of Shape is the ValueSize of TValue which is the alphabet size.
45 ..remarks:To get the span or the weight of a shape call @Function.length@ or @Function.weight@.
49 ..signature:Shape<TValue, TSpec> ()
50 ..signature:Shape<TValue, TSpec> (shape)
51 ..param.shape:Other Shape object. (copy constructor)
53 template <typename TValue = Dna, typename TSpec = SimpleShape>
56 //////////////////////////////////////////////////////////////////////////////
58 ///.Metafunction.Value.param.T.type:Class.Shape
59 template <typename TValue, typename TSpec>
60 struct Value<Shape<TValue,TSpec> >
62 typedef unsigned Type;
65 ///.Metafunction.Size.param.T.type:Class.Shape
66 template <typename TValue, typename TSpec>
67 struct Size<Shape<TValue,TSpec> >
69 typedef unsigned Type;
72 ///.Metafunction.LENGTH.param.T.type:Class.Shape
73 template <typename TValue, unsigned q>
74 struct LENGTH< Shape<TValue, FixedShape<q> > >
79 ///.Metafunction.WEIGHT.param.T.type:Class.Shape
80 template <typename TValue, unsigned q>
81 struct WEIGHT< Shape<TValue, FixedShape<q> > >
86 ///.Metafunction.ValueSize.param.T.type:Class.Shape
87 template <typename TValue, typename TSpec>
88 struct ValueSize< Shape<TValue, TSpec> > {
90 ValueSize<TValue>::VALUE,
91 WEIGHT< Shape<TValue, TSpec> >::VALUE >::VALUE };
95 //////////////////////////////////////////////////////////////////////////////
100 ..summary:A variable length ungapped shape (also called q-gram or k-mer).
101 ..general:Class.Shape
102 ..signature:Shape<TValue, SimpleShape>
103 ..param.TValue:The @Metafunction.Value@ type of the string the shape is applied to (e.g. $Dna$).
104 ..remarks:A SimpleShape must be resized first to a valid length. To do so, call @Function.resize@.
105 ..see:Spec.FixedShape
108 //////////////////////////////////////////////////////////////////////////////
109 // ungapped shape with variable length
110 //////////////////////////////////////////////////////////////////////////////
112 template <typename TValue>
113 class Shape<TValue, SimpleShape>
116 //____________________________________________________________________________
119 typename Value<Shape>::Type hValue;
120 typename Value<Shape>::Type XValue;
121 typename Value<Shape>::Type leftFactor;
122 typename Value<Shape>::Type leftFactor2;
124 //____________________________________________________________________________
127 .Memfunc.SimpleShape#Shape:
128 ..class:Spec.SimpleShape
129 ..summary:Constructor
130 ..signature:Shape<TValue, SimpleShape> ()
131 ..signature:Shape<TValue, SimpleShape> (shape)
132 ..signature:Shape<TValue, SimpleShape> (q)
133 ..param.shape:Other Shape object. (copy constructor)
134 ..param.q:Length of the ungapped q-gram.
138 Shape(unsigned _span)
140 resize(*this, _span);
143 template <unsigned q>
144 Shape(Shape<TValue, FixedShape<q> > const &other)
149 //____________________________________________________________________________
151 template <unsigned q>
153 operator=(Shape<TValue, FixedShape<q> > const &other)
156 hValue = other.hValue;
157 XValue = other.XValue;
158 leftFactor = other.leftFactor;
159 leftFactor2 = other.leftFactor2;
160 leftChar = other.leftChar;
165 //////////////////////////////////////////////////////////////////////////////
166 // ungapped shape with fixed length q
167 //////////////////////////////////////////////////////////////////////////////
172 ..summary:A fixed length ungapped shape (also called q-gram or k-mer).
173 ..general:Class.Shape
174 ..signature:Shape<TValue, FixedShape<q> >
175 ..param.TValue:The @Metafunction.Value@ type of the sequence the shape is applied to (e.g. $Dna$).
176 ..param.q:The length of the shape.
179 template <typename TValue, unsigned q>
180 class Shape<TValue, FixedShape<q> >
183 //____________________________________________________________________________
186 enum { leftFactor = Power<ValueSize<TValue>::VALUE, q - 1>::VALUE };
187 enum { leftFactor2 = (Power<ValueSize<TValue>::VALUE, q>::VALUE - 1) / (ValueSize<TValue>::VALUE - 1) };
188 // Sigma^(q-1) + Sigma^(q-2) + ... + Sigma + 1
190 typename Value<Shape>::Type hValue; // current hash value
191 typename Value<Shape>::Type XValue; // Sum_{i=0..q-1} (x_i + 1)
192 TValue leftChar; // left-most character
193 //____________________________________________________________________________
199 //////////////////////////////////////////////////////////////////////////////
201 ///.Function.value.param.object.type:Class.Shape
202 template <typename TValue, typename TSpec>
203 inline typename Value< Shape<TValue, TSpec> >::Type
204 value(Shape<TValue, TSpec> &me)
209 //____________________________________________________________________________
211 ///.Function.length.param.object.type:Class.Shape
212 template <typename TValue, typename TSpec>
213 inline typename Size< Shape<TValue, TSpec> >::Type
214 length(Shape<TValue, TSpec> const &me)
220 //____________________________________________________________________________
224 ..summary:Number of relevant positions in a shape.
225 ..signature:weight(shape)
226 ..param.shape:Shape object for which the number of relevant positions is determined.
228 ..returns:Number of relevant positions.
229 ..remarks.text:For ungapped shapes the return value is the result of the @Function.length@ function.
230 For gapped shapes this is the number of '1's.
232 template <typename TValue, typename TSpec>
233 inline typename Size< Shape<TValue, TSpec> >::Type
234 weight(Shape<TValue, TSpec> const &me)
240 //____________________________________________________________________________
242 ///.Function.resize.param.object.type:Spec.SimpleShape
243 template <typename TValue, typename TSize>
244 inline typename Size< Shape<TValue, SimpleShape> >::Type
245 resize(Shape<TValue, SimpleShape> & me, TSize new_length)
248 me.leftFactor = _intPow((unsigned)ValueSize<TValue>::VALUE, new_length - 1);
249 me.leftFactor2 = (_intPow((unsigned)ValueSize<TValue>::VALUE, new_length) - 1) / (ValueSize<TValue>::VALUE - 1);
250 return me.span = new_length;
253 //____________________________________________________________________________
257 ..summary:Computes a (lower) hash value for a shape applied to a sequence.
258 ..signature:hash(shape, it)
259 ..signature:hash(shape, it, charsLeft)
260 ..param.shape:Shape to be used for hashing.
262 ..param.it:Sequence iterator pointing to the first character of the shape.
263 ..param.charsLeft:The distance of $it$ to the string end.
264 If $charsLeft$ is smaller than the shape's span, the hash value corresponds to the smallest shape beginning with $charsLeft$ characters.
265 ..returns:Hash value of the shape.
268 template <typename TValue, typename TIter>
269 typename Value< Shape<TValue, SimpleShape> >::Type
270 hash(Shape<TValue, SimpleShape> &me, TIter it)
273 typedef typename Value< Shape<TValue, SimpleShape> >::Type THValue;
274 typedef typename Size< Shape<TValue, SimpleShape> >::Type TSize;
276 me.hValue = ordValue(me.leftChar = *it);
277 for(TSize i = 1; i < me.span; ++i) {
279 me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
284 //____________________________________________________________________________
285 // fixed ungapped shapes
287 // loop unrolling ...
288 template <typename THValue, typename TValue, typename TIter>
290 _hashFixedShape(THValue hash, TIter &, TValue const, FixedShape<1> const) {
294 template <typename THValue, typename TValue, typename TIter, unsigned q>
296 _hashFixedShape(THValue hash, TIter &it, TValue const, FixedShape<q> const) {
298 return _hashFixedShape(
299 hash * ValueSize<TValue>::VALUE + ordValue((TValue)*it),
300 it, TValue(), FixedShape<q - 1>());
303 // ... for fixed ungapped shapes
304 template <typename TValue, unsigned q, typename TIter>
305 inline typename Value< Shape<TValue, FixedShape<q> > >::Type
306 hash(Shape<TValue, FixedShape<q> > &me, TIter it)
309 typedef typename Value< Shape<TValue, FixedShape<q> > >::Type THValue;
310 typedef typename Size< Shape<TValue, FixedShape<q> > >::Type TSize;
312 me.hValue = ordValue(me.leftChar = *it);
313 return me.hValue = _hashFixedShape(me.hValue, it, TValue(), FixedShape<q>());
317 template <typename TValue, typename TSpec, typename TIter, typename TSize>
318 inline typename Value< Shape<TValue, TSpec> >::Type
319 hash(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
322 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
324 TSize iEnd = me.span;
325 if (iEnd > charsLeft) iEnd = charsLeft;
329 me.hValue = ordValue(me.leftChar = *it);
330 for(i = 1; i < iEnd; ++i) {
332 me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
335 return me.hValue = 0;
337 // fill shape with zeros
338 for(; i < (TSize)me.span; ++i)
339 me.hValue *= ValueSize<TValue>::VALUE;
343 //____________________________________________________________________________
344 // Tuple -> fixed ungapped shapes
346 template <typename THValue, typename TValue, typename TTValue, unsigned SIZE, typename TCompressed>
348 _hashTuple2FixedShape(
350 Tuple<TTValue, SIZE, TCompressed> const &tuple,
354 return ordValue(tuple[0]);
357 template <typename THValue, typename TValue, typename TTValue, unsigned SIZE, typename TCompressed, unsigned q>
359 _hashTuple2FixedShape(
361 Tuple<TTValue, SIZE, TCompressed> const &tuple,
365 return _hashTuple2FixedShape(THValue(), tuple, TValue(), FixedShape<q - 1>())
366 * ValueSize<TValue>::VALUE + ordValue(tuple[q-1]);
369 // ... for fixed ungapped shapes
375 typename Value< Shape<TValue, FixedShape<q> > >::Type
377 Shape<TValue, FixedShape<q> > &me,
378 Tuple<TTValue, SIZE, Compressed> const &tuple)
381 if (ValueSize<TValue>::VALUE == (1 << BitsPerValue<TTValue>::VALUE))
385 return tuple >> (q - SIZE);
387 return me.hValue = _hashTuple2FixedShape(me.hValue, tuple, TValue(), FixedShape<q>());
394 typename TCompressed,
396 typename Value< Shape<TValue, FixedShape<q> > >::Type
398 Shape<TValue, FixedShape<q> > &me,
399 Tuple<TTValue, SIZE, TCompressed> const &tuple)
402 return me.hValue = _hashTuple2FixedShape(me.hValue, tuple, TValue(), FixedShape<q>());
405 //____________________________________________________________________________
407 /**.Function.hashUpper:
409 ..summary:Computes an upper hash value for a shape applied to a sequence.
410 ..signature:hashUpper(shape, it, charsLeft)
411 ..param.shape:Shape to be used for hashing.
413 ..param.it:Sequence iterator pointing to the first character of the shape.
414 ..param.charsLeft:The distance of $it$ to the string end.
415 If $charsLeft$ is smaller than the shape's span, the hash value corresponds to the biggest shape beginning with $charsLeft$ characters + 1.
416 ..returns:Upper hash value of the shape.
419 template <typename TValue, typename TSpec, typename TIter, typename TSize>
420 inline typename Value< Shape<TValue, TSpec> >::Type
421 hashUpper(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
424 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
426 TSize iEnd = me.span;
427 if (iEnd > charsLeft) iEnd = charsLeft;
431 me.hValue = ordValue(me.leftChar = *it);
432 for(i = 1; i < iEnd; ++i) {
434 me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
440 // fill shape with zeros
441 for(; i < (TSize)me.span; ++i)
442 me.hValue *= ValueSize<TValue>::VALUE;
446 //____________________________________________________________________________
451 ..summary:Computes the hash value for the adjacent shape.
452 ..signature:hashNext(shape, it)
453 ..param.shape:Shape to be used for hashing.
455 ..param.it:Sequence iterator pointing to the first character of the adjacent shape.
456 ..returns:Hash value of the q-gram.
457 ..remarks:@Function.hash@ has to be called before.
460 template <typename TValue, typename TSpec, typename TIter>
461 inline typename Value< Shape<TValue, TSpec> >::Type
462 hashNext(Shape<TValue, TSpec> &me, TIter &it)
465 // remove first, shift left, and add next character
466 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
468 (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor) * ValueSize<TValue>::VALUE
469 + ordValue((TValue)*(it + (THValue)me.span - 1));
474 //____________________________________________________________________________
478 ..summary:Computes a unique hash value of a shape, even if it is shorter than its span.
479 ..signature:hash2(shape, it, charsLeft)
480 ..param.shape:Shape to be used for hashing.
482 ..param.it:Sequence iterator pointing to the first character of the shape.
483 ..param.charsLeft:The distance of $it$ to the string end.
484 ..returns:Hash value of the shape.
487 template <typename TValue, typename TSpec, typename TIter, typename TSize>
488 inline typename Value< Shape<TValue, TSpec> >::Type
489 hash2(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
492 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
494 TSize iEnd = me.span;
495 if (iEnd > charsLeft) iEnd = charsLeft;
499 me.hValue = me.XValue = ordValue(me.leftChar = *it);
500 for(i = 1; i < iEnd; ++i) {
503 me.XValue += ordValue((TValue)*it);
505 me.hValue = me.hValue * ValueSize<TValue>::VALUE + me.XValue;
508 return me.hValue = me.XValue = 0;
510 // fill shape with zeros
511 for(; i < (TSize)me.span; ++i)
512 me.hValue = me.hValue * ValueSize<TValue>::VALUE + me.XValue;
513 return me.hValue += iEnd;
516 template <typename TValue, typename TSpec, typename TIter, typename TSize>
517 inline typename Value< Shape<TValue, TSpec> >::Type
518 hash2Upper(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
521 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
523 TSize iEnd = me.span;
524 if (iEnd > charsLeft) iEnd = charsLeft;
526 THValue hValue, XValue;
529 hValue = XValue = ordValue((TValue)*it);
530 for(i = 1; i < iEnd; ++i) {
533 XValue += ordValue((TValue)*it);
535 hValue = hValue * ValueSize<TValue>::VALUE + XValue;
540 if (charsLeft <= me.span) {
545 // fill shape with zeros
546 for(; i < (TSize)me.span; ++i)
547 hValue = hValue * ValueSize<TValue>::VALUE + XValue;
548 return hValue += iEnd;
551 //____________________________________________________________________________
556 ..summary:Computes a unique hash value for the adjacent shape, even if it is shorter than q.
557 ..signature:hash2Next(shape, it)
558 ..param.shape:Shape to be used for hashing the q-gram.
560 ..param.it:Sequence iterator pointing to the first character of the adjacent shape.
561 ..returns:Hash value of the shape.
562 ..remarks:@Function.hash@ has to be called before with $shape$ on the left adjacent q-gram.
565 template <typename TValue, typename TSpec, typename TIter, typename TSize>
566 inline typename Value< Shape<TValue, TSpec> >::Type
567 hash2Next(Shape<TValue, TSpec> &me, TIter &it, TSize charsLeft)
570 // remove first, shift left, and add next character
571 typedef typename Value< Shape<TValue, TSpec> >::Type THValue;
573 if (charsLeft >= me.span) {
575 me.XValue = me.XValue + ordValue((TValue)*(it + me.span - 1)) - ordValue(me.leftChar);
577 me.hValue = (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor2) * ValueSize<TValue>::VALUE + me.XValue
578 - me.span * (ValueSize<TValue>::VALUE - 1);
581 me.XValue -= ordValue(me.leftChar);
583 me.hValue = (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor2) * ValueSize<TValue>::VALUE + me.XValue
584 - charsLeft * (ValueSize<TValue>::VALUE - 1) - ValueSize<TValue>::VALUE;
591 template <typename TString, typename THash>
592 inline void unhash(TString &result, THash hash, unsigned q)
595 typedef typename Value<TString>::Type TValue;
598 for (unsigned i = q; i > 0; )
600 result[--i] = (TValue)(hash % ValueSize<TValue>::VALUE);
601 hash /= ValueSize<TValue>::VALUE;