Imported Upstream version 0.12.7
[bowtie.git] / SeqAn-1.1 / seqan / index / shape_base.h
1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de 
4  ============================================================================
5   Copyright (C) 2007
6
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17  ============================================================================
18   $Id: shape_base.h,v 1.1 2008/08/25 16:20:05 langmead Exp $
19  ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_SHAPE_BASE_H
22 #define SEQAN_HEADER_SHAPE_BASE_H
23
24 namespace SEQAN_NAMESPACE_MAIN
25 {
26
27         template <unsigned q>
28         struct FixedShape {};
29         typedef FixedShape<0> SimpleShape;
30
31         template <typename TSpec>
32         struct FixedGappedShape {};
33         typedef FixedGappedShape<Default> GappedShape;
34
35
36 /**
37 .Class.Shape:
38 ..cat:Index
39 ..summary:Stores hash value and shape for an ungapped or gapped q-gram.
40 ..signature:Shape<TValue, TSpec>
41 ..param.TValue:The @Metafunction.Value@ type of the string the shape is applied to (e.g. $Dna$).
42 ..param.TSpec:The specializing type.
43 ...default:@Spec.SimpleShape@, for ungapped q-grams.
44 ..remarks:The @Metafunction.ValueSize@ of Shape is the ValueSize of TValue which is the alphabet size.
45 ..remarks:To get the span or the weight of a shape call @Function.length@ or @Function.weight@.
46 .Memfunc.Shape#Shape:
47 ..class:Class.Shape
48 ..summary:Constructor
49 ..signature:Shape<TValue, TSpec> ()
50 ..signature:Shape<TValue, TSpec> (shape)
51 ..param.shape:Other Shape object. (copy constructor)
52 */
53         template <typename TValue = Dna, typename TSpec = SimpleShape>
54         class Shape;
55
56 //////////////////////////////////////////////////////////////////////////////
57
58 ///.Metafunction.Value.param.T.type:Class.Shape
59         template <typename TValue, typename TSpec>
60         struct Value<Shape<TValue,TSpec> >
61         {
62                 typedef unsigned Type;
63         };
64
65 ///.Metafunction.Size.param.T.type:Class.Shape
66         template <typename TValue, typename TSpec>
67         struct Size<Shape<TValue,TSpec> >
68         {
69                 typedef unsigned Type;
70         };
71
72 ///.Metafunction.LENGTH.param.T.type:Class.Shape
73     template <typename TValue, unsigned q>
74         struct LENGTH< Shape<TValue, FixedShape<q> > >
75         {
76                 enum { VALUE = q };
77         };
78
79 ///.Metafunction.WEIGHT.param.T.type:Class.Shape
80     template <typename TValue, unsigned q>
81         struct WEIGHT< Shape<TValue, FixedShape<q> > >
82         {
83                 enum { VALUE = q };
84         };
85
86 ///.Metafunction.ValueSize.param.T.type:Class.Shape
87         template <typename TValue, typename TSpec>
88         struct ValueSize< Shape<TValue, TSpec> > {
89                 enum { VALUE = Power<
90                                                 ValueSize<TValue>::VALUE, 
91                                                 WEIGHT< Shape<TValue, TSpec> >::VALUE >::VALUE };
92         };
93
94
95 //////////////////////////////////////////////////////////////////////////////
96
97 /**
98 .Spec.SimpleShape:
99 ..cat:Index
100 ..summary:A variable length ungapped shape (also called q-gram or k-mer).
101 ..general:Class.Shape
102 ..signature:Shape<TValue, SimpleShape>
103 ..param.TValue:The @Metafunction.Value@ type of the string the shape is applied to (e.g. $Dna$).
104 ..remarks:A SimpleShape must be resized first to a valid length. To do so, call @Function.resize@.
105 ..see:Spec.FixedShape
106 */
107
108         //////////////////////////////////////////////////////////////////////////////
109         // ungapped shape with variable length
110         //////////////////////////////////////////////////////////////////////////////
111
112         template <typename TValue>
113         class Shape<TValue, SimpleShape>
114         {
115         public:
116 //____________________________________________________________________________
117
118                 unsigned                                        span;
119                 typename Value<Shape>::Type     hValue;
120                 typename Value<Shape>::Type     XValue;
121                 typename Value<Shape>::Type     leftFactor;
122                 typename Value<Shape>::Type     leftFactor2;
123                 TValue                                          leftChar;
124 //____________________________________________________________________________
125                 
126 /**
127 .Memfunc.SimpleShape#Shape:
128 ..class:Spec.SimpleShape
129 ..summary:Constructor
130 ..signature:Shape<TValue, SimpleShape> ()
131 ..signature:Shape<TValue, SimpleShape> (shape)
132 ..signature:Shape<TValue, SimpleShape> (q)
133 ..param.shape:Other Shape object. (copy constructor)
134 ..param.q:Length of the ungapped q-gram.
135 */
136                 Shape() {}
137
138                 Shape(unsigned _span)
139                 {
140                         resize(*this, _span);
141                 }
142
143                 template <unsigned q>
144                 Shape(Shape<TValue, FixedShape<q> > const &other)
145                 {
146                         *this = other;
147                 }       
148
149 //____________________________________________________________________________
150
151                 template <unsigned q>
152                 inline Shape &
153                 operator=(Shape<TValue, FixedShape<q> > const &other)
154                 {
155                         span = other.span;
156                         hValue = other.hValue;
157                         XValue = other.XValue;
158                         leftFactor = other.leftFactor;
159                         leftFactor2 = other.leftFactor2;
160                         leftChar = other.leftChar;
161                         return *this;
162                 }
163         };
164
165         //////////////////////////////////////////////////////////////////////////////
166         // ungapped shape with fixed length q
167         //////////////////////////////////////////////////////////////////////////////
168
169 /**
170 .Spec.FixedShape:
171 ..cat:Index
172 ..summary:A fixed length ungapped shape (also called q-gram or k-mer).
173 ..general:Class.Shape
174 ..signature:Shape<TValue, FixedShape<q> >
175 ..param.TValue:The @Metafunction.Value@ type of the sequence the shape is applied to (e.g. $Dna$).
176 ..param.q:The length of the shape.
177 */
178
179         template <typename TValue, unsigned q>
180         class Shape<TValue, FixedShape<q> >
181         {
182         public:
183 //____________________________________________________________________________
184
185                 enum { span = q };
186                 enum { leftFactor = Power<ValueSize<TValue>::VALUE, q - 1>::VALUE };
187                 enum { leftFactor2 = (Power<ValueSize<TValue>::VALUE, q>::VALUE - 1) / (ValueSize<TValue>::VALUE - 1) };
188                 // Sigma^(q-1) + Sigma^(q-2) + ... + Sigma + 1
189
190                 typename Value<Shape>::Type     hValue;         // current hash value
191                 typename Value<Shape>::Type     XValue;         // Sum_{i=0..q-1} (x_i + 1)
192                 TValue                                          leftChar;       // left-most character
193 //____________________________________________________________________________
194                 
195         };
196
197
198
199 //////////////////////////////////////////////////////////////////////////////
200
201 ///.Function.value.param.object.type:Class.Shape
202         template <typename TValue, typename TSpec>
203         inline typename Value< Shape<TValue, TSpec> >::Type
204         value(Shape<TValue, TSpec> &me)
205         {
206                 return me.hValue;
207         }
208
209 //____________________________________________________________________________
210
211 ///.Function.length.param.object.type:Class.Shape
212         template <typename TValue, typename TSpec>
213         inline typename Size< Shape<TValue, TSpec> >::Type
214         length(Shape<TValue, TSpec> const &me)
215         {
216         SEQAN_CHECKPOINT
217                 return me.span;
218         }
219
220 //____________________________________________________________________________
221
222 /**.Function.weight:
223 ..cat:Index
224 ..summary:Number of relevant positions in a shape.
225 ..signature:weight(shape)
226 ..param.shape:Shape object for which the number of relevant positions is determined.
227 ...type:Class.Shape
228 ..returns:Number of relevant positions.
229 ..remarks.text:For ungapped shapes the return value is the result of the @Function.length@ function.
230 For gapped shapes this is the number of '1's.
231 */
232         template <typename TValue, typename TSpec>
233         inline typename Size< Shape<TValue, TSpec> >::Type
234         weight(Shape<TValue, TSpec> const &me)
235         {
236         SEQAN_CHECKPOINT
237                 return length(me);
238         }
239
240 //____________________________________________________________________________
241
242 ///.Function.resize.param.object.type:Spec.SimpleShape
243         template <typename TValue, typename TSize>
244         inline typename Size< Shape<TValue, SimpleShape> >::Type
245         resize(Shape<TValue, SimpleShape> & me, TSize new_length)
246         {
247         SEQAN_CHECKPOINT
248                 me.leftFactor = _intPow((unsigned)ValueSize<TValue>::VALUE, new_length - 1);
249                 me.leftFactor2 = (_intPow((unsigned)ValueSize<TValue>::VALUE, new_length) - 1) / (ValueSize<TValue>::VALUE - 1);
250                 return me.span = new_length;
251         }
252
253 //____________________________________________________________________________
254
255 /**.Function.hash:
256 ..cat:Index
257 ..summary:Computes a (lower) hash value for a shape applied to a sequence.
258 ..signature:hash(shape, it)
259 ..signature:hash(shape, it, charsLeft)
260 ..param.shape:Shape to be used for hashing.
261 ...type:Class.Shape
262 ..param.it:Sequence iterator pointing to the first character of the shape.
263 ..param.charsLeft:The distance of $it$ to the string end. 
264 If $charsLeft$ is smaller than the shape's span, the hash value corresponds to the smallest shape beginning with $charsLeft$ characters.
265 ..returns:Hash value of the shape.
266 */
267
268         template <typename TValue, typename TIter>
269         typename Value< Shape<TValue, SimpleShape> >::Type
270         hash(Shape<TValue, SimpleShape> &me, TIter it)
271         {
272         SEQAN_CHECKPOINT
273                 typedef typename Value< Shape<TValue, SimpleShape> >::Type      THValue;
274                 typedef typename Size< Shape<TValue, SimpleShape> >::Type       TSize;
275
276                 me.hValue = ordValue(me.leftChar = *it);
277                 for(TSize i = 1; i < me.span; ++i) {
278                         ++it;
279                         me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
280                 }
281                 return me.hValue;
282         }
283
284 //____________________________________________________________________________
285 // fixed ungapped shapes
286
287         // loop unrolling ...
288         template <typename THValue, typename TValue, typename TIter>
289         inline THValue
290         _hashFixedShape(THValue hash, TIter &, TValue const, FixedShape<1> const) {
291                 return hash;
292         }
293
294         template <typename THValue, typename TValue, typename TIter, unsigned q>
295         inline THValue
296         _hashFixedShape(THValue hash, TIter &it, TValue const, FixedShape<q> const) {
297                 ++it;
298                 return _hashFixedShape(
299                         hash * ValueSize<TValue>::VALUE + ordValue((TValue)*it),
300                         it, TValue(), FixedShape<q - 1>());
301         }
302
303         // ... for fixed ungapped shapes
304         template <typename TValue, unsigned q, typename TIter>
305         inline typename Value< Shape<TValue, FixedShape<q> > >::Type
306         hash(Shape<TValue, FixedShape<q> > &me, TIter it)
307         {
308         SEQAN_CHECKPOINT
309                 typedef typename Value< Shape<TValue, FixedShape<q> > >::Type   THValue;
310                 typedef typename Size< Shape<TValue, FixedShape<q> > >::Type    TSize;
311
312                 me.hValue = ordValue(me.leftChar = *it);
313                 return me.hValue = _hashFixedShape(me.hValue, it, TValue(), FixedShape<q>());
314         }
315
316
317         template <typename TValue, typename TSpec, typename TIter, typename TSize>
318         inline typename Value< Shape<TValue, TSpec> >::Type
319         hash(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
320         {
321         SEQAN_CHECKPOINT
322                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
323
324                 TSize iEnd = me.span;
325                 if (iEnd > charsLeft) iEnd = charsLeft;
326
327                 TSize i = 0;
328                 if (iEnd > 0) {
329                         me.hValue = ordValue(me.leftChar = *it);
330                         for(i = 1; i < iEnd; ++i) {
331                                 ++it;
332                                 me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
333                         }
334                 } else
335                         return me.hValue = 0;
336
337                 // fill shape with zeros
338                 for(; i < (TSize)me.span; ++i)
339                         me.hValue *= ValueSize<TValue>::VALUE;
340                 return me.hValue;
341         }
342
343 //____________________________________________________________________________
344 // Tuple -> fixed ungapped shapes
345
346         template <typename THValue, typename TValue, typename TTValue, unsigned SIZE, typename TCompressed>
347         inline THValue
348         _hashTuple2FixedShape(
349                 THValue const, 
350                 Tuple<TTValue, SIZE, TCompressed> const &tuple,
351                 TValue const,
352                 FixedShape<1> const) 
353         {
354                 return ordValue(tuple[0]);
355         }
356
357         template <typename THValue, typename TValue, typename TTValue, unsigned SIZE, typename TCompressed, unsigned q>
358         inline THValue
359         _hashTuple2FixedShape(
360                 THValue const, 
361                 Tuple<TTValue, SIZE, TCompressed> const &tuple,
362                 TValue const,
363                 FixedShape<q> const) 
364         {
365                 return _hashTuple2FixedShape(THValue(), tuple, TValue(), FixedShape<q - 1>()) 
366                         * ValueSize<TValue>::VALUE + ordValue(tuple[q-1]);
367         }
368
369         // ... for fixed ungapped shapes
370         template <
371                 typename TValue,
372                 typename TTValue, 
373                 unsigned SIZE, 
374                 unsigned q>
375         typename Value< Shape<TValue, FixedShape<q> > >::Type
376         hash(
377                 Shape<TValue, FixedShape<q> > &me, 
378                 Tuple<TTValue, SIZE, Compressed> const &tuple)
379         {
380         SEQAN_CHECKPOINT
381                 if (ValueSize<TValue>::VALUE == (1 << BitsPerValue<TTValue>::VALUE))
382                         if (q == SIZE)
383                                 return tuple.i;
384                         else
385                                 return tuple >> (q - SIZE);
386                 else
387                         return me.hValue = _hashTuple2FixedShape(me.hValue, tuple, TValue(), FixedShape<q>());
388         }
389
390         template <
391                 typename TValue,
392                 typename TTValue, 
393                 unsigned SIZE, 
394                 typename TCompressed, 
395                 unsigned q>
396         typename Value< Shape<TValue, FixedShape<q> > >::Type
397         hash(
398                 Shape<TValue, FixedShape<q> > &me, 
399                 Tuple<TTValue, SIZE, TCompressed> const &tuple)
400         {
401         SEQAN_CHECKPOINT
402                 return me.hValue = _hashTuple2FixedShape(me.hValue, tuple, TValue(), FixedShape<q>());
403         }
404
405 //____________________________________________________________________________
406
407 /**.Function.hashUpper:
408 ..cat:Index
409 ..summary:Computes an upper hash value for a shape applied to a sequence.
410 ..signature:hashUpper(shape, it, charsLeft)
411 ..param.shape:Shape to be used for hashing.
412 ...type:Class.Shape
413 ..param.it:Sequence iterator pointing to the first character of the shape.
414 ..param.charsLeft:The distance of $it$ to the string end. 
415 If $charsLeft$ is smaller than the shape's span, the hash value corresponds to the biggest shape beginning with $charsLeft$ characters + 1.
416 ..returns:Upper hash value of the shape.
417 */
418
419         template <typename TValue, typename TSpec, typename TIter, typename TSize>
420         inline typename Value< Shape<TValue, TSpec> >::Type
421         hashUpper(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
422         {
423         SEQAN_CHECKPOINT
424                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
425
426                 TSize iEnd = me.span;
427                 if (iEnd > charsLeft) iEnd = charsLeft;
428
429                 TSize i = 0;
430                 if (iEnd > 0) {
431                         me.hValue = ordValue(me.leftChar = *it);
432                         for(i = 1; i < iEnd; ++i) {
433                                 ++it;
434                                 me.hValue = me.hValue * ValueSize<TValue>::VALUE + ordValue((TValue)*it);
435                         }
436                         ++me.hValue;
437                 } else
438                         me.hValue = 1;
439
440                 // fill shape with zeros
441                 for(; i < (TSize)me.span; ++i)
442                         me.hValue *= ValueSize<TValue>::VALUE;
443                 return me.hValue;
444         }
445
446 //____________________________________________________________________________
447
448 /**
449 .Function.hashNext:
450 ..cat:Index
451 ..summary:Computes the hash value for the adjacent shape.
452 ..signature:hashNext(shape, it)
453 ..param.shape:Shape to be used for hashing.
454 ...type:Class.Shape
455 ..param.it:Sequence iterator pointing to the first character of the adjacent shape.
456 ..returns:Hash value of the q-gram.
457 ..remarks:@Function.hash@ has to be called before.
458 */
459
460         template <typename TValue, typename TSpec, typename TIter>
461         inline typename Value< Shape<TValue, TSpec> >::Type
462         hashNext(Shape<TValue, TSpec> &me, TIter &it)
463         {
464         SEQAN_CHECKPOINT
465                 // remove first, shift left, and add next character
466                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
467                 me.hValue = 
468                         (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor) * ValueSize<TValue>::VALUE
469                         + ordValue((TValue)*(it + (THValue)me.span - 1));
470                 me.leftChar = *it;
471                 return me.hValue;
472         }
473
474 //____________________________________________________________________________
475
476 /**.Function.hash2:
477 ..cat:Index
478 ..summary:Computes a unique hash value of a shape, even if it is shorter than its span.
479 ..signature:hash2(shape, it, charsLeft)
480 ..param.shape:Shape to be used for hashing.
481 ...type:Class.Shape
482 ..param.it:Sequence iterator pointing to the first character of the shape.
483 ..param.charsLeft:The distance of $it$ to the string end. 
484 ..returns:Hash value of the shape.
485 */
486
487         template <typename TValue, typename TSpec, typename TIter, typename TSize>
488         inline typename Value< Shape<TValue, TSpec> >::Type
489         hash2(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
490         {
491         SEQAN_CHECKPOINT
492                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
493
494                 TSize iEnd = me.span;
495                 if (iEnd > charsLeft) iEnd = charsLeft;
496
497                 TSize i = 0;
498                 if (iEnd > 0) {
499                         me.hValue = me.XValue = ordValue(me.leftChar = *it);
500                         for(i = 1; i < iEnd; ++i) {
501                                 ++it;
502                                 // update sum of x_i
503                                 me.XValue += ordValue((TValue)*it);
504                                 // shift hash
505                                 me.hValue = me.hValue * ValueSize<TValue>::VALUE + me.XValue;
506                         }
507                 } else
508                         return me.hValue = me.XValue = 0;
509
510                 // fill shape with zeros
511                 for(; i < (TSize)me.span; ++i)
512                         me.hValue = me.hValue * ValueSize<TValue>::VALUE + me.XValue;
513                 return me.hValue += iEnd;
514         }
515
516         template <typename TValue, typename TSpec, typename TIter, typename TSize>
517         inline typename Value< Shape<TValue, TSpec> >::Type
518         hash2Upper(Shape<TValue, TSpec> &me, TIter it, TSize charsLeft)
519         {
520         SEQAN_CHECKPOINT
521                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
522
523                 TSize iEnd = me.span;
524                 if (iEnd > charsLeft) iEnd = charsLeft;
525
526                 THValue hValue, XValue;
527                 TSize i = 0;
528                 if (iEnd > 0) {
529                         hValue = XValue = ordValue((TValue)*it);
530                         for(i = 1; i < iEnd; ++i) {
531                                 ++it;
532                                 // update sum of x_i
533                                 XValue += ordValue((TValue)*it);
534                                 // shift hash
535                                 hValue = hValue * ValueSize<TValue>::VALUE + XValue;
536                         }
537                 } else
538                         hValue = XValue = 0;
539
540                 if (charsLeft <= me.span) {
541                         ++XValue;
542                         ++hValue;
543                 }
544
545                 // fill shape with zeros
546                 for(; i < (TSize)me.span; ++i)
547                         hValue = hValue * ValueSize<TValue>::VALUE + XValue;
548                 return hValue += iEnd;
549         }
550
551 //____________________________________________________________________________
552
553 /**
554 .Function.hash2Next:
555 ..cat:Index
556 ..summary:Computes a unique hash value for the adjacent shape, even if it is shorter than q.
557 ..signature:hash2Next(shape, it)
558 ..param.shape:Shape to be used for hashing the q-gram.
559 ...type:Class.Shape
560 ..param.it:Sequence iterator pointing to the first character of the adjacent shape.
561 ..returns:Hash value of the shape.
562 ..remarks:@Function.hash@ has to be called before with $shape$ on the left adjacent q-gram.
563 */
564
565         template <typename TValue, typename TSpec, typename TIter, typename TSize>
566         inline typename Value< Shape<TValue, TSpec> >::Type
567         hash2Next(Shape<TValue, TSpec> &me, TIter &it, TSize charsLeft)
568         {
569         SEQAN_CHECKPOINT
570                 // remove first, shift left, and add next character
571                 typedef typename Value< Shape<TValue, TSpec> >::Type    THValue;
572
573                 if (charsLeft >= me.span) {
574                         // update sum of x_i
575                         me.XValue = me.XValue + ordValue((TValue)*(it + me.span - 1)) - ordValue(me.leftChar);
576                         // shift hash
577                         me.hValue = (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor2) * ValueSize<TValue>::VALUE + me.XValue
578                                                 - me.span * (ValueSize<TValue>::VALUE - 1);
579                 } else {
580                         // update sum of x_i
581                         me.XValue -= ordValue(me.leftChar);
582                         // shift hash
583                         me.hValue = (me.hValue - ordValue(me.leftChar) * (THValue)me.leftFactor2) * ValueSize<TValue>::VALUE + me.XValue
584                                         - charsLeft * (ValueSize<TValue>::VALUE - 1) - ValueSize<TValue>::VALUE;
585                 }
586
587                 me.leftChar = *it;
588                 return me.hValue;
589         }
590
591         template <typename TString, typename THash>
592         inline void unhash(TString &result, THash hash, unsigned q)
593         {
594         SEQAN_CHECKPOINT
595                 typedef typename Value<TString>::Type   TValue;
596
597                 resize(result, q);
598                 for (unsigned i = q; i > 0; ) 
599                 {
600                         result[--i] = (TValue)(hash % ValueSize<TValue>::VALUE);
601                         hash /= ValueSize<TValue>::VALUE;
602                 }
603         }
604
605 }       // namespace seqan
606
607 #endif