6 #include <seqan/sequence.h>
7 #include <seqan/file.h>
9 #include "assert_helpers.h"
12 using namespace seqan;
15 * Return a new TStr containing the reverse-complement of s. Ns go to
18 template<typename TStr>
19 static inline TStr reverseComplement(const TStr& s, bool color) {
20 typedef typename Value<TStr>::Type TVal;
22 size_t slen = length(s);
25 for(size_t i = 0; i < slen; i++) {
26 s_rc[i] = s[slen-i-1];
29 for(size_t i = 0; i < slen; i++) {
30 int sv = (int)s[slen-i-1];
34 s_rc[i] = (TVal)(sv ^ 3);
42 * Reverse-complement s in-place. Ns go to Ns.
44 template<typename TStr>
45 static inline void reverseComplementInPlace(TStr& s, bool color) {
46 typedef typename Value<TStr>::Type TVal;
51 size_t len = length(s);
53 for(i = 0; i < (len>>1); i++) {
54 int sv = (int)s[len-i-1];
59 s[i] = (TVal)(sv ^ 3);
64 s[len-i-1] = (TVal)(sf ^ 3);
67 if((len & 1) != 0 && (int)s[len >> 1] != 4) {
68 s[len >> 1] = (TVal)((int)s[len >> 1] ^ 3);
72 /// Reverse a string in-place
73 template <typename TStr>
74 static inline void reverseInPlace(TStr& s) {
75 typedef typename Value<TStr>::Type TVal;
76 size_t len = length(s);
77 for(size_t i = 0; i < (len>>1); i++) {
85 * Return the reverse-complement of s.
87 template<typename TStr>
88 static inline TStr reverseCopy(const TStr& s) {
89 typedef typename Value<TStr>::Type TVal;
91 size_t slen = length(s);
93 for(size_t i = 0; i < slen; i++) {
94 s_rc[i] = (TVal)((int)s[slen-i-1]);
100 * Return true iff the first string is dollar-less-than the second.
101 * This means that we pretend that a 'dollar sign' character,
102 * lexicographically larger than all other characters, exists at the
103 * end of both strings.
105 template <typename TStr>
107 dollarLt(const TStr& l, const TStr& r) {
108 return isPrefix(r, l) || (l < r && !isPrefix(l, r));
112 * Return true iff the first string is dollar-greater-than the second.
113 * This means that we pretend that a 'dollar sign' character,
114 * lexicographically larger than all other characters, exists at the
115 * end of both strings.
117 template <typename TStr>
119 dollarGt(const TStr& l, const TStr& r) {
120 return !dollarLt(l, r);
124 * Return a copy of the suffix of l starting at 'off'.
126 template <typename TStr>
127 static inline std::string
128 suffixStr(const TStr& l, size_t off) {
129 typedef typename Value<TStr>::Type TVal;
131 size_t len = seqan::length(l);
132 for(size_t i = off; i < len; i++) {
133 ret.push_back((char)(TVal)l[i]);
139 * Calculate the entropy of the given read. Handle Ns by charging them
140 * to the most frequent non-N character.
142 static inline float entropyDna5(const String<Dna5>& read) {
143 size_t cs[5] = {0, 0, 0, 0, 0};
144 size_t readLen = seqan::length(read);
145 for(size_t i = 0; i < readLen; i++) {
146 int c = (int)read[i];
152 // Charge the Ns to the non-N character with maximal count and
153 // then exclude them from the entropy calculation (i.e.,
154 // penalize Ns as much as possible)
155 if(cs[0] >= cs[1] && cs[0] >= cs[2] && cs[0] >= cs[3]) {
158 } else if(cs[1] >= cs[2] && cs[1] >= cs[3]) {
161 } else if(cs[2] >= cs[3]) {
170 for(int i = 0; i < 4; i++) {
172 float frac = (float)cs[i] / (float)readLen;
173 ent += (frac * log(frac));
177 assert_geq(ent, 0.0);
182 * Return the DNA complement of the given ASCII char.
184 static inline char comp(char c) {
186 case 'a': return 't';
187 case 'A': return 'T';
188 case 'c': return 'g';
189 case 'C': return 'G';
190 case 'g': return 'c';
191 case 'G': return 'C';
192 case 't': return 'a';
193 case 'T': return 'A';
198 extern uint8_t dna4Cat[];
199 extern uint8_t charToDna5[];
200 extern uint8_t asc2col[];
201 extern uint8_t rcCharToDna5[];
203 /// Convert an ascii char to a DNA category. Categories are:
205 /// 1 -> unambiguous a, c, g or t
208 extern uint8_t asc2dnacat[];
210 /// Convert an ascii char to a color category. Categories are:
212 /// 1 -> unambiguous 0, 1, 2 or 3
213 /// 2 -> ambiguous (not applicable for colors)
215 extern uint8_t asc2colcat[];
217 /// Convert a 2-bit nucleotide (and 4=N) and a color to the
218 /// corresponding 2-bit nucleotide
219 extern uint8_t nuccol2nuc[5][5];
222 * Return true iff c is an unambiguous Dna character.
224 static inline bool isUnambigDna(char c) {
225 return asc2dnacat[(int)c] == 1;
229 * Return true iff c is a Dna character.
231 static inline bool isDna(char c) {
232 return asc2dnacat[(int)c] > 0;
236 * Return true iff c is an unambiguous color character (0,1,2,3).
238 static inline bool isUnambigColor(char c) {
239 return asc2colcat[(int)c] == 1;
243 * Return true iff c is a color character.
245 static inline bool isColor(char c) {
246 return asc2colcat[(int)c] > 0;
249 /// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color
250 extern uint8_t dinuc2color[5][5];
252 #endif /*ALPHABETS_H_*/