Imported Upstream version 0.12.7
[bowtie.git] / ref_read.h
1 #ifndef REF_READ_H_
2 #define REF_READ_H_
3
4 #include <iostream>
5 #include <cassert>
6 #include <vector>
7 #include <string>
8 #include <ctype.h>
9 #include <fstream>
10 #include <stdexcept>
11 #include <seqan/sequence.h>
12 #include "alphabet.h"
13 #include "assert_helpers.h"
14 #include "filebuf.h"
15 #include "word_io.h"
16
17 using namespace std;
18 using namespace seqan;
19
20 /**
21  * Encapsulates a stretch of the reference containing only unambiguous
22  * characters.  From an ordered list of RefRecords, one can (almost)
23  * deduce the "shape" of the reference sequences (almost because we
24  * lose information about stretches of ambiguous characters at the end
25  * of reference sequences).
26  */
27 struct RefRecord {
28         RefRecord() : off(), len(), first() { }
29         RefRecord(uint32_t _off, uint32_t _len, bool _first) :
30                 off(_off), len(_len), first(_first)
31         { }
32
33         RefRecord(FILE *in, bool swap) {
34                 assert(in != NULL);
35                 if(!fread(&off, 4, 1, in)) {
36                         cerr << "Error reading RefRecord offset from FILE" << endl;
37                         throw 1;
38                 }
39                 if(swap) off = endianSwapU32(off);
40                 if(!fread(&len, 4, 1, in)) {
41                         cerr << "Error reading RefRecord offset from FILE" << endl;
42                         throw 1;
43                 }
44                 if(swap) len = endianSwapU32(len);
45                 first = fgetc(in) ? true : false;
46         }
47
48         RefRecord(int in, bool swap) {
49                 off = readU32(in, swap);
50                 len = readU32(in, swap);
51                 char c;
52                 if(!read(in, &c, 1)) {
53                         cerr << "Error reading RefRecord 'first' flag" << endl;
54                         throw 1;
55                 }
56                 first = (c ? true : false);
57         }
58
59         void write(std::ostream& out, bool be) {
60                 writeU32(out, off, be);
61                 writeU32(out, len, be);
62                 out.put(first ? 1 : 0);
63         }
64
65         uint32_t off; /// Offset of the first character in the record
66         uint32_t len; /// Length of the record
67         bool   first; /// Whether this record is the first for a reference sequence
68 };
69
70 enum {
71         REF_READ_FORWARD = 0, // don't reverse reference sequence
72         REF_READ_REVERSE,     // reverse entire reference sequence
73         REF_READ_REVERSE_EACH // reverse each unambiguous stretch of reference
74 };
75
76 /**
77  * Parameters governing treatment of references as they're read in.
78  */
79 struct RefReadInParams {
80         RefReadInParams(bool col, int r, bool nsToA, bool bisulf) :
81                 color(col), reverse(r), nsToAs(nsToA), bisulfite(bisulf) { }
82         // extract colors from reference
83         bool color;
84         // reverse each reference sequence before passing it along
85         int reverse;
86         // convert ambiguous characters to As
87         bool nsToAs;
88         // bisulfite-convert the reference
89         bool bisulfite;
90 };
91
92 extern RefRecord
93 fastaRefReadSize(
94         FileBuf& in,
95         const RefReadInParams& rparms,
96         bool first,
97         BitpairOutFileBuf* bpout = NULL);
98
99 extern std::pair<size_t, size_t>
100 fastaRefReadSizes(
101         vector<FileBuf*>& in,
102         vector<RefRecord>& recs,
103         vector<uint32_t>& plens,
104         const RefReadInParams& rparms,
105         BitpairOutFileBuf* bpout,
106         int& numSeqs);
107
108 extern void
109 reverseRefRecords(
110         const vector<RefRecord>& src,
111         vector<RefRecord>& dst,
112         bool recursive = false,
113         bool verbose = false);
114
115 /**
116  * Reads the next sequence from the given FASTA file and appends it to
117  * the end of dst, optionally reversing it.
118  */
119 template <typename TStr>
120 static RefRecord fastaRefReadAppend(FileBuf& in,
121                                     bool first,
122                                     TStr& dst,
123                                     RefReadInParams& rparms,
124                                     string* name = NULL)
125 {
126         typedef typename Value<TStr>::Type TVal;
127         int c;
128         static int lastc = '>';
129         if(first) {
130                 c = in.getPastWhitespace();
131                 if(c != '>') {
132                         cerr << "Reference file does not seem to be a FASTA file" << endl;
133                         throw 1;
134                 }
135                 lastc = c;
136         }
137         assert_neq(-1, lastc);
138
139         // RefRecord params
140         size_t len = 0;
141         size_t off = 0;
142         first = true;
143
144         size_t ilen = length(dst);
145
146         // Chew up the id line; if the next line is either
147         // another id line or a comment line, keep chewing
148         int lc = -1; // last-DNA char variable for color conversion
149         c = lastc;
150         if(c == '>' || c == '#') {
151                 do {
152                         while (c == '#') {
153                                 if((c = in.getPastNewline()) == -1) {
154                                         lastc = -1;
155                                         goto bail;
156                                 }
157                         }
158                         assert_eq('>', c);
159                         while(true) {
160                                 c = in.get();
161                                 if(c == -1) {
162                                         lastc = -1;
163                                         goto bail;
164                                 }
165                                 if(c == '\n' || c == '\r') {
166                                         while(c == '\r' || c == '\n') c = in.get();
167                                         if(c == -1) {
168                                                 lastc = -1;
169                                                 goto bail;
170                                         }
171                                         break;
172                                 }
173                                 if (name) name->push_back(c);
174                         }
175                         // c holds the first character on the line after the name
176                         // line
177                 } while (c == '>' || c == '#');
178         } else {
179                 ASSERT_ONLY(int cc = toupper(c));
180                 assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
181                 first = false;
182         }
183
184         // Skip over an initial stretch of gaps or ambiguous characters.
185         // For colorspace we skip until we see two consecutive unambiguous
186         // characters (i.e. the first unambiguous color).
187         while(true) {
188                 int cat = dna4Cat[c];
189                 if(rparms.nsToAs && cat == 2) {
190                         c = 'A';
191                 }
192                 int cc = toupper(c);
193                 if(rparms.bisulfite && cc == 'C') c = cc = 'T';
194                 if(cat == 1) {
195                         // This is a DNA character
196                         if(rparms.color) {
197                                 if(lc != -1) {
198                                         // Got two consecutive unambiguous DNAs
199                                         break; // to read-in loop
200                                 }
201                                 // Keep going; we need two consecutive unambiguous DNAs
202                                 lc = charToDna5[(int)c];
203                                 // The 'if(off > 0)' takes care of the case where
204                                 // the reference is entirely unambiguous and we don't
205                                 // want to incorrectly increment off.
206                                 if(off > 0) off++;
207                         } else {
208                                 break; // to read-in loop
209                         }
210                 } else if(cat == 2) {
211                         if(lc != -1 && off == 0) {
212                                 off++;
213                         }
214                         lc = -1;
215                         off++; // skip it
216                 } else if(c == '>') {
217                         lastc = '>';
218                         goto bail;
219                 }
220                 c = in.get();
221                 if(c == -1) {
222                         lastc = -1;
223                         goto bail;
224                 }
225         }
226         if(first && rparms.color && off > 0) {
227                 // Handle the case where the first record has ambiguous
228                 // characters but we're in color space; one of those counts is
229                 // spurious
230                 off--;
231         }
232         assert(!rparms.color || lc != -1);
233         assert_eq(1, dna4Cat[c]);
234
235         // in now points just past the first character of a sequence
236         // line, and c holds the first character
237         while(true) {
238                 // Note: can't have a comment in the middle of a sequence,
239                 // though a comment can end a sequence
240                 int cat = dna4Cat[c];
241                 assert_neq(2, cat);
242                 if(cat == 1) {
243                         // Consume it
244                         if(!rparms.color || lc != -1) len++;
245                         // Add it to referenece buffer
246                         if(rparms.color) {
247                                 appendValue(dst, (Dna)dinuc2color[charToDna5[(int)c]][lc]);
248                         } else if(!rparms.color) {
249                                 appendValue(dst, (Dna)(char)c);
250                         }
251                         assert_lt((uint8_t)(Dna)dst[length(dst)-1], 4);
252                         lc = charToDna5[(int)c];
253                 }
254                 c = in.get();
255                 if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
256                 if (c == -1 || c == '>' || c == '#' || dna4Cat[c] == 2) {
257                         lastc = c;
258                         break;
259                 }
260                 if(rparms.bisulfite && toupper(c) == 'C') c = 'T';
261         }
262
263   bail:
264         // Optionally reverse the portion that we just appended.
265         // ilen = length of buffer before this last sequence was appended.
266         if(rparms.reverse == REF_READ_REVERSE_EACH) {
267                 // Find limits of the portion we just appended
268                 size_t nlen = length(dst);
269                 assert_eq(nlen - ilen, len);
270                 if(len > 0) {
271                         size_t halfway =  ilen + (len>>1);
272                         // Reverse it in-place
273                         for(size_t i = ilen; i < halfway; i++) {
274                                 size_t diff = i-ilen;
275                                 size_t j = nlen-diff-1;
276                                 TVal tmp = dst[i];
277                                 dst[i] = dst[j];
278                                 dst[j] = tmp;
279                         }
280                 }
281         }
282         return RefRecord(off, len, first);
283 }
284
285 #endif /*ndef REF_READ_H_*/