Imported Upstream version 0.12.7
[bowtie.git] / filebuf.h
1 /*
2  * filebuf.h
3  *
4  *      Author: Ben Langmead
5  */
6 #ifndef FILEBUF_H_
7 #define FILEBUF_H_
8
#include <iostream>
#include <fstream>
#include <string>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdexcept>
#include "assert_helpers.h"
17
/**
 * Simple wrapper for a FILE*, istream or ifstream that reads it in
 * chunks (with fread or istream::read) and keeps the current chunk in
 * an internal buffer.  Calls to get(), peek() and gets() are serviced
 * from that buffer; additional chunks are read in when necessary.
 *
 * Every character handed out by get() is also appended to a "last-N"
 * side buffer (at most LASTN_BUF_SZ characters since the last call to
 * resetLastN()), so callers can recover the raw text just consumed.
 *
 * End of input is reported as -1 by all character-returning methods.
 * The FileBuf does not own the underlying stream object; close() only
 * closes it on the caller's behalf where that is possible.
 */
class FileBuf {
public:
	/// Capacity of the last-N-chars buffer.
	static const size_t LASTN_BUF_SZ = 8 * 1024;

	/**
	 * Create a buffer with no input attached; call newFile() before
	 * reading from it.
	 */
	FileBuf() {
		init();
	}

	/**
	 * Create a buffer reading from a C-style file.
	 */
	FileBuf(FILE *in) {
		init();
		_in = in;
		assert(_in != NULL);
	}

	/**
	 * Create a buffer reading from an ifstream.
	 */
	FileBuf(std::ifstream *inf) {
		init();
		_inf = inf;
		assert(_inf != NULL);
	}

	/**
	 * Create a buffer reading from a generic istream.
	 */
	FileBuf(std::istream *ins) {
		init();
		_ins = ins;
		assert(_ins != NULL);
	}

	/**
	 * Return true iff some input source is currently attached.
	 */
	bool isOpen() const {
		return _in != NULL || _inf != NULL || _ins != NULL;
	}

	/**
	 * Close the input stream (if that's possible).  A FILE* other than
	 * stdin is fclose()d and then forgotten, so that isOpen() reports
	 * false afterwards and a repeated close() cannot fclose() the same
	 * handle twice.  An ifstream is closed; a plain istream and stdin
	 * cannot be closed and are left alone.  Characters that were
	 * already buffered remain readable.
	 */
	void close() {
		if(_in != NULL && _in != stdin) {
			fclose(_in);
			_in = NULL; // never touch the dead handle again
		} else if(_inf != NULL) {
			_inf->close();
		} else {
			// can't close _ins (or stdin)
		}
	}

	/**
	 * Get the next character of input and advance.  Returns -1 once
	 * the input is exhausted.  The character is also recorded in the
	 * last-N-chars buffer if there is still room in it.
	 */
	int get() {
		int c = peek();
		if(c != -1) {
			_cur++;
			if(_lastn_cur < LASTN_BUF_SZ) _lastn_buf[_lastn_cur++] = (char)c;
		}
		return c;
	}

	/**
	 * Return true iff all input is exhausted.
	 */
	bool eof() const {
		return (_cur == _buf_sz) && _done;
	}

	/**
	 * Initialize the buffer with a new C-style file.
	 */
	void newFile(FILE *in) {
		attach(in, NULL, NULL);
	}

	/**
	 * Initialize the buffer with a new ifstream.
	 */
	void newFile(std::ifstream *inf) {
		attach(NULL, inf, NULL);
	}

	/**
	 * Initialize the buffer with a new istream.
	 */
	void newFile(std::istream *ins) {
		attach(NULL, NULL, ins);
	}

	/**
	 * Restore state as though we just started reading the input
	 * stream: seek the attached source back to its beginning and
	 * discard whatever is buffered.  The last-N-chars buffer is not
	 * affected.
	 */
	void reset() {
		if(_inf != NULL) {
			_inf->clear();
			_inf->seekg(0, std::ios::beg);
		} else if(_ins != NULL) {
			_ins->clear();
			_ins->seekg(0, std::ios::beg);
		} else {
			assert(_in != NULL);
			if(_in != NULL) rewind(_in);
		}
		invalidateBuffer();
	}

	/**
	 * Peek at the next character of the input stream without
	 * advancing.  Typically we can simply read it from the buffer.
	 * Occasionally we'll need to read in a new buffer's worth of data.
	 * Returns -1 once the input is exhausted.
	 */
	int peek() {
		assert_leq(_cur, _buf_sz);
		if(_cur == _buf_sz) {
			if(_done) {
				// We already exhausted the input stream
				return -1;
			}
			// Need another chunk, so a source must be attached
			assert(_in != NULL || _inf != NULL || _ins != NULL);
			if(_inf != NULL) {
				_inf->read((char*)_buf, BUF_SZ);
				_buf_sz = (size_t)_inf->gcount();
			} else if(_ins != NULL) {
				_ins->read((char*)_buf, BUF_SZ);
				_buf_sz = (size_t)_ins->gcount();
			} else {
				assert(_in != NULL);
				_buf_sz = fread(_buf, 1, BUF_SZ, _in);
			}
			_cur = 0;
			// A short read means the source has nothing more to give
			if(_buf_sz < BUF_SZ) _done = true;
			if(_buf_sz == 0) {
				// Exhausted, and we have nothing to return to the
				// caller
				return -1;
			}
		}
		return (int)_buf[_cur];
	}

	/**
	 * Store a line of input into 'buf' as a NUL-terminated string.
	 * Reading stops at a newline character (\r or \n), at EOF, or
	 * once len-1 characters have been stored.  When reading stops at a
	 * newline, the whole run of consecutive \r and \n characters is
	 * consumed and none of them is stored.  When reading stops because
	 * 'buf' is full, the next character is left in the stream for the
	 * following call rather than being discarded.
	 *
	 * Returns the number of characters stored, not counting the
	 * terminator.  If len is 0 nothing is read or written and 0 is
	 * returned.
	 */
	size_t gets(char *buf, size_t len) {
		if(len == 0) return 0; // no room even for the terminator
		size_t stored = 0;
		while(true) {
			int c = peek();
			if(c == -1) {
				// End-of-file
				buf[stored] = '\0';
				return stored;
			}
			if(stored == len-1 || c == '\n' || c == '\r') {
				// End of string
				buf[stored] = '\0';
				// Skip over all end-of-line characters
				while(c == '\n' || c == '\r') {
					get(); // discard
					c = peek();
				}
				// Next get() will be after all newline characters
				return stored;
			}
			get(); // consume the character we just peeked at
			buf[stored++] = (char)c;
		}
	}

	/**
	 * Read up to 'len' raw characters into 'buf'.  Newlines are
	 * treated like any other character and no terminator is written.
	 * Returns the number of characters stored, which is less than
	 * 'len' only if EOF was reached first.
	 */
	size_t get(char *buf, size_t len) {
		for(size_t i = 0; i < len; i++) {
			int c = get();
			if(c == -1) return i;
			buf[i] = (char)c;
		}
		return len;
	}

	/**
	 * Keep get()ing characters until a non-whitespace character (or
	 * -1) is reached, and return it.  The returned character has been
	 * consumed.
	 */
	int getPastWhitespace() {
		int c = get();
		while(c != -1 && isspace(c)) c = get();
		return c;
	}

	/**
	 * Keep get()ing characters until we've passed over the next
	 * string of newline characters (\r's and \n's) or -1 is reached.
	 * Returns the first character after the newlines (or -1); that
	 * character has been consumed.
	 */
	int getPastNewline() {
		int c = get();
		while(c != '\r' && c != '\n' && c != -1) c = get();
		while(c == '\r' || c == '\n') c = get();
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Despite its name this behaves exactly like getPastNewline(): the
	 * character returned (the first one after the next run of
	 * newlines, or -1) has already been consumed.  Use
	 * peekUptoNewline() to leave that character in the stream.
	 */
	int peekPastNewline() {
		return getPastNewline();
	}

	/**
	 * Consume the rest of the current line, including the run of
	 * newline characters (\r's and \n's) that ends it, and return the
	 * character that the next get() will yield (or -1) without
	 * consuming it.
	 */
	int peekUptoNewline() {
		int c = peek();
		while(c != '\r' && c != '\n' && c != -1) {
			get();
			c = peek();
		}
		while(c == '\r' || c == '\n') {
			get();
			c = peek();
		}
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Get the number of characters currently held in the last-N-chars
	 * buffer.
	 */
	size_t lastNCur() const { return _lastn_cur; }

	/**
	 * Reset to the beginning of the last-N-chars buffer.
	 */
	void resetLastN() {
		_lastn_cur = 0;
	}

	/**
	 * Copy the characters in the last-N-chars buffer (those recorded
	 * since the last reset) into the provided buffer, which must have
	 * room for lastNLen() characters.  Returns the number copied.
	 */
	size_t copyLastN(char *buf) {
		memcpy(buf, _lastn_buf, _lastn_cur);
		return _lastn_cur;
	}

	/**
	 * Get const pointer to the last-N-chars buffer.  The contents are
	 * not NUL-terminated; use lastNLen() for the length.
	 */
	const char *lastN() const {
		return _lastn_buf;
	}

	/**
	 * Get current size of the last-N-chars buffer.
	 */
	size_t lastNLen() const {
		return _lastn_cur;
	}

private:

	/**
	 * Put the object in its initial, unattached state.
	 */
	void init() {
		_in = NULL;
		_inf = NULL;
		_ins = NULL;
		invalidateBuffer();
		_lastn_cur = 0;
		// no need to clear _buf[]
	}

	/**
	 * Mark the chunk buffer as fully consumed so that the next peek()
	 * reads a fresh chunk from the source.
	 */
	void invalidateBuffer() {
		_cur = _buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Switch to a new input source (exactly one argument is expected
	 * to be non-NULL) and discard whatever is buffered.
	 */
	void attach(FILE *in, std::ifstream *inf, std::istream *ins) {
		_in = in;
		_inf = inf;
		_ins = ins;
		invalidateBuffer();
	}

	static const size_t BUF_SZ = 256 * 1024;
	FILE     *_in;
	std::ifstream *_inf;
	std::istream  *_ins;
	size_t    _cur;     // offset of the next character within _buf
	size_t    _buf_sz;  // number of valid characters in _buf
	bool      _done;    // true once the source returned a short chunk
	uint8_t   _buf[BUF_SZ]; // (large) input buffer
	size_t    _lastn_cur;   // number of characters in _lastn_buf
	char      _lastn_buf[LASTN_BUF_SZ]; // buffer of the last N chars dispensed
};
339
/**
 * Wrapper for a buffered output stream that writes bitpairs.  Four
 * 2-bit values are packed into each output byte, the first one in the
 * two least significant bits.
 */
class BitpairOutFileBuf {
public:
	/**
	 * Open a new output stream to a file with given name.  Prints a
	 * message to std::cerr and throws the int 1 if the file cannot be
	 * opened.
	 */
	BitpairOutFileBuf(const char *in) : bpPtr_(0), cur_(0) {
		assert(in != NULL);
		out_ = fopen(in, "wb");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open bitpair-output file " << in << std::endl;
			throw 1;
		}
		memset(buf_, 0, BUF_SZ);
	}

	/**
	 * Write a single bitpair (a value in [0, 3]) into the buf.  Flush
	 * the buffer if it's full.  Prints a message and throws the int 1
	 * if the flush fails.
	 */
	void write(int bp) {
		assert(out_ != NULL); // must not be used after close()
		assert_lt(bp, 4);
		assert_geq(bp, 0);
		buf_[cur_] |= (bp << bpPtr_);
		if(bpPtr_ == 6) {
			// Current octet is complete; move on to the next one
			bpPtr_ = 0;
			cur_++;
			if(cur_ == BUF_SZ) {
				// Flush the buffer
				if(!fwrite((const void *)buf_, BUF_SZ, 1, out_)) {
					std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
					throw 1;
				}
				// Reset to beginning of the buffer
				cur_ = 0;
			}
			// Initialize next octet to 0
			buf_[cur_] = 0;
		} else {
			bpPtr_ += 2;
		}
	}

	/**
	 * Write any remaining bitpairs and then close the output file.  A
	 * partially filled final octet is written with its unused high
	 * bits set to 0.  Calling close() again afterwards is a no-op.
	 * Prints a message and throws the int 1 if the final write fails;
	 * the file is closed in that case as well.
	 */
	void close() {
		if(out_ == NULL) return; // already closed
		// Complete octets, plus one more if the current one is in use
		const size_t nbytes = cur_ + (bpPtr_ > 0 ? 1 : 0);
		const bool ok =
			(nbytes == 0) ||
			fwrite((const void *)buf_, nbytes, 1, out_) == 1;
		fclose(out_);
		out_ = NULL;
		if(!ok) {
			std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
			throw 1;
		}
	}
private:
	static const size_t BUF_SZ = 128 * 1024;
	FILE    *out_;   // destination file; NULL once closed
	int      bpPtr_; // bit offset of the next bitpair within buf_[cur_]
	uint32_t cur_;   // index of the octet currently being filled
	char     buf_[BUF_SZ]; // (large) output buffer
};
405
/**
 * Wrapper for a buffered output stream that writes characters and
 * other data types.  This class is *not* synchronized; the caller is
 * responsible for synchronization.
 *
 * Errors are reported by printing a message to std::cerr and throwing
 * the int 1.  The object is not closed automatically; call close()
 * to flush buffered data and release the file.
 */
class OutFileBuf {

public:

	/**
	 * Open a new output stream to a file with given name.  The name
	 * pointer is kept (not copied) and later returned by name().
	 */
	OutFileBuf(const char *out, bool binary = false) :
		name_(out), cur_(0), closed_(false)
	{
		assert(out != NULL);
		out_ = fopen(out, binary ? "wb" : "w");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
			throw 1;
		}
	}

	/**
	 * Open a new output stream to standard out.
	 */
	OutFileBuf() : name_("cout"), out_(stdout), cur_(0), closed_(false) { }

	/**
	 * Redirect this stream to a file with given name.  Anything still
	 * buffered is first flushed to the previous destination, which is
	 * then closed (unless it is stdout), so no data is dropped and no
	 * FILE handle is leaked.  If the new file cannot be opened the
	 * stream is left in the closed state.  The name pointer is kept
	 * (not copied) and later returned by name().
	 */
	void setFile(const char *out, bool binary = false) {
		assert(out != NULL);
		close(); // no-op if the previous destination is already closed
		out_ = fopen(out, binary ? "wb" : "w");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
			throw 1;
		}
		name_ = out;
		reset();
	}

	/**
	 * Write a single character into the write buffer and, if
	 * necessary, flush.
	 */
	void write(char c) {
		assert(!closed_);
		if(cur_ == BUF_SZ) flush();
		buf_[cur_++] = c;
	}

	/**
	 * Write a c++ string to the write buffer and, if necessary, flush.
	 */
	void writeString(const std::string& s) {
		writeChars(s.data(), s.length());
	}

	/**
	 * Write 'len' characters starting at 's' to the write buffer and,
	 * if necessary, flush.  Data that would not fit in an empty buffer
	 * is written straight to the file, after flushing whatever was
	 * buffered before it so that output order is preserved.
	 */
	void writeChars(const char * s, size_t len) {
		assert(!closed_);
		if(len == 0) return; // nothing to do; 's' may be NULL
		if(cur_ + len > BUF_SZ) {
			// Doesn't fit behind what's already buffered
			if(cur_ > 0) flush();
			if(len >= BUF_SZ) {
				// Too big to be worth buffering at all
				size_t wlen = fwrite(s, 1, len, out_);
				if(wlen != len) {
					std::cerr << "Error while writing string output; " << len
					          << " characters in string, " << wlen
					          << " written" << std::endl;
					throw 1;
				}
			} else {
				assert_eq(0, cur_);
				memcpy(&buf_[cur_], s, len);
				cur_ = (uint32_t)len;
			}
		} else {
			memcpy(&buf_[cur_], s, len);
			cur_ += (uint32_t)len;
		}
		assert_leq(cur_, BUF_SZ);
	}

	/**
	 * Flush any buffered characters and then close the output file
	 * (stdout is flushed into but never closed).  Calling close() on
	 * an already-closed stream is a no-op.
	 */
	void close() {
		if(closed_) return;
		if(cur_ > 0) flush();
		closed_ = true;
		if(out_ != stdout) {
			fclose(out_);
		}
	}

	/**
	 * Reset so that the next write is as though it's the first.  Any
	 * characters still in the buffer are discarded.
	 */
	void reset() {
		cur_ = 0;
		closed_ = false;
	}

	/**
	 * Hand the buffered characters to the underlying FILE and empty
	 * the buffer.  Does nothing if the buffer is already empty (a
	 * zero-length fwrite returns 0 and must not be mistaken for a
	 * failure).
	 */
	void flush() {
		if(cur_ == 0) return;
		if(!fwrite((const void *)buf_, cur_, 1, out_)) {
			std::cerr << "Error while flushing and closing output" << std::endl;
			throw 1;
		}
		cur_ = 0;
	}

	/**
	 * Return true iff this stream is closed.
	 */
	bool closed() const {
		return closed_;
	}

	/**
	 * Return the filename ("cout" for a stream created on standard
	 * out).
	 */
	const char *name() const {
		return name_;
	}

private:

	static const size_t BUF_SZ = 16 * 1024;

	const char *name_;   // caller-owned name of the current destination
	FILE       *out_;    // current destination
	uint32_t    cur_;    // number of characters waiting in buf_
	char        buf_[BUF_SZ]; // (large) output buffer
	bool        closed_; // true once close() has completed
};
566
567 #endif /*ndef FILEBUF_H_*/