Imported Upstream version 0.12.7
[bowtie.git] / reference.h
1 #ifndef REFERENCE_H_
2 #define REFERENCE_H_
3
4 #include <stdexcept>
5 #include "endian_swap.h"
6 #include "mm.h"
7 #include "shmem.h"
8 #include "timer.h"
9
10 /**
11  * Concrete reference representation that bulk-loads the reference from
12  * the bit-pair-compacted binary file and stores it in memory also in
13  * bit-pair-compacted format.  The user may request reference
14  * characters either on a per-character bases or by "stretch" using
15  * getBase(...) and getStretch(...) respectively.
16  *
17  * Most of the complexity in this class is due to the fact that we want
18  * to represent references with ambiguous (non-A/C/G/T) characters but
19  * we don't want to use more than two bits per base.  This means we
20  * need a way to encode the ambiguous stretches of the reference in a
21  * way that is external to the bitpair sequence.  To accomplish this,
22  * we use the RefRecords vector, which is stored in the .3.ebwt index
23  * file.  The bitpairs themselves are stored in the .4.ebwt index file.
24  *
25  * Once it has been loaded, a BitPairReference is read-only, and is
26  * safe for many threads to access at once.
27  */
28 class BitPairReference {
29
30 public:
31         /**
32          * Load from .3.ebwt/.4.ebwt Bowtie index files.
33          */
34         BitPairReference(const string& in,
35                          bool color,
36                          bool sanity,
37                          std::vector<string>* infiles,
38                          std::vector<String<Dna5> >* origs,
39                          bool infilesSeq,
40                          bool loadSequence, // as opposed to just records
41                          bool useMm,
42                          bool useShmem,
43                          bool mmSweep,
44                          bool verbose,
45                          bool startVerbose) :
46         buf_(NULL),
47         sanityBuf_(NULL),
48         loaded_(true),
49         sanity_(sanity),
50         useMm_(useMm),
51         useShmem_(useShmem),
52         verbose_(verbose)
53         {
54                 string s3 = in + ".3.ebwt";
55                 string s4 = in + ".4.ebwt";
56
57 #ifdef BOWTIE_MM
58                 int f3, f4;
59                 if((f3 = open(s3.c_str(), O_RDONLY)) < 0) {
60                         cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
61                         cerr << "This is most likely because your index was built with an older version" << endl
62                              << "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
63                              << "index (or download one from the Bowtie website) and try again." << endl;
64                         loaded_ = false;
65                         return;
66                 }
67                 if((f4 = open(s4.c_str(), O_RDONLY)) < 0) {
68                         cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
69                         loaded_ = false;
70                         return;
71                 }
72                 char *mmFile = NULL;
73                 if(useMm_) {
74                         if(verbose_ || startVerbose) {
75                                 cerr << "  Memory-mapping reference index file " << s4 << ": ";
76                                 logTime(cerr);
77                         }
78                         struct stat sbuf;
79                         if (stat(s4.c_str(), &sbuf) == -1) {
80                                 perror("stat");
81                                 cerr << "Error: Could not stat index file " << s4.c_str() << " prior to memory-mapping" << endl;
82                                 throw 1;
83                         }
84                         mmFile = (char*)mmap((void *)0, sbuf.st_size,
85                                              PROT_READ, MAP_SHARED, f4, 0);
86                         if(mmFile == (void *)(-1) || mmFile == NULL) {
87                                 perror("mmap");
88                                 cerr << "Error: Could not memory-map the index file " << s4.c_str() << endl;
89                                 throw 1;
90                         }
91                         if(mmSweep) {
92                                 int sum = 0;
93                                 for(off_t i = 0; i < sbuf.st_size; i += 1024) {
94                                         sum += (int) mmFile[i];
95                                 }
96                                 if(startVerbose) {
97                                         cerr << "  Swept the memory-mapped ref index file; checksum: " << sum << ": ";
98                                         logTime(cerr);
99                                 }
100                         }
101                 }
102 #else
103                 FILE *f3, *f4;
104                 if((f3 = fopen(s3.c_str(), "rb")) == NULL) {
105                         cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
106                         cerr << "This is most likely because your index was built with an older version" << endl
107                              << "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
108                              << "index (or download one from the Bowtie website) and try again." << endl;
109                         loaded_ = false;
110                         return;
111                 }
112                 if((f4 = fopen(s4.c_str(), "rb"))  == NULL) {
113                         cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
114                         loaded_ = false;
115                         return;
116                 }
117 #endif
118
119                 // Read endianness sentinel, set 'swap'
120                 uint32_t one;
121                 bool swap = false;
122                 one = readU32(f3, swap);
123                 if(one != 1) {
124                         if(useMm_) {
125                                 cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
126                                 throw 1;
127                         }
128                         assert_eq(0x1000000, one);
129                         swap = true; // have to endian swap U32s
130                 }
131
132                 // Read # records
133                 uint32_t sz;
134                 sz = readU32(f3, swap);
135                 if(sz == 0) {
136                         cerr << "Error: number of reference records is 0 in " << s3 << endl;
137                         throw 1;
138                 }
139
140                 // Read records
141                 nrefs_ = 0;
142                 nNoGapRefs_ = 0;
143
144                 // Cumulative count of all unambiguous characters on a per-
145                 // stretch 8-bit alignment (i.e. count of bytes we need to
146                 // allocate in buf_)
147                 uint32_t cumsz = 0;
148                 uint32_t cumlen = 0;
149                 uint32_t unambiglen = 0;
150                 uint32_t maxlen = 0;
151                 // For each unambiguous stretch...
152                 for(uint32_t i = 0; i < sz; i++) {
153                         recs_.push_back(RefRecord(f3, swap));
154                 }
155                 for(uint32_t i = 0; i < sz; i++) {
156                         if(recs_[i].first) {
157                                 if(nrefs_ > 0) {
158                                         refLens_.push_back(cumlen);
159                                 }
160                                 // Stupid hack to get around the fact that, for
161                                 // colorspace Ebwts, not only do we omit reference
162                                 // sequences that are *all* gaps from
163                                 // nPat/plen/refnames, but we also omit reference
164                                 // sequences that never have a stretch of more than 1
165                                 // unambiguous character.
166                                 if(unambiglen > 0 && (!color || maxlen > 1)) {
167                                         refApproxLens_.push_back(cumlen);
168                                 }
169                                 // More hackery to detect references that won't be
170                                 // in the Ebwt even though they have non-zero length
171                                 bool willBeInEbwt = true;
172                                 if(recs_[i].len > 0 && color) {
173                                         // Omit until we prove it should be kept
174                                         willBeInEbwt = false;
175                                         for(uint32_t j = i; j < sz; j++) {
176                                                 if(j > i && recs_[j].first) break;
177                                                 if(recs_[j].len >= 2) {
178                                                         willBeInEbwt = true;
179                                                         break;
180                                                 }
181                                         }
182                                 }
183                                 if(recs_[i].len > 0 && willBeInEbwt) {
184                                         // Remember that this is the first record for this
185                                         // reference sequence (and the last record for the one
186                                         // before)
187                                         refRecOffs_.push_back(i);
188                                         refOffs_.push_back(cumsz);
189                                         expandIdx_.push_back(nrefs_);
190                                         shrinkIdx_.push_back(nNoGapRefs_);
191                                         nNoGapRefs_++;
192                                         isGaps_.push_back(false);
193                                 } else {
194                                         shrinkIdx_.push_back(nNoGapRefs_);
195                                         isGaps_.push_back(true);
196                                 }
197                                 cumlen = 0;
198                                 unambiglen = 0;
199                                 maxlen = 0;
200                                 nrefs_++;
201                                 assert_eq(nNoGapRefs_, expandIdx_.size());
202                                 assert_eq(nrefs_, shrinkIdx_.size());
203                         } else if(i == 0) {
204                                 //cerr << "First record in reference index file was not marked as 'first'" << endl;
205                                 //throw 1;
206                         }
207                         cumsz += recs_[i].len;
208 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
209                         cumlen += recs_[i].off;
210                         cumlen += recs_[i].len;
211 #else
212                         if(recs_[i].len > 0) {
213                                 cumlen += recs_[i].off;
214                                 cumlen += recs_[i].len;
215                         }
216 #endif
217                         unambiglen += recs_[i].len;
218                         if(recs_[i].len > maxlen) maxlen = recs_[i].len;
219                 }
220                 if(verbose_ || startVerbose) {
221                         cerr << "Read " << nrefs_ << " reference strings (" << nNoGapRefs_ << " non-empty) from " << sz << " records: ";
222                         logTime(cerr);
223                 }
224                 // Store a cap entry for the end of the last reference seq
225                 refRecOffs_.push_back(recs_.size());
226                 refOffs_.push_back(cumsz);
227                 if(unambiglen > 0 && (!color || maxlen > 1)) {
228                         refApproxLens_.push_back(cumlen);
229                 }
230                 refLens_.push_back(cumlen);
231                 bufSz_ = cumsz;
232                 assert_eq(nNoGapRefs_, refApproxLens_.size());
233                 assert_eq(sz, recs_.size());
234                 MM_FILE_CLOSE(f3); // done with .3.ebwt file
235                 // Round cumsz up to nearest byte boundary
236                 if((cumsz & 3) != 0) {
237                         cumsz += (4 - (cumsz & 3));
238                 }
239                 bufAllocSz_ = cumsz >> 2;
240                 assert_eq(0, cumsz & 3); // should be rounded up to nearest 4
241                 if(!loadSequence) return;
242                 if(useMm_) {
243 #ifdef BOWTIE_MM
244                         buf_ = (uint8_t*)mmFile;
245                         if(sanity_) {
246                                 FILE *ftmp = fopen(s4.c_str(), "rb");
247                                 sanityBuf_ = new uint8_t[cumsz >> 2];
248                                 size_t ret = fread(sanityBuf_, 1, cumsz >> 2, ftmp);
249                                 if(ret != (cumsz >> 2)) {
250                                         cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
251                                         throw 1;
252                                 }
253                                 fclose(ftmp);
254                                 for(size_t i = 0; i < (cumsz >> 2); i++) {
255                                         assert_eq(sanityBuf_[i], buf_[i]);
256                                 }
257                         }
258 #else
259                         cerr << "Shouldn't be at " << __FILE__ << ":" << __LINE__ << " without BOWTIE_MM defined" << endl;
260                         throw 1;
261 #endif
262                 } else {
263                         bool shmemLeader = true;
264                         if(!useShmem_) {
265                                 // Allocate a buffer to hold the reference string
266                                 try {
267                                         buf_ = new uint8_t[cumsz >> 2];
268                                         if(buf_ == NULL) throw std::bad_alloc();
269                                 } catch(std::bad_alloc& e) {
270                                         cerr << "Error: Ran out of memory allocating space for the bitpacked reference.  Please" << endl
271                                                  << "re-run on a computer with more memory." << endl;
272                                         throw 1;
273                                 }
274                         } else {
275                                 shmemLeader = ALLOC_SHARED_U8(
276                                         (s4 + "[ref]"), (cumsz >> 2), &buf_,
277                                         "ref", (verbose_ || startVerbose));
278                         }
279                         if(shmemLeader) {
280                                 // Open the bitpair-encoded reference file
281                                 FILE *f4 = fopen(s4.c_str(), "rb");
282                                 if(f4 == NULL) {
283                                         cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
284                                         cerr << "This is most likely because your index was built with an older version" << endl
285                                                  << "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
286                                                  << "index (or download one from the Bowtie website) and try again." << endl;
287                                         loaded_ = false;
288                                         return;
289                                 }
290                                 // Read the whole thing in
291                                 size_t ret = fread(buf_, 1, cumsz >> 2, f4);
292                                 // Didn't read all of it?
293                                 if(ret != (cumsz >> 2)) {
294                                         cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
295                                         throw 1;
296                                 }
297                                 // Make sure there's no more
298                                 char c;
299                                 ret = fread(&c, 1, 1, f4);
300                                 assert_eq(0, ret); // should have failed
301                                 fclose(f4);
302                                 if(useShmem_) NOTIFY_SHARED(buf_, (cumsz >> 2));
303                         } else {
304                                 if(useShmem_) WAIT_SHARED(buf_, (cumsz >> 2));
305                         }
306                 }
307
308                 // Populate byteToU32_
309                 bool big = currentlyBigEndian();
310                 for(int i = 0; i < 256; i++) {
311                         uint32_t word = 0;
312                         if(big) {
313                                 word |= ((i >> 0) & 3) << 24;
314                                 word |= ((i >> 2) & 3) << 16;
315                                 word |= ((i >> 4) & 3) << 8;
316                                 word |= ((i >> 6) & 3) << 0;
317                         } else {
318                                 word |= ((i >> 0) & 3) << 0;
319                                 word |= ((i >> 2) & 3) << 8;
320                                 word |= ((i >> 4) & 3) << 16;
321                                 word |= ((i >> 6) & 3) << 24;
322                         }
323                         byteToU32_[i] = word;
324                 }
325
326 #ifndef NDEBUG
327                 if(sanity_) {
328                         // Compare the sequence we just read from the compact index
329                         // file to the true reference sequence.
330                         std::vector<seqan::String<seqan::Dna5> > *os; // for holding references
331                         std::vector<seqan::String<seqan::Dna5> > osv; // for holding references
332                         if(infiles != NULL) {
333                                 if(infilesSeq) {
334                                         for(size_t i = 0; i < infiles->size(); i++) {
335                                                 // Remove initial backslash; that's almost
336                                                 // certainly being used to protect the first
337                                                 // character of the sequence from getopts (e.g.,
338                                                 // when the first char is -)
339                                                 if((*infiles)[i].at(0) == '\\') {
340                                                         (*infiles)[i].erase(0, 1);
341                                                 }
342                                                 osv.push_back(String<Dna5>((*infiles)[i]));
343                                         }
344                                 } else {
345                                         readSequenceFiles<seqan::String<seqan::Dna5>, seqan::Fasta>(*infiles, osv);
346                                 }
347                                 os = &osv;
348                         } else {
349                                 assert(origs != NULL);
350                                 os = origs;
351                         }
352
353                         // Never mind; reference is always letters, even if index
354                         // and alignment run are colorspace
355
356                         // If we're building a colorspace index, we need to convert
357                         // osv to colors first
358 //                      if(color) {
359 //                              for(size_t i = 0; i < os->size(); i++) {
360 //                                      size_t olen = seqan::length((*os)[i]);
361 //                                      for(size_t j = 0; j < olen-1; j++) {
362 //                                              int b1 = (int)(*os)[i][j];
363 //                                              assert_geq(b1, 0); assert_leq(b1, 4);
364 //                                              int b2 = (int)(*os)[i][j+1];
365 //                                              assert_geq(b2, 0); assert_leq(b2, 4);
366 //                                              (*os)[i][j] = (Dna5)dinuc2color[b1][b2];
367 //                                              assert((b1 != 4 && b2 != 4) || (int)(*os)[i][j] == 4);
368 //                                      }
369 //                                      seqan::resize((*os)[i], olen-1);
370 //                              }
371 //                      }
372                         // Go through the loaded reference files base-by-base and
373                         // sanity check against what we get by calling getBase and
374                         // getStretch
375                         size_t refi = 0;
376                         int longestStretch = 0;
377                         int curStretch = 0;
378                         for(size_t i = 0; i < os->size(); i++) {
379                                 size_t olen = seqan::length((*os)[i]);
380                                 for(size_t j = 0; j < olen; j++) {
381                                         if((int)(*os)[i][j] < 4) {
382                                                 curStretch++;
383                                                 if(curStretch > longestStretch) longestStretch = curStretch;
384                                         } else {
385                                                 curStretch = 0;
386                                         }
387                                 }
388                                 if(longestStretch == 0 || (color && longestStretch == 1)) {
389                                         continue;
390                                 }
391                                 longestStretch = 0;
392                                 size_t olenU32 = (olen + 12) / 4;
393                                 uint32_t *buf = new uint32_t[olenU32];
394                                 uint8_t *bufadj = (uint8_t*)buf;
395                                 bufadj += getStretch(buf, refi, 0, olen);
396                                 for(size_t j = 0; j < olen; j++) {
397                                         assert_eq((int)(*os)[i][j], (int)bufadj[j]);
398                                         assert_eq((int)(*os)[i][j], (int)getBase(refi, j));
399                                 }
400                                 refi++;
401                                 delete[] buf;
402                         }
403                 }
404 #endif
405         }
406
407         ~BitPairReference() {
408                 if(buf_ != NULL && !useMm_ && !useShmem_) delete[] buf_;
409                 if(sanityBuf_ != NULL) delete[] sanityBuf_;
410         }
411
412         /**
413          * Return a single base of the reference.  Calling this repeatedly
414          * is not an efficient way to retrieve bases from the reference;
415          * use loadStretch() instead.
416          *
417          * This implementation scans linearly through the records for the
418          * unambiguous stretches of the target reference sequence.  When
419          * there are many records, binary search would be more appropriate.
420          */
421         int getBase(uint32_t tidx, uint32_t toff) const {
422                 uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
423                 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
424                 assert_gt(recf, reci);
425                 uint32_t bufOff = refOffs_[tidx];
426                 uint32_t off = 0;
427                 // For all records pertaining to the target reference sequence...
428                 for(uint32_t i = reci; i < recf; i++) {
429                         assert_geq(toff, off);
430                         off += recs_[i].off;
431                         if(toff < off) {
432                                 return 4;
433                         }
434                         assert_geq(toff, off);
435                         uint32_t recOff = off + recs_[i].len;
436                         if(toff < recOff) {
437                                 toff -= off;
438                                 bufOff += toff;
439                                 assert_lt(bufOff, bufSz_);
440                                 const uint32_t bufElt = (bufOff) >> 2;
441                                 const uint32_t shift = (bufOff & 3) << 1;
442                                 return ((buf_[bufElt] >> shift) & 3);
443                         }
444                         bufOff += recs_[i].len;
445                         off = recOff;
446                         assert_geq(toff, off);
447                 } // end for loop over records
448                 return 4;
449         }
450
451         /**
452          * Load a stretch of the reference string into memory at 'dest'.
453          *
454          * This implementation scans linearly through the records for the
455          * unambiguous stretches of the target reference sequence.  When
456          * there are many records, binary search would be more appropriate.
457          */
458         int getStretchNaive(uint32_t *destU32,
459                             uint32_t tidx,
460                             uint32_t toff,
461                             uint32_t count) const
462         {
463                 uint8_t *dest = (uint8_t*)destU32;
464                 uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
465                 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
466                 assert_gt(recf, reci);
467                 uint32_t cur = 0;
468                 uint32_t bufOff = refOffs_[tidx];
469                 uint32_t off = 0;
470                 // For all records pertaining to the target reference sequence...
471                 for(uint32_t i = reci; i < recf; i++) {
472                         assert_geq(toff, off);
473                         off += recs_[i].off;
474                         for(; toff < off && count > 0; toff++) {
475                                 dest[cur++] = 4;
476                                 count--;
477                         }
478                         if(count == 0) break;
479                         assert_geq(toff, off);
480                         if(toff < off + recs_[i].len) {
481                                 bufOff += (toff - off); // move bufOff pointer forward
482                         } else {
483                                 bufOff += recs_[i].len;
484                         }
485                         off += recs_[i].len;
486                         for(; toff < off && count > 0; toff++) {
487                                 assert_lt(bufOff, bufSz_);
488                                 const uint32_t bufElt = (bufOff) >> 2;
489                                 const uint32_t shift = (bufOff & 3) << 1;
490                                 dest[cur++] = (buf_[bufElt] >> shift) & 3;
491                                 bufOff++;
492                                 count--;
493                         }
494                         if(count == 0) break;
495                         assert_geq(toff, off);
496                 } // end for loop over records
497                 // In any chars are left after scanning all the records,
498                 // they must be ambiguous
499                 while(count > 0) {
500                         count--;
501                         dest[cur++] = 4;
502                 }
503                 assert_eq(0, count);
504                 return 0;
505         }
506
507         /**
508          * Load a stretch of the reference string into memory at 'dest'.
509          *
510          * This implementation scans linearly through the records for the
511          * unambiguous stretches of the target reference sequence.  When
512          * there are many records, binary search would be more appropriate.
513          */
514         int getStretch(uint32_t *destU32,
515                        uint32_t tidx,
516                        uint32_t toff,
517                        uint32_t count) const
518         {
519                 ASSERT_ONLY(uint32_t origCount = count);
520                 ASSERT_ONLY(uint32_t origToff = toff);
521                 if(count == 0) return 0;
522                 uint8_t *dest = (uint8_t*)destU32;
523 #ifndef NDEBUG
524                 uint32_t *destU32_2 = new uint32_t[(origCount >> 2) + 2];
525                 int off2 = getStretchNaive(destU32_2, tidx, origToff, origCount);
526                 uint8_t *dest_2 = ((uint8_t*)destU32_2) + off2;
527 #endif
528                 destU32[0] = 0x04040404; // Add Ns, which we might end up using later
529                 uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
530                 uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
531                 assert_gt(recf, reci);
532                 uint32_t cur = 4; // keep a cushion of 4 bases at the beginning
533                 uint32_t bufOff = refOffs_[tidx];
534                 uint32_t off = 0;
535                 int offset = 4;
536                 bool firstStretch = true;
537                 // For all records pertaining to the target reference sequence...
538                 for(uint32_t i = reci; i < recf; i++) {
539                         ASSERT_ONLY(uint32_t origBufOff = bufOff);
540                         assert_geq(toff, off);
541                         off += recs_[i].off;
542                         assert_gt(count, 0);
543                         if(toff < off) {
544                                 uint32_t cpycnt = min(off - toff, count);
545                                 memset(&dest[cur], 4, cpycnt);
546                                 count -= cpycnt;
547                                 toff += cpycnt;
548                                 cur += cpycnt;
549                                 if(count == 0) break;
550                         }
551                         assert_geq(toff, off);
552                         if(toff < off + recs_[i].len) {
553                                 bufOff += (toff - off); // move bufOff pointer forward
554                         } else {
555                                 bufOff += recs_[i].len;
556                         }
557                         off += recs_[i].len;
558                         if(toff < off) {
559                                 if(firstStretch) {
560                                         if(toff + 8 < off && count > 8) {
561                                                 // We already added some Ns, so we have to do
562                                                 // a fixup at the beginning of the buffer so
563                                                 // that we can start clobbering at cur >> 2
564                                                 if(cur & 3) {
565                                                         offset -= (cur & 3);
566                                                 }
567                                                 uint32_t curU32 = cur >> 2;
568                                                 // Do the initial few bases
569                                                 if(bufOff & 3) {
570                                                         const uint32_t bufElt = (bufOff) >> 2;
571                                                         const int low2 = bufOff & 3;
572                                                         destU32[curU32] = byteToU32_[buf_[bufElt]];
573                                                         for(int j = 0; j < low2; j++) {
574                                                                 ((char *)(&destU32[curU32]))[j] = 4;
575                                                         }
576                                                         curU32++;
577                                                         offset += low2;
578                                                         const int chars = 4 - low2;
579                                                         count -= chars;
580                                                         bufOff += chars;
581                                                         toff += chars;
582                                                 }
583                                                 assert_eq(0, bufOff & 3);
584                                                 uint32_t bufOffU32 = bufOff >> 2;
585                                                 uint32_t countLim = count >> 2;
586                                                 uint32_t offLim = (off - (toff + 4)) >> 2;
587                                                 uint32_t lim = min(countLim, offLim);
588                                                 // Do the fast thing for as far as possible
589                                                 for(uint32_t j = 0; j < lim; j++) {
590                                                         destU32[curU32] = byteToU32_[buf_[bufOffU32++]];
591                                                         assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]);
592                                                         assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]);
593                                                         assert_eq(dest[(curU32 << 2) + 2], dest_2[(curU32 << 2) - offset + 2]);
594                                                         assert_eq(dest[(curU32 << 2) + 3], dest_2[(curU32 << 2) - offset + 3]);
595                                                         curU32++;
596                                                 }
597                                                 toff += (lim << 2);
598                                                 assert_leq(toff, off);
599                                                 assert_leq((lim << 2), count);
600                                                 count -= (lim << 2);
601                                                 bufOff = bufOffU32 << 2;
602                                                 cur = curU32 << 2;
603                                         }
604                                         // Do the slow thing for the rest
605                                         for(; toff < off && count > 0; toff++) {
606                                                 assert_lt(bufOff, bufSz_);
607                                                 const uint32_t bufElt = (bufOff) >> 2;
608                                                 const uint32_t shift = (bufOff & 3) << 1;
609                                                 dest[cur++] = (buf_[bufElt] >> shift) & 3;
610                                                 bufOff++;
611                                                 count--;
612                                         }
613                                         firstStretch = false;
614                                 } else {
615                                         // Do the slow thing
616                                         for(; toff < off && count > 0; toff++) {
617                                                 assert_lt(bufOff, bufSz_);
618                                                 const uint32_t bufElt = (bufOff) >> 2;
619                                                 const uint32_t shift = (bufOff & 3) << 1;
620                                                 dest[cur++] = (buf_[bufElt] >> shift) & 3;
621                                                 bufOff++;
622                                                 count--;
623                                         }
624                                 }
625                         }
626                         if(count == 0) break;
627                         assert_eq(recs_[i].len, bufOff - origBufOff);
628                         assert_geq(toff, off);
629                 } // end for loop over records
630                 // In any chars are left after scanning all the records,
631                 // they must be ambiguous
632                 while(count > 0) {
633                         count--;
634                         dest[cur++] = 4;
635                 }
636                 assert_eq(0, count);
637 #ifndef NDEBUG
638                 delete[] destU32_2;
639 #endif
640                 return offset;
641         }
642
643         /// Return the number of reference sequences.
644         uint32_t numRefs() const {
645                 return nrefs_;
646         }
647
648         /// Return the number of reference sequences that don't consist
649         /// entirely of stretches of ambiguous characters.
650         uint32_t numNonGapRefs() const {
651                 return nNoGapRefs_;
652         }
653
654         /**
655          *
656          */
657         uint32_t shrinkIdx(uint32_t idx) const {
658                 assert_lt(idx, shrinkIdx_.size());
659                 return shrinkIdx_[idx];
660         }
661
662         /**
663          *
664          */
665         uint32_t expandIdx(uint32_t idx) const {
666                 assert_lt(idx, expandIdx_.size());
667                 return expandIdx_[idx];
668         }
669
670         /// Return the lengths of reference sequences.
671         uint32_t approxLen(uint32_t elt) const {
672                 assert_lt(elt, nrefs_);
673                 return refApproxLens_[elt];
674         }
675
676         /// Return the lengths of reference sequences.
677         uint32_t len(uint32_t elt) const {
678                 assert_lt(elt, nrefs_);
679                 return refLens_[elt];
680         }
681
682         /// Return true iff ref 'elt' is all gaps
683         bool isAllGaps(uint32_t elt) const {
684                 assert_lt(elt, nrefs_);
685                 assert_eq(isGaps_.size(), nrefs_);
686                 return isGaps_[elt];
687         }
688
689         /// Return true iff buf_ and all the vectors are populated.
690         bool loaded() const {
691                 return loaded_;
692         }
693
694         /**
695          * Return constant reference to the RefRecord list.
696          */
697         const std::vector<RefRecord>& refRecords() const { return recs_; }
698
699 protected:
700
701         uint32_t byteToU32_[256];
702
703         std::vector<RefRecord> recs_;       /// records describing unambiguous stretches
704         std::vector<uint32_t>  refApproxLens_; /// approx lens of ref seqs (excludes trailing ambig chars)
705         std::vector<uint32_t>  refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
706         std::vector<uint32_t>  refOffs_;    /// buf_ begin offsets per ref seq
707         std::vector<uint32_t>  refRecOffs_; /// record begin/end offsets per ref seq
708         std::vector<uint32_t>  expandIdx_; /// map from small idxs (e.g. w/r/t plen) to large ones (w/r/t refnames)
709         std::vector<uint32_t>  shrinkIdx_; /// map from large idxs to small
710         std::vector<bool>      isGaps_;    /// ref i is all gaps?
711         uint8_t *buf_;      /// the whole reference as a big bitpacked byte array
712         uint8_t *sanityBuf_;/// for sanity-checking buf_
713         uint32_t bufSz_;    /// size of buf_
714         uint32_t bufAllocSz_;
715         uint32_t nrefs_;      /// the number of reference sequences
716         uint32_t nNoGapRefs_; /// the number of reference sequences that aren't totally ambiguous
717         bool     loaded_;   /// whether it's loaded
718         bool     sanity_;   /// do sanity checking
719         bool     useMm_;    /// load the reference as a memory-mapped file
720         bool     useShmem_; /// load the reference into shared memory
721         bool     verbose_;
722 };
723
724 #endif