hit.h

   1 #ifndef HIT_H_
   2 #define HIT_H_
   3
   4 #include <vector>
   5 #include <stdint.h>
   6 #include <iostream>
   7 #include <sstream>
   8 #include <fstream>
   9 #include <string>
  10 #include <stdexcept>
  11 #include <seqan/sequence.h>
  12 #include "alphabet.h"
  13 #include "assert_helpers.h"
  14 #include "spinlock.h"
  15 #include "threading.h"
  16 #include "bitset.h"
  17 #include "tokenize.h"
  18 #include "pat.h"
  19 #include "formats.h"
  20 #include "filebuf.h"
  21 #include "edit.h"
  22 #include "refmap.h"
  23 #include "annot.h"
  24
  25 /**
  26  * Classes for dealing with reporting alignments.
  27  */
  28
  29 using namespace std;
  30 using namespace seqan;
  31
  32 /// Constants for the various output modes
  33 enum output_types {
  34         OUTPUT_FULL = 1,
  35         OUTPUT_CONCISE,
  36         OUTPUT_BINARY,
  37         OUTPUT_CHAIN,
  38         OUTPUT_SAM,
  39         OUTPUT_NONE
  40 };
  41
  42 /// Names of the various output modes
  43 static const std::string output_type_names[] = {
  44         "Invalid!",
  45         "Full",
  46         "Concise",
  47         "Binary",
  48         "None"
  49 };
  50
  51 typedef pair<uint32_t,uint32_t> U32Pair;
  52
  53 /**
  54  * Encapsulates a hit, including a text-id/text-offset pair, a pattern
  55  * id, and a boolean indicating whether it matched as its forward or
  56  * reverse-complement version.
  57  */
  58 class Hit {
  59 public:
  60         Hit() : stratum(-1) { }
  61
  62         /**
  63          * Initialize this Ent to represent the given hit.
  64          */
  65         void toHitSetEnt(HitSetEnt& hse, AnnotationMap *amap) const {
  66                 hse.h = h;
  67                 hse.fw = fw ? 1 : 0;
  68                 hse.oms = oms;
  69                 hse.stratum = stratum;
  70                 hse.cost = cost;
  71                 if(mms.count() == 0) return;
  72                 for(int i = 0; i < (int)length(); i++) {
  73                         if(mms.test(i)) {
  74                                 hse.expand();
  75                                 hse.back().type = EDIT_TYPE_MM;
  76                                 hse.back().pos = i;
  77                                 hse.back().chr = refcs[i];
  78                         }
  79                 }
  80         }
  81
  82         /**
  83          * Convert a list of Hit objects to a single HitSet object suitable
  84          * for chaining.
  85          */
  86         static void toHitSet(const std::vector<Hit>& hits, HitSet& hs,
  87                              AnnotationMap *amap)
  88         {
  89                 if(hits.empty()) return;
  90                 // Initialize HitSet
  91                 hs.name = hits.front().patName;
  92                 hs.seq  = hits.front().patSeq;
  93                 hs.qual = hits.front().quals;
  94                 hs.color = hits.front().color;
  95                 if(!hits.front().fw) {
  96                         // Re-reverse
  97                         reverseComplementInPlace(hs.seq, hs.color);
  98                         reverseInPlace(hs.qual);
  99                 }
 100                 // Convert hits to entries
 101                 hs.ents.resize(hits.size());
 102                 for(size_t i = 0; i < hs.ents.size(); i++) {
 103                         hits[i].toHitSetEnt(hs.ents[i], amap);
 104                 }
 105         }
 106
 107         /**
 108          * Populate a vector of Hits with hits from the given HitSet.
 109          */
 110         static void fromHitSet(std::vector<Hit>& hits, const HitSet& hs) {
 111                 assert(hs.sorted());
 112                 hits.resize(hs.size());
 113                 for(size_t i = 0; i < hs.size(); i++) {
 114                         hits[i].h = hs[i].h;
 115                         hits[i].oms = hs[i].oms;
 116                         hits[i].fw = hs[i].fw;
 117                         hits[i].stratum = hs[i].stratum;
 118                         hits[i].cost = hs[i].cost;
 119                         hits[i].mate = 0; //hs[i].mate;
 120                         hits[i].patName = hs.name;
 121                         hits[i].patSeq = hs.seq;
 122                         hits[i].quals = hs.qual;
 123                         hits[i].color = hs.color;
 124                         size_t len = seqan::length(hs.seq);
 125                         if(!hs[i].fw) {
 126                                 reverseComplementInPlace(hits[i].patSeq, hs.color);
 127                                 reverseInPlace(hits[i].quals);
 128                         }
 129                         hits[i].refcs.resize(len);
 130                         for(size_t j = 0; j < hs[i].size(); j++) {
 131                                 if(hs[i][j].type != EDIT_TYPE_MM) continue;
 132                                 hits[i].mms.set(hs[i][j].pos);
 133                                 hits[i].refcs[hs[i][j].pos] = hs[i][j].chr;
 134                         }
 135                 }
 136         }
 137
 138         U32Pair             h;       /// reference index & offset
 139         U32Pair             mh;      /// reference index & offset for mate
 140         uint32_t            patId;   /// read index
 141         String<char>        patName; /// read name
 142         String<Dna5>        patSeq;  /// read sequence
 143         String<Dna5>        colSeq;  /// original color sequence, not decoded
 144         String<char>        quals;   /// read qualities
 145         String<char>        colQuals;/// original color qualities, not decoded
 146         FixedBitset<1024>   mms;     /// nucleotide mismatch mask
 147         FixedBitset<1024>   cmms;    /// color mismatch mask (if relevant)
 148         vector<char>        refcs;   /// reference characters for mms
 149         vector<char>        crefcs;  /// reference characters for cmms
 150         uint32_t            oms;     /// # of other possible mappings; 0 -> this is unique
 151         bool                fw;      /// orientation of read in alignment
 152         bool                mfw;     /// orientation of mate in alignment
 153         uint16_t            mlen;    /// length of mate
 154         int8_t              stratum; /// stratum of hit (= mismatches in seed)
 155         uint32_t            cost;    /// total cost, factoring in stratum and quality penalty
 156         uint8_t             mate;    /// matedness; 0 = not a mate
 157                                      ///            1 = upstream mate
 158                                      ///            2 = downstream mate
 159         bool                color;   /// read is in colorspace?
 160         uint32_t            seed;    /// pseudo-random seed for aligned read
 161
 162         /**
 163          * Return true if this Hit is internally consistent.  Otherwise,
 164          * throw an assertion.
 165          */
 166         bool repOk() const {
 167                 assert_geq(cost, (uint32_t)(stratum << 14));
 168                 return true;
 169         }
 170
 171         size_t length() const { return seqan::length(patSeq); }
 172
 173         Hit& operator = (const Hit &other) {
 174                 this->h       = other.h;
 175                 this->mh      = other.mh;
 176                 this->patId   = other.patId;
 177                 this->patName = other.patName;
 178                 this->patSeq  = other.patSeq;
 179                 this->colSeq  = other.colSeq;
 180                 this->quals   = other.quals;
 181                 this->colQuals= other.colQuals;
 182                 this->mms     = other.mms;
 183                 this->cmms    = other.cmms;
 184                 this->refcs   = other.refcs;
 185                 this->crefcs  = other.crefcs;
 186                 this->oms     = other.oms;
 187                 this->fw      = other.fw;
 188                 this->mfw     = other.mfw;
 189                 this->mlen    = other.mlen;
 190                 this->stratum = other.stratum;
 191                 this->cost    = other.cost;
 192                 this->mate    = other.mate;
 193                 this->color   = other.color;
 194                 this->cmms    = other.cmms;
 195                 this->seed    = other.seed;
 196                 return *this;
 197         }
 198 };
 199
 200 /**
 201  * Compare hits a and b; a < b if its cost is less than B's.  If
 202  * there's a tie, break on position and orientation.
 203  */
 204 class HitCostCompare {
 205 public:
 206         bool operator() (const Hit& a, const Hit& b) {
 207                 if(a.cost < b.cost) return true;
 208                 if(a.cost > b.cost) return false;
 209                 if(a.h < b.h) return true;
 210                 if(a.h > b.h) return false;
 211                 if(a.fw < b.fw) return true;
 212                 if(a.fw > b.fw) return false;
 213                 return false;
 214         }
 215 };
 216
 217 /// Sort by text-id then by text-offset
 218 bool operator< (const Hit& a, const Hit& b);
 219
 220 /**
 221  * Table for holding recalibration counts, along the lines of the table
 222  * presented in the SOAPsnp paper in Genome Res.  Each element maps a
 223  * read allele (o), quality score (q), cycle (c), and reference allele
 224  * (H), to a count of how many times that combination is observed in a
 225  * reported alignment.
 226  *
 227  * RecalTable is not synchronized, so it's assumed that threads are
 228  * incrementing the counters from within critical sections.
 229  */
 230 class RecalTable {
 231 public:
 232         RecalTable(int maxCycle,
 233                    int maxQual,
 234                    int qualShift) : maxCycle_(maxCycle),
 235                                     maxQual_(maxQual),
 236                                     qualShift_(qualShift),
 237                                     shift1_(6 - qualShift_),
 238                                     shift2_(shift1_ + 2),
 239                                     shift3_(shift2_ + 2),
 240                                     ents_(NULL), len_(0)
 241         {
 242                 if(maxCycle == 0) {
 243                         cerr << "Warning: maximum cycle for recalibration table is 0" << endl;
 244                 } else if(maxQual >> qualShift == 0) {
 245                         cerr << "Warning: maximum quality value " << maxQual << ", when shifted, is 0" << endl;
 246                 } else if(qualShift > 5) {
 247                         cerr << "Warning: quality shift value " << qualShift << " exceeds ceiling of 5" << endl;
 248                 } else {
 249                         try {
 250                                 len_ = maxCycle_ * 4 /* subj alleles*/ * 4 /* ref alleles */ * 64 /* quals */;
 251                                 ents_ = new uint32_t[len_];
 252                                 if(ents_ == NULL) {
 253                                         throw std::bad_alloc();
 254                                 }
 255                                 memset(ents_, 0, len_ << 2);
 256                         } catch(std::bad_alloc& e) {
 257                                 cerr << "Error allocating recalibration table with " << len_ << " entries" << endl;
 258                                 throw 1;
 259                         }
 260                 }
 261         }
 262
 263         ~RecalTable() {
 264                 if(ents_ != NULL) delete[] ents_;
 265         }
 266
 267         /**
 268          * Factor a new alignment into the recalibration table.
 269          */
 270         void commitHit(const Hit& h) {
 271                 // Iterate through the pattern from 5' to 3', calculate the
 272                 // shifted quality value, obtain the reference character, and
 273                 // increment the appropriate counter
 274                 assert(h.repOk());
 275                 for(int i = 0; i < (int)h.length(); i++) {
 276                         int ii = i;
 277                         if(!h.fw) {
 278                                 ii = h.length() - ii - 1;
 279                         }
 280                         int qc = (int)h.patSeq[ii];
 281                         int rc = qc;
 282                         if(h.mms.test(i)) {
 283                                 rc = charToDna5[(int)h.refcs[i]];
 284                                 assert_neq(rc, qc);
 285                         }
 286                         int q = (int)h.quals[ii]-33;
 287                         assert_lt(q, 64);
 288                         q >>= qualShift_;
 289                         ents_[calcIdx(i, qc, rc, q)]++;
 290                 }
 291         }
 292
 293         /**
 294          * Print the contents of the recalibration table.
 295          */
 296         void print (std::ostream& out) const {
 297                 if(ents_ == NULL) return;
 298                 const int lim = maxCycle_;
 299                 for(int i = 0; i < lim; i++) {
 300                         out << "t" << i << "\t";
 301                         // Iterate over subject alleles
 302                         for(int j = 0; j < 4; j++) {
 303                                 // Iterate over reference alleles
 304                                 for(int k = 0; k < 4; k++) {
 305                                         // Iterate over qualities
 306                                         int lim2 = maxQual_ >> qualShift_;
 307                                         for(int l = 0; l < lim2; l++) {
 308                                                 out << ents_[calcIdx(i, j, k, l)] << '\t';
 309                                         }
 310                                 }
 311                         }
 312                         out << endl;
 313                 }
 314         }
 315
 316 protected:
 317
 318         /**
 319          * Calculate index into the ents_ array given cycle, subject
 320          * allele, reference allele, and (shifted) quality.
 321          */
 322         int calcIdx(int cyc, int sa, int ra, int q) const {
 323                 int ret = q | (ra << shift1_) | (sa << shift2_) | (cyc << shift3_);
 324                 assert_lt(ret, len_);
 325                 return ret;
 326         }
 327
 328         const int maxCycle_;
 329         const int maxQual_;
 330         const int qualShift_;
 331         const int shift1_;
 332         const int shift2_;
 333         const int shift3_;
 334         uint32_t *ents_;
 335         int len_;
 336 };
 337
 338 #define DECL_HIT_DUMPS \
 339         const std::string& dumpAl, \
 340         const std::string& dumpUnal, \
 341         const std::string& dumpMax
 342
 343 #define INIT_HIT_DUMPS \
 344         dumpAlBase_(dumpAl), \
 345         dumpUnalBase_(dumpUnal), \
 346         dumpMaxBase_(dumpMax)
 347
 348 #define DECL_HIT_DUMPS2 \
 349         DECL_HIT_DUMPS, \
 350         bool onePairFile, \
 351         bool sampleMax, \
 352         RecalTable *recalTable, \
 353         std::vector<std::string>* refnames
 354
 355 #define PASS_HIT_DUMPS \
 356         dumpAl, \
 357         dumpUnal, \
 358         dumpMax
 359
 360 #define PASS_HIT_DUMPS2 \
 361         PASS_HIT_DUMPS, \
 362         onePairFile, \
 363         sampleMax, \
 364         recalTable, \
 365         refnames
 366
 367 /**
 368  * Encapsulates an object that accepts hits, optionally retains them in
 369  * a vector, and does something else with them according to
 370  * descendent's implementation of pure virtual member reportHitImpl().
 371  */
 372 class HitSink {
 373 public:
 374         explicit HitSink(OutFileBuf* out,
 375                         DECL_HIT_DUMPS,
 376                         bool onePairFile,
 377                         bool sampleMax,
 378                         RecalTable *table,
 379                         vector<string>* refnames = NULL) :
 380                 _outs(),
 381                 _deleteOuts(false),
 382                 recalTable_(table),
 383                 _refnames(refnames),
 384                 _numWrappers(0),
 385                 _locks(),
 386                 INIT_HIT_DUMPS,
 387                 onePairFile_(onePairFile),
 388                 sampleMax_(sampleMax),
 389                 first_(true),
 390                 numAligned_(0llu),
 391                 numUnaligned_(0llu),
 392                 numMaxed_(0llu),
 393                 numReported_(0llu),
 394                 numReportedPaired_(0llu),
 395                 quiet_(false),
 396                 ssmode_(ios_base::out)
 397         {
 398                 _outs.push_back(out);
 399                 _locks.resize(1);
 400                 MUTEX_INIT(_locks[0]);
 401                 MUTEX_INIT(_mainlock);
 402                 initDumps();
 403         }
 404
 405         /**
 406          * Open a number of output streams; usually one per reference
 407          * sequence.  For now, we give then names refXXXXX.map where XXXXX
 408          * is the 0-padded reference index.  Someday we may want to include
 409          * the name of the reference sequence in the filename somehow.
 410          */
 411         explicit HitSink(size_t numOuts,
 412                         DECL_HIT_DUMPS,
 413                         bool onePairFile,
 414                         bool sampleMax,
 415                         RecalTable *table,
 416                         vector<string>* refnames = NULL) :
 417                 _outs(),
 418                 _deleteOuts(true),
 419                 recalTable_(table),
 420                 _refnames(refnames),
 421                 _locks(),
 422                 INIT_HIT_DUMPS,
 423                 onePairFile_(onePairFile),
 424                 sampleMax_(sampleMax),
 425                 quiet_(false),
 426                 ssmode_(ios_base::out)
 427         {
 428                 // Open all files for writing and initialize all locks
 429                 for(size_t i = 0; i < numOuts; i++) {
 430                         _outs.push_back(NULL); // we open output streams lazily
 431                         _locks.resize(i+1);
 432                         MUTEX_INIT(_locks[i]);
 433                 }
 434                 MUTEX_INIT(_mainlock);
 435                 initDumps();
 436         }
 437
 438         /**
 439          * Destroy HitSinkobject;
 440          */
 441         virtual ~HitSink() {
 442                 closeOuts();
 443                 if(_deleteOuts) {
 444                         // Delete all non-NULL output streams
 445                         for(size_t i = 0; i < _outs.size(); i++) {
 446                                 if(_outs[i] != NULL) {
 447                                         delete _outs[i];
 448                                         _outs[i] = NULL;
 449                                 }
 450                         }
 451                 }
 452                 destroyDumps();
 453         }
 454
 455         /**
 456          * Call this whenever this HitSink is wrapped by a new
 457          * HitSinkPerThread.  This helps us keep track of whether the main
 458          * lock or any of the per-stream locks will be contended.
 459          */
 460         void addWrapper() {
 461                 _numWrappers++;
 462         }
 463
 464         /**
 465          * Called by concrete subclasses to figure out which elements of
 466          * the _outs/_locks array to use when outputting the alignment.
 467          */
 468         size_t refIdxToStreamIdx(size_t refIdx) {
 469                 if(refIdx >= _outs.size()) return 0;
 470                 return refIdx;
 471         }
 472
 473         /**
 474          * Append a single hit to the given output stream.
 475          */
 476         virtual void append(ostream& o, const Hit& h) = 0;
 477
 478         /**
 479          * Report a batch of hits; all in the given vector.
 480          */
 481         virtual void reportHits(vector<Hit>& hs) {
 482                 reportHits(hs, 0, hs.size());
 483         }
 484
 485         /**
 486          * Report a batch of hits from a vector, perhaps subsetting it.
 487          */
 488         virtual void reportHits(vector<Hit>& hs, size_t start, size_t end) {
 489                 assert_geq(end, start);
 490                 if(end-start == 0) return;
 491                 bool paired = hs[start].mate > 0;
 492                 // Sort reads so that those against the same reference sequence
 493                 // are consecutive.
 494                 if(_outs.size() > 1 && end-start > 2) {
 495                         sort(hs.begin() + start, hs.begin() + end);
 496                 }
 497                 char buf[4096];
 498                 for(size_t i = start; i < end; i++) {
 499                         const Hit& h = hs[i];
 500                         assert(h.repOk());
 501                         bool diff = false;
 502                         if(i > start) {
 503                                 diff = (refIdxToStreamIdx(h.h.first) != refIdxToStreamIdx(hs[i-1].h.first));
 504                                 if(diff) unlock(hs[i-1].h.first);
 505                         }
 506                         ostringstream ss(ssmode_);
 507                         ss.rdbuf()->pubsetbuf(buf, 4096);
 508                         append(ss, h);
 509                         if(i == start || diff) {
 510                                 lock(h.h.first);
 511                         }
 512                         out(h.h.first).writeChars(buf, ss.tellp());
 513                 }
 514                 unlock(hs[end-1].h.first);
 515                 mainlock();
 516                 commitHits(hs);
 517                 first_ = false;
 518                 numAligned_++;
 519                 if(paired) numReportedPaired_ += (end-start);
 520                 else       numReported_ += (end-start);
 521                 mainunlock();
 522         }
 523
 524         void commitHit(const Hit& hit) {
 525                 if(recalTable_ != NULL) {
 526                         recalTable_->commitHit(hit);
 527                 }
 528         }
 529
 530         void commitHits(const std::vector<Hit>& hits) {
 531                 if(recalTable_ != NULL) {
 532                         const size_t sz = hits.size();
 533                         for(size_t i = 0; i < sz; i++) {
 534                                 commitHit(hits[i]);
 535                         }
 536                 }
 537         }
 538
 539         /**
 540          * Called when all alignments are complete.  It is assumed that no
 541          * synchronization is necessary.
 542          */
 543         void finish(bool hadoopOut) {
 544                 // Close output streams
 545                 closeOuts();
 546                 if(!quiet_) {
 547                         // Print information about how many unpaired and/or paired
 548                         // reads were aligned.
 549                         uint64_t tot = numAligned_ + numUnaligned_ + numMaxed_;
 550                         double alPct = 0.0, unalPct = 0.0, maxPct = 0.0;
 551                         if(tot > 0) {
 552                                 alPct   = 100.0 * (double)numAligned_ / (double)tot;
 553                                 unalPct = 100.0 * (double)numUnaligned_ / (double)tot;
 554                                 maxPct  = 100.0 * (double)numMaxed_ / (double)tot;
 555                         }
 556                         cerr << "# reads processed: " << tot << endl;
 557                         cerr << "# reads with at least one reported alignment: "
 558                              << numAligned_ << " (" << fixed << setprecision(2)
 559                              << alPct << "%)" << endl;
 560                         cerr << "# reads that failed to align: "
 561                              << numUnaligned_ << " (" << fixed << setprecision(2)
 562                              << unalPct << "%)" << endl;
 563                         if(numMaxed_ > 0) {
 564                                 if(sampleMax_) {
 565                                         cerr << "# reads with alignments sampled due to -M: "
 566                                                  << numMaxed_ << " (" << fixed << setprecision(2)
 567                                                  << maxPct << "%)" << endl;
 568                                 } else {
 569                                         cerr << "# reads with alignments suppressed due to -m: "
 570                                                  << numMaxed_ << " (" << fixed << setprecision(2)
 571                                                  << maxPct << "%)" << endl;
 572                                 }
 573                         }
 574                         if(first_) {
 575                                 assert_eq(0llu, numReported_);
 576                                 cerr << "No alignments" << endl;
 577                         }
 578                         else if(numReportedPaired_ > 0 && numReported_ == 0) {
 579                                 cerr << "Reported " << (numReportedPaired_ >> 1)
 580                                          << " paired-end alignments to " << _outs.size()
 581                                          << " output stream(s)" << endl;
 582                         }
 583                         else if(numReported_ > 0 && numReportedPaired_ == 0) {
 584                                 cerr << "Reported " << numReported_
 585                                          << " alignments to " << _outs.size()
 586                                          << " output stream(s)" << endl;
 587                         }
 588                         else {
 589                                 assert_gt(numReported_, 0);
 590                                 assert_gt(numReportedPaired_, 0);
 591                                 cerr << "Reported " << (numReportedPaired_ >> 1)
 592                                          << " paired-end alignments and " << numReported_
 593                                          << " singleton alignments to " << _outs.size()
 594                                          << " output stream(s)" << endl;
 595                         }
 596                         if(hadoopOut) {
 597                                 cerr << "reporter:counter:Bowtie,Reads with reported alignments," << numAligned_ << endl;
 598                                 cerr << "reporter:counter:Bowtie,Reads with no alignments," << numUnaligned_ << endl;
 599                                 cerr << "reporter:counter:Bowtie,Reads exceeding -m limit," << numMaxed_ << endl;
 600                                 cerr << "reporter:counter:Bowtie,Unpaired alignments reported," << numReported_ << endl;
 601                                 cerr << "reporter:counter:Bowtie,Paired alignments reported," << numReportedPaired_ << endl;
 602                         }
 603                 }
 604                 // Print the recalibration table.
 605                 if(recalTable_ != NULL) {
 606                         recalTable_->print(cout);
 607                 }
 608         }
 609
 610         /// Returns the alignment output stream; if the stream needs to be
 611         /// created, create it
 612         OutFileBuf& out(size_t refIdx) {
 613                 size_t strIdx = refIdxToStreamIdx(refIdx);
 614                 if(_outs[strIdx] == NULL) {
 615                         assert(_deleteOuts);
 616                         ostringstream oss;
 617                         oss << "ref";
 618                         if     (strIdx < 10)    oss << "0000";
 619                         else if(strIdx < 100)   oss << "000";
 620                         else if(strIdx < 1000)  oss << "00";
 621                         else if(strIdx < 10000) oss << "0";
 622                         oss << strIdx << ".map";
 623                         _outs[strIdx] = new OutFileBuf(oss.str().c_str(), ssmode_ == ios_base::binary);
 624                 }
 625                 assert(_outs[strIdx] != NULL);
 626                 return *(_outs[strIdx]);
 627         }
 628
 629         /**
 630          * Lock the monolithic lock for this HitSink.  This is useful when,
 631          * for example, outputting a read to an unaligned-read file.
 632          */
 633         void mainlock() {
 634                 MUTEX_LOCK(_mainlock);
 635         }
 636
 637         /**
 638          * Unlock the monolithic lock for this HitSink.  This is useful
 639          * when, for example, outputting a read to an unaligned-read file.
 640          */
 641         void mainunlock() {
 642                 MUTEX_UNLOCK(_mainlock);
 643         }
 644
 645         /**
 646          * Return true iff this HitSink dumps aligned reads to an output
 647          * stream (i.e., iff --alfa or --alfq are specified).
 648          */
 649         bool dumpsAlignedReads() {
 650                 return dumpAlignFlag_;
 651         }
 652
 653         /**
 654          * Return true iff this HitSink dumps unaligned reads to an output
 655          * stream (i.e., iff --unfa or --unfq are specified).
 656          */
 657         bool dumpsUnalignedReads() {
 658                 return dumpUnalignFlag_;
 659         }
 660
 661         /**
 662          * Return true iff this HitSink dumps maxed-out reads to an output
 663          * stream (i.e., iff --maxfa or --maxfq are specified).
 664          */
 665         bool dumpsMaxedReads() {
 666                 return dumpMaxedFlag_ || dumpUnalignFlag_;
 667         }
 668
 669         /**
 670          * Return true iff this HitSink dumps either unaligned or maxed-
 671          * out reads to an output stream (i.e., iff --unfa, --maxfa,
 672          * --unfq, or --maxfq are specified).
 673          */
 674         bool dumpsReads() {
 675                 return dumpAlignFlag_ || dumpUnalignFlag_ || dumpMaxedFlag_;
 676         }
 677
 678         /**
 679          * Dump an aligned read to all of the appropriate output streams.
 680          * Be careful to synchronize correctly - there may be multiple
 681          * simultaneous writers.
 682          */
 683         void dumpAlign(PatternSourcePerThread& p) {
 684                 if(!dumpAlignFlag_) return;
 685                 if(!p.paired() || onePairFile_) {
 686                         // Dump unpaired read to an aligned-read file of the same format
 687                         if(!dumpAlBase_.empty()) {
 688                                 MUTEX_LOCK(dumpAlignLock_);
 689                                 if(dumpAl_ == NULL) {
 690                                         assert(dumpAlQv_ == NULL);
 691                                         dumpAl_ = openOf(dumpAlBase_, 0, "");
 692                                         assert(dumpAl_ != NULL);
 693                                         if(p.bufa().qualOrigBufLen > 0) {
 694                                                 dumpAlQv_ = openOf(dumpAlBase_ + ".qual", 0, "");
 695                                                 assert(dumpAlQv_ != NULL);
 696                                         }
 697                                 }
 698                                 dumpAl_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 699                                 if(dumpAlQv_ != NULL) {
 700                                         dumpAlQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 701                                 }
 702                                 MUTEX_UNLOCK(dumpAlignLock_);
 703                         }
 704                 } else {
 705                         // Dump paired-end read to an aligned-read file (or pair of
 706                         // files) of the same format
 707                         if(!dumpAlBase_.empty()) {
 708                                 MUTEX_LOCK(dumpAlignLockPE_);
 709                                 if(dumpAl_1_ == NULL) {
 710                                         assert(dumpAlQv_1_ == NULL);
 711                                         assert(dumpAlQv_2_ == NULL);
 712                                         dumpAl_1_ = openOf(dumpAlBase_, 1, "");
 713                                         dumpAl_2_ = openOf(dumpAlBase_, 2, "");
 714                                         assert(dumpAl_1_ != NULL);
 715                                         assert(dumpAl_2_ != NULL);
 716                                         if(p.bufa().qualOrigBufLen > 0) {
 717                                                 dumpAlQv_1_ = openOf(dumpAlBase_ + ".qual", 1, "");
 718                                                 dumpAlQv_2_ = openOf(dumpAlBase_ + ".qual", 2, "");
 719                                                 assert(dumpAlQv_1_ != NULL);
 720                                                 assert(dumpAlQv_2_ != NULL);
 721                                         }
 722                                 }
 723                                 dumpAl_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 724                                 dumpAl_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
 725                                 if(dumpAlQv_1_ != NULL) {
 726                                         dumpAlQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 727                                         dumpAlQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
 728                                 }
 729                                 MUTEX_UNLOCK(dumpAlignLockPE_);
 730                         }
 731                 }
 732         }
 733
 734         /**
 735          * Dump an unaligned read to all of the appropriate output streams.
 736          * Be careful to synchronize correctly - there may be multiple
 737          * simultaneous writers.
 738          */
 739         void dumpUnal(PatternSourcePerThread& p) {
 740                 if(!dumpUnalignFlag_) return;
 741                 if(!p.paired() || onePairFile_) {
 742                         // Dump unpaired read to an unaligned-read file of the same format
 743                         if(!dumpUnalBase_.empty()) {
 744                                 MUTEX_LOCK(dumpUnalLock_);
 745                                 if(dumpUnal_ == NULL) {
 746                                         assert(dumpUnalQv_ == NULL);
 747                                         dumpUnal_ = openOf(dumpUnalBase_, 0, "");
 748                                         assert(dumpUnal_ != NULL);
 749                                         if(p.bufa().qualOrigBufLen > 0) {
 750                                                 dumpUnalQv_ = openOf(dumpUnalBase_ + ".qual", 0, "");
 751                                                 assert(dumpUnalQv_ != NULL);
 752                                         }
 753                                 }
 754                                 dumpUnal_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 755                                 if(dumpUnalQv_ != NULL) {
 756                                         dumpUnalQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 757                                 }
 758                                 MUTEX_UNLOCK(dumpUnalLock_);
 759                         }
 760                 } else {
 761                         // Dump paired-end read to an unaligned-read file (or pair
 762                         // of files) of the same format
 763                         if(!dumpUnalBase_.empty()) {
 764                                 MUTEX_LOCK(dumpUnalLockPE_);
 765                                 if(dumpUnal_1_ == NULL) {
 766                                         assert(dumpUnal_1_ == NULL);
 767                                         assert(dumpUnal_2_ == NULL);
 768                                         dumpUnal_1_ = openOf(dumpUnalBase_, 1, "");
 769                                         dumpUnal_2_ = openOf(dumpUnalBase_, 2, "");
 770                                         assert(dumpUnal_1_ != NULL);
 771                                         assert(dumpUnal_2_ != NULL);
 772                                         if(p.bufa().qualOrigBufLen > 0) {
 773                                                 dumpUnalQv_1_ = openOf(dumpUnalBase_ + ".qual", 1, "");
 774                                                 dumpUnalQv_2_ = openOf(dumpUnalBase_ + ".qual", 2, "");
 775                                         }
 776                                 }
 777                                 dumpUnal_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 778                                 dumpUnal_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
 779                                 if(dumpUnalQv_1_ != NULL) {
 780                                         dumpUnalQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 781                                         dumpUnalQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
 782                                 }
 783                                 MUTEX_UNLOCK(dumpUnalLockPE_);
 784                         }
 785                 }
 786         }
 787
 788         /**
 789          * Dump a maxed-out read to all of the appropriate output streams.
 790          * Be careful to synchronize correctly - there may be multiple
 791          * simultaneous writers.
 792          */
 793         void dumpMaxed(PatternSourcePerThread& p) {
 794                 if(!dumpMaxedFlag_) {
 795                         if(dumpUnalignFlag_) dumpUnal(p);
 796                         return;
 797                 }
 798                 if(!p.paired() || onePairFile_) {
 799                         // Dump unpaired read to an maxed-out-read file of the same format
 800                         if(!dumpMaxBase_.empty()) {
 801                                 MUTEX_LOCK(dumpMaxLock_);
 802                                 if(dumpMax_ == NULL) {
 803                                         dumpMax_ = openOf(dumpMaxBase_, 0, "");
 804                                         assert(dumpMax_ != NULL);
 805                                         if(p.bufa().qualOrigBufLen > 0) {
 806                                                 dumpMaxQv_ = openOf(dumpMaxBase_ + ".qual", 0, "");
 807                                         }
 808                                 }
 809                                 dumpMax_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 810                                 if(dumpMaxQv_ != NULL) {
 811                                         dumpMaxQv_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 812                                 }
 813                                 MUTEX_UNLOCK(dumpMaxLock_);
 814                         }
 815                 } else {
 816                         // Dump paired-end read to a maxed-out-read file (or pair
 817                         // of files) of the same format
 818                         if(!dumpMaxBase_.empty()) {
 819                                 MUTEX_LOCK(dumpMaxLockPE_);
 820                                 if(dumpMax_1_ == NULL) {
 821                                         assert(dumpMaxQv_1_ == NULL);
 822                                         assert(dumpMaxQv_2_ == NULL);
 823                                         dumpMax_1_ = openOf(dumpMaxBase_, 1, "");
 824                                         dumpMax_2_ = openOf(dumpMaxBase_, 2, "");
 825                                         assert(dumpMax_1_ != NULL);
 826                                         assert(dumpMax_2_ != NULL);
 827                                         if(p.bufa().qualOrigBufLen > 0) {
 828                                                 dumpMaxQv_1_ = openOf(dumpMaxBase_ + ".qual", 1, "");
 829                                                 dumpMaxQv_2_ = openOf(dumpMaxBase_ + ".qual", 2, "");
 830                                         }
 831                                 }
 832                                 dumpMax_1_->write(p.bufa().readOrigBuf, p.bufa().readOrigBufLen);
 833                                 dumpMax_2_->write(p.bufb().readOrigBuf, p.bufb().readOrigBufLen);
 834                                 if(dumpMaxQv_1_ != NULL) {
 835                                         dumpMaxQv_1_->write(p.bufa().qualOrigBuf, p.bufa().qualOrigBufLen);
 836                                         dumpMaxQv_2_->write(p.bufb().qualOrigBuf, p.bufb().qualOrigBufLen);
 837                                 }
 838                                 MUTEX_UNLOCK(dumpMaxLockPE_);
 839                         }
 840                 }
 841         }
 842
 843         /**
 844          * Report a maxed-out read.  Typically we do nothing, but we might
 845          * want to print a placeholder when output is chained.
 846          */
 847         virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p) {
 848                 mainlock();
 849                 numMaxed_++;
 850                 mainunlock();
 851         }
 852
 853         /**
 854          * Report an unaligned read.  Typically we do nothing, but we might
 855          * want to print a placeholder when output is chained.
 856          */
 857         virtual void reportUnaligned(PatternSourcePerThread& p) {
 858                 mainlock();
 859                 numUnaligned_++;
 860                 mainunlock();
 861         }
 862
 863 protected:
 864
 865         /// Implementation of hit-report
 866         virtual void reportHit(const Hit& h) {
 867                 assert(h.repOk());
 868                 mainlock();
 869                 commitHit(h);
 870                 first_ = false;
 871                 if(h.mate > 0) numReportedPaired_++;
 872                 else           numReported_++;
 873                 numAligned_++;
 874                 mainunlock();
 875         }
 876
 877         /**
 878          * Close (and flush) all OutFileBufs.
 879          */
 880         void closeOuts() {
 881                 // Flush and close all non-NULL output streams
 882                 for(size_t i = 0; i < _outs.size(); i++) {
 883                         if(_outs[i] != NULL && !_outs[i]->closed()) {
 884                                 _outs[i]->close();
 885                         }
 886                 }
 887         }
 888
 889         /**
 890          * Lock the output buffer for the output stream for reference with
 891          * index 'refIdx'.  By default, hits for all references are
 892          * directed to the same output stream, but if --refout is
 893          * specified, each reference has its own reference stream.
 894          */
 895         void lock(size_t refIdx) {
 896                 size_t strIdx = refIdxToStreamIdx(refIdx);
 897                 MUTEX_LOCK(_locks[strIdx]);
 898         }
 899
 900         /**
 901          * Lock the output buffer for the output stream for reference with
 902          * index 'refIdx'.  By default, hits for all references are
 903          * directed to the same output stream, but if --refout is
 904          * specified, each reference has its own reference stream.
 905          */
 906         void unlock(size_t refIdx) {
 907                 size_t strIdx = refIdxToStreamIdx(refIdx);
 908                 MUTEX_UNLOCK(_locks[strIdx]);
 909         }
 910
 911         vector<OutFileBuf*> _outs;        /// the alignment output stream(s)
 912         bool                _deleteOuts;  /// Whether to delete elements of _outs upon exit
 913         RecalTable         *recalTable_;  /// recalibration table
 914         vector<string>*     _refnames;    /// map from reference indexes to names
 915         int                 _numWrappers; /// # threads owning a wrapper for this HitSink
 916         vector<MUTEX_T>     _locks;       /// pthreads mutexes for per-file critical sections
 917         MUTEX_T             _mainlock;    /// pthreads mutexes for fields of this object
 918
 919         // Output filenames for dumping
 920         std::string dumpAlBase_;
 921         std::string dumpUnalBase_;
 922         std::string dumpMaxBase_;
 923
 924         bool onePairFile_;
 925         bool sampleMax_;
 926
 927         // Output streams for dumping sequences
 928         std::ofstream *dumpAl_;       // for single-ended reads
 929         std::ofstream *dumpAl_1_;     // for first mates
 930         std::ofstream *dumpAl_2_;     // for second mates
 931         std::ofstream *dumpUnal_;     // for single-ended reads
 932         std::ofstream *dumpUnal_1_;   // for first mates
 933         std::ofstream *dumpUnal_2_;   // for second mates
 934         std::ofstream *dumpMax_;      // for single-ended reads
 935         std::ofstream *dumpMax_1_;    // for first mates
 936         std::ofstream *dumpMax_2_;    // for second mates
 937
 938         // Output streams for dumping qualities
 939         std::ofstream *dumpAlQv_;     // for single-ended reads
 940         std::ofstream *dumpAlQv_1_;   // for first mates
 941         std::ofstream *dumpAlQv_2_;   // for second mates
 942         std::ofstream *dumpUnalQv_;   // for single-ended reads
 943         std::ofstream *dumpUnalQv_1_; // for first mates
 944         std::ofstream *dumpUnalQv_2_; // for second mates
 945         std::ofstream *dumpMaxQv_;    // for single-ended reads
 946         std::ofstream *dumpMaxQv_1_;  // for first mates
 947         std::ofstream *dumpMaxQv_2_;  // for second mates
 948
 949         /**
 950          * Open an ofstream with given name; output error message and quit
 951          * if it fails.
 952          */
 953         std::ofstream* openOf(const std::string& name,
 954                               int mateType,
 955                               const std::string& suffix)
 956         {
 957                 std::string s = name;
 958                 size_t dotoff = name.find_last_of(".");
 959                 if(mateType == 1) {
 960                         if(dotoff == string::npos) {
 961                                 s += "_1"; s += suffix;
 962                         } else {
 963                                 s = name.substr(0, dotoff) + "_1" + s.substr(dotoff);
 964                         }
 965                 } else if(mateType == 2) {
 966                         if(dotoff == string::npos) {
 967                                 s += "_2"; s += suffix;
 968                         } else {
 969                                 s = name.substr(0, dotoff) + "_2" + s.substr(dotoff);
 970                         }
 971                 } else if(mateType != 0) {
 972                         cerr << "Bad mate type " << mateType << endl; throw 1;
 973                 }
 974                 std::ofstream* tmp = new ofstream(s.c_str(), ios::out);
 975                 if(tmp->fail()) {
 976                         if(mateType == 0) {
 977                                 cerr << "Could not open single-ended aligned/unaligned-read file for writing: " << name << endl;
 978                         } else {
 979                                 cerr << "Could not open paired-end aligned/unaligned-read file for writing: " << name << endl;
 980                         }
 981                         throw 1;
 982                 }
 983                 return tmp;
 984         }
 985
 986         /**
 987          * Initialize all the locks for dumping.
 988          */
 989         void initDumps() {
 990                 dumpAl_       = dumpAl_1_     = dumpAl_2_     = NULL;
 991                 dumpUnal_     = dumpUnal_1_   = dumpUnal_2_   = NULL;
 992                 dumpMax_      = dumpMax_1_    = dumpMax_2_    = NULL;
 993                 dumpAlQv_     = dumpAlQv_1_   = dumpAlQv_2_   = NULL;
 994                 dumpUnalQv_   = dumpUnalQv_1_ = dumpUnalQv_2_ = NULL;
 995                 dumpMaxQv_    = dumpMaxQv_1_  = dumpMaxQv_2_  = NULL;
 996                 dumpAlignFlag_   = !dumpAlBase_.empty();
 997                 dumpUnalignFlag_ = !dumpUnalBase_.empty();
 998                 dumpMaxedFlag_   = !dumpMaxBase_.empty();
 999                 MUTEX_INIT(dumpAlignLock_);
1000                 MUTEX_INIT(dumpAlignLockPE_);
1001                 MUTEX_INIT(dumpUnalLock_);
1002                 MUTEX_INIT(dumpUnalLockPE_);
1003                 MUTEX_INIT(dumpMaxLock_);
1004                 MUTEX_INIT(dumpMaxLockPE_);
1005         }
1006
1007         void destroyDumps() {
1008                 if(dumpAl_       != NULL) { dumpAl_->close();       delete dumpAl_; }
1009                 if(dumpAl_1_     != NULL) { dumpAl_1_->close();     delete dumpAl_1_; }
1010                 if(dumpAl_2_     != NULL) { dumpAl_2_->close();     delete dumpAl_2_; }
1011                 if(dumpUnal_     != NULL) { dumpUnal_->close();     delete dumpUnal_; }
1012                 if(dumpUnal_1_   != NULL) { dumpUnal_1_->close();   delete dumpUnal_1_; }
1013                 if(dumpUnal_2_   != NULL) { dumpUnal_2_->close();   delete dumpUnal_2_; }
1014                 if(dumpMax_      != NULL) { dumpMax_->close();      delete dumpMax_; }
1015                 if(dumpMax_1_    != NULL) { dumpMax_1_->close();    delete dumpMax_1_; }
1016                 if(dumpMax_2_    != NULL) { dumpMax_2_->close();    delete dumpMax_2_; }
1017                 if(dumpAlQv_     != NULL) { dumpAlQv_->close();     delete dumpAlQv_; }
1018                 if(dumpAlQv_1_   != NULL) { dumpAlQv_1_->close();   delete dumpAlQv_1_; }
1019                 if(dumpAlQv_2_   != NULL) { dumpAlQv_2_->close();   delete dumpAlQv_2_; }
1020                 if(dumpUnalQv_   != NULL) { dumpUnalQv_->close();   delete dumpUnalQv_; }
1021                 if(dumpUnalQv_1_ != NULL) { dumpUnalQv_1_->close(); delete dumpUnalQv_1_; }
1022                 if(dumpUnalQv_2_ != NULL) { dumpUnalQv_2_->close(); delete dumpUnalQv_2_; }
1023                 if(dumpMaxQv_    != NULL) { dumpMaxQv_->close();    delete dumpMaxQv_; }
1024                 if(dumpMaxQv_1_  != NULL) { dumpMaxQv_1_->close();  delete dumpMaxQv_1_; }
1025                 if(dumpMaxQv_2_  != NULL) { dumpMaxQv_2_->close();  delete dumpMaxQv_2_; }
1026         }
1027
1028         // Locks for dumping
1029         MUTEX_T dumpAlignLock_;
1030         MUTEX_T dumpAlignLockPE_; // _1 and _2
1031         MUTEX_T dumpUnalLock_;
1032         MUTEX_T dumpUnalLockPE_; // _1 and _2
1033         MUTEX_T dumpMaxLock_;
1034         MUTEX_T dumpMaxLockPE_;   // _1 and _2
1035
1036         // false -> no dumping
1037         bool dumpAlignFlag_;
1038         bool dumpUnalignFlag_;
1039         bool dumpMaxedFlag_;
1040
1041         volatile bool     first_;       /// true -> first hit hasn't yet been reported
1042         volatile uint64_t numAligned_;  /// # reads with >= 1 alignment
1043         volatile uint64_t numUnaligned_;/// # reads with no alignments
1044         volatile uint64_t numMaxed_;    /// # reads with # alignments exceeding -m ceiling
1045         volatile uint64_t numReported_; /// # single-ended alignments reported
1046         volatile uint64_t numReportedPaired_; /// # paired-end alignments reported
1047         bool quiet_;  /// true -> don't print alignment stats at the end
1048         ios_base::openmode ssmode_;     /// output mode for stringstreams
1049 };
1050
1051 /**
1052  * A per-thread wrapper for a HitSink.  Incorporates state that a
1053  * single search thread cares about.
1054  */
1055 class HitSinkPerThread {
1056 public:
1057         HitSinkPerThread(HitSink& sink, uint32_t max, uint32_t n) :
1058                 _sink(sink),
1059                 _bestRemainingStratum(0),
1060                 _numValidHits(0llu),
1061                 _hits(),
1062                 _bufferedHits(),
1063                 hitsForThisRead_(),
1064                 _max(max),
1065                 _n(n)
1066         {
1067                 _sink.addWrapper();
1068                 assert_gt(_n, 0);
1069         }
1070
1071         virtual ~HitSinkPerThread() { }
1072
1073         /// Return the vector of retained hits
1074         vector<Hit>& retainedHits()   { return _hits; }
1075
1076         /// Finalize current read
1077         virtual uint32_t finishRead(PatternSourcePerThread& p, bool report, bool dump) {
1078                 uint32_t ret = finishReadImpl();
1079                 _bestRemainingStratum = 0;
1080                 if(!report) {
1081                         _bufferedHits.clear();
1082                         return 0;
1083                 }
1084                 bool maxed = (ret > _max);
1085                 bool unal = (ret == 0);
1086                 if(dump && (unal || maxed)) {
1087                         // Either no reportable hits were found or the number of
1088                         // reportable hits exceeded the -m limit specified by the
1089                         // user
1090                         assert(ret == 0 || ret > _max);
1091                         if(maxed) _sink.dumpMaxed(p);
1092                         else      _sink.dumpUnal(p);
1093                 }
1094                 ret = 0;
1095                 if(maxed) {
1096                         // Report that the read maxed-out; useful for chaining output
1097                         if(dump) _sink.reportMaxed(_bufferedHits, p);
1098                         _bufferedHits.clear();
1099                 } else if(unal) {
1100                         // Report that the read failed to align; useful for chaining output
1101                         if(dump) _sink.reportUnaligned(p);
1102                 } else {
1103                         // Flush buffered hits
1104                         assert_gt(_bufferedHits.size(), 0);
1105                         if(_bufferedHits.size() > _n) {
1106                                 _bufferedHits.resize(_n);
1107                         }
1108                         _sink.reportHits(_bufferedHits);
1109                         _sink.dumpAlign(p);
1110                         ret = _bufferedHits.size();
1111                         _bufferedHits.clear();
1112                 }
1113                 assert_eq(0, _bufferedHits.size());
1114                 return ret;
1115         }
1116
1117         virtual uint32_t finishReadImpl() = 0;
1118
1119         /**
1120          * Implementation for hit reporting; update per-thread _hits and
1121          * _numReportableHits variables and call the master HitSink to do the actual
1122          * reporting
1123          */
1124         virtual void bufferHit(const Hit& h, int stratum) {
1125 #ifndef NDEBUG
1126                 // Ensure all buffered hits have the same patid
1127                 for(size_t i = 1; i < _bufferedHits.size(); i++) {
1128                         assert_eq(_bufferedHits[0].patId, _bufferedHits[i].patId);
1129                 }
1130 #endif
1131                 _bufferedHits.push_back(h);
1132         }
1133
1134         /**
1135          * Concrete subclasses override this to (possibly) report a hit and
1136          * return true iff the caller should continue to report more hits.
1137          */
1138         virtual bool reportHit(const Hit& h, int stratum) {
1139                 assert(h.repOk());
1140                 _numValidHits++;
1141                 return true;
1142         }
1143
1144         /// Return the number of valid hits so far (not necessarily
1145         /// reported).  It's up to the concrete subclasses
1146         uint64_t numValidHits()    { return _numValidHits; }
1147
1148         /**
1149          * Return true if there are no more reportable hits.
1150          */
1151         bool finishedWithStratum(int stratum) {
1152                 bool ret = finishedWithStratumImpl(stratum);
1153                 _bestRemainingStratum = stratum+1;
1154                 return ret;
1155         }
1156
1157         /**
1158          * Use the given set of hits as a starting point.  By default, we don't
1159          */
1160         virtual bool setHits(HitSet& hs) {
1161                 if(!hs.empty()) {
1162                         cerr << "Error: default setHits() called with non-empty HitSet" << endl;
1163                         throw 1;
1164                 }
1165                 return false;
1166         }
1167
1168         /**
1169          * Return true if there are no reportable hits with the given cost
1170          * (or worse).
1171          */
1172         virtual bool irrelevantCost(uint16_t cost) {
1173                 return false;
1174         }
1175
1176         /**
1177          * Concrete subclasses override this to determine whether the
1178          * search routine should keep searching after having finished
1179          * reporting all alignments at the given stratum.
1180          */
1181         virtual bool finishedWithStratumImpl(int stratum) = 0;
1182
1183         /// The mhits maximum
1184         uint32_t overThresh() { return _max; }
1185
1186         /// Whether this thread, for this read, knows that we have already
1187         /// exceeded the mhits maximum
1188         bool exceededOverThresh() { return hitsForThisRead_ > _max; }
1189
1190         /// Return whether we span strata
1191         virtual bool spanStrata() = 0;
1192
1193         /// Return whether we report only the best possible hits
1194         virtual bool best() = 0;
1195
1196         /**
1197          * Return true iff the underlying HitSink dumps unaligned or
1198          * maxed-out reads.
1199          */
1200         bool dumpsReads() {
1201                 return _sink.dumpsReads();
1202         }
1203
1204         /**
1205          * Return true iff there are currently no buffered hits.
1206          */
1207         bool empty() const {
1208                 return _bufferedHits.empty();
1209         }
1210
1211         /**
1212          * Return the number of currently buffered hits.
1213          */
1214         bool size() const {
1215                 return _bufferedHits.size();
1216         }
1217
1218         /**
1219          * Return max # hits to report (*2 in paired-end mode because mates
1220          * count separately)
1221          */
1222         virtual uint32_t maxHits() {
1223                 return _n;
1224         }
1225
1226 protected:
1227         HitSink&    _sink; /// Ultimate destination of reported hits
1228         /// Least # mismatches in alignments that will be reported in the
1229         /// future.  Updated by the search routine.
1230         int         _bestRemainingStratum;
1231         /// # hits reported to this HitSink so far (not all of which were
1232         /// necesssary reported to _sink)
1233         uint64_t    _numValidHits;
1234         vector<Hit> _hits; /// Repository for retained hits
1235         /// Buffered hits, to be reported and flushed at end of read-phase
1236         vector<Hit> _bufferedHits;
1237
1238         // Following variables are declared in the parent but maintained in
1239         // the concrete subcalsses
1240         uint32_t hitsForThisRead_; /// # hits for this read so far
1241         uint32_t _max; /// don't report any hits if there were > _max
1242         uint32_t _n;   /// report at most _n hits
1243 };
1244
1245 /**
1246  * Abstract parent factory for HitSinkPerThreads.
1247  */
1248 class HitSinkPerThreadFactory {
1249 public:
1250         virtual ~HitSinkPerThreadFactory() { }
1251         virtual HitSinkPerThread* create() const = 0;
1252         virtual HitSinkPerThread* createMult(uint32_t m) const = 0;
1253
1254         /// Free memory associated with a per-thread hit sink
1255         virtual void destroy(HitSinkPerThread* sink) const {
1256                 assert(sink != NULL);
1257                 // Free the HitSinkPerThread
1258                 delete sink;
1259         }
1260 };
1261
1262 /**
1263  * Report first N good alignments encountered; trust search routine
1264  * to try alignments in something approximating a best-first order.
1265  * Best used in combination with a stringent alignment policy.
1266  */
1267 class NGoodHitSinkPerThread : public HitSinkPerThread {
1268
1269 public:
1270         NGoodHitSinkPerThread(
1271                         HitSink& sink,
1272                         uint32_t n,
1273                         uint32_t max) :
1274                                 HitSinkPerThread(sink, max, n)
1275         { }
1276
1277         virtual bool spanStrata() {
1278                 return true; // we span strata
1279         }
1280
1281         virtual bool best() {
1282                 return false; // we settle for "good" hits
1283         }
1284
1285         /// Finalize current read
1286         virtual uint32_t finishReadImpl() {
1287                 uint32_t ret = hitsForThisRead_;
1288                 hitsForThisRead_ = 0;
1289                 return ret;
1290         }
1291
1292         /**
1293          * Report and then return true if we've already reported N good
1294          * hits.  Ignore the stratum - it's not relevant for finding "good"
1295          * hits.
1296          */
1297         virtual bool reportHit(const Hit& h, int stratum) {
1298                 HitSinkPerThread::reportHit(h, stratum);
1299                 hitsForThisRead_++;
1300                 if(hitsForThisRead_ > _max) {
1301                         return true; // done - report nothing
1302                 }
1303                 //if(hitsForThisRead_ <= _n) {
1304                         // Only report hit if we haven't
1305                         bufferHit(h, stratum);
1306                 //}
1307                 if(hitsForThisRead_ == _n &&
1308                    (_max == 0xffffffff || _max < _n))
1309                 {
1310                         return true; // already reported N good hits and max isn't set; stop!
1311                 }
1312                 return false; // not at N or max yet; keep going
1313         }
1314
1315         /**
1316          * Always return true; search routine should only stop if it's
1317          * already reported N hits.
1318          */
1319         virtual bool finishedWithStratumImpl(int stratum) { return false; }
1320 };
1321
1322 /**
1323  * Concrete factory for FirstNGoodHitSinkPerThreads.
1324  */
1325 class NGoodHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1326 public:
1327         NGoodHitSinkPerThreadFactory(
1328                         HitSink& sink,
1329                         uint32_t n,
1330                         uint32_t max) :
1331                         sink_(sink),
1332                         n_(n),
1333                         max_(max)
1334         { }
1335
1336         /**
1337          * Allocate a new NGoodHitSinkPerThread object on the heap,
1338          * using the parameters given in the constructor.
1339          */
1340         virtual HitSinkPerThread* create() const {
1341                 return new NGoodHitSinkPerThread(sink_, n_, max_);
1342         }
1343         virtual HitSinkPerThread* createMult(uint32_t m) const {
1344                 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1345                 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1346                 return new NGoodHitSinkPerThread(sink_, n, max);
1347         }
1348
1349 private:
1350         HitSink& sink_;
1351         uint32_t n_;
1352         uint32_t max_;
1353 };
1354
1355 /**
1356  * Report the first N best alignments encountered in a single
1357  * alignment stratum assuming that we're receiving the alignments in
1358  * best-first order.
1359  */
1360 class NBestFirstStratHitSinkPerThread : public HitSinkPerThread {
1361
1362 public:
1363         NBestFirstStratHitSinkPerThread(
1364                         HitSink& sink,
1365                         uint32_t n,
1366                         uint32_t max,
1367                         uint32_t mult) :
1368                                 HitSinkPerThread(sink, max, n),
1369                                 bestStratum_(999), mult_(mult)
1370         { }
1371
1372         /**
1373          * false -> we do not allow strata to be spanned
1374          */
1375         virtual bool spanStrata() {
1376                 return false; // we do not span strata
1377         }
1378
1379         /**
1380          * true -> we report best hits
1381          */
1382         virtual bool best() {
1383                 return true;
1384         }
1385
1386         /**
1387          * Report and then return false if we've already reported N.
1388          */
1389         virtual bool reportHit(const Hit& h, int stratum) {
1390                 HitSinkPerThread::reportHit(h, stratum);
1391                 // This hit is within th best possible remaining stratum,
1392                 // so it should definitely count
1393                 hitsForThisRead_++;
1394                 // It doesn't exceed the limit, so buffer it
1395                 if(stratum < bestStratum_) {
1396                         bestStratum_ = stratum;
1397                 }
1398                 if(hitsForThisRead_ > _max) {
1399                         return true; // done - report nothing
1400                 }
1401                 //if(hitsForThisRead_ <= _n) {
1402                         bufferHit(h, stratum);
1403                 //}
1404                 if(hitsForThisRead_ == _n &&
1405                    (_max == 0xffffffff || _max < _n))
1406                 {
1407                         return true; // already reported N good hits; stop!
1408                 }
1409                 return false; // not at N yet; keep going
1410         }
1411
1412         /**
1413          * Finalize current read by reporting any buffered hits from best
1414          * to worst until they're all reported or until we've reported all
1415          * N
1416          */
1417         virtual uint32_t finishReadImpl() {
1418                 uint32_t ret = hitsForThisRead_;
1419                 hitsForThisRead_ = 0;
1420                 bestStratum_ = 999;
1421                 const size_t sz = _bufferedHits.size();
1422                 for(size_t i = 0; i < sz; i++) {
1423                         // Set 'oms' according to the number of other alignments
1424                         // at this stratum
1425                         _bufferedHits[i].oms = (sz / mult_) - 1;
1426                 }
1427                 return ret;
1428         }
1429
1430         /**
1431          * If we had any alignments at all and we're now moving on to a new
1432          * stratum, then we're done.
1433          */
1434         virtual bool finishedWithStratumImpl(int stratum) {
1435                 return hitsForThisRead_ > 0;
1436         }
1437
1438         /**
1439          * If there have been any hits reported so far, classify any
1440          * subsequent alignments with higher strata as irrelevant.
1441          */
1442         virtual bool irrelevantCost(uint16_t cost) {
1443                 if(hitsForThisRead_) {
1444                         // irrelevant iff at worse stratum
1445                         return ((int)cost >> 14) > bestStratum_;
1446                 }
1447                 return false;
1448         }
1449
1450 private:
1451
1452         int bestStratum_; /// best stratum observed so far
1453         uint32_t mult_; /// number of batched-up alignments
1454 };
1455
1456 /**
1457  * Concrete factory for NBestStratHitSinkPerThread.
1458  */
1459 class NBestFirstStratHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1460 public:
1461         NBestFirstStratHitSinkPerThreadFactory(
1462                         HitSink& sink,
1463                         uint32_t n,
1464                         uint32_t max) :
1465                         sink_(sink),
1466                         n_(n),
1467                         max_(max)
1468         { }
1469
1470         /**
1471          * Allocate a new NGoodHitSinkPerThread object on the heap,
1472          * using the parameters given in the constructor.
1473          */
1474         virtual HitSinkPerThread* create() const {
1475                 return new NBestFirstStratHitSinkPerThread(sink_, n_, max_, 1);
1476         }
1477         virtual HitSinkPerThread* createMult(uint32_t m) const {
1478                 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1479                 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1480                 return new NBestFirstStratHitSinkPerThread(sink_, n, max, m);
1481         }
1482
1483 private:
1484         HitSink& sink_;
1485         uint32_t n_;
1486         uint32_t max_;
1487 };
1488
1489 /**
1490  *
1491  */
1492 class ChainingHitSinkPerThread : public HitSinkPerThread {
1493 public:
1494
1495         ChainingHitSinkPerThread(HitSink& sink,
1496                                  uint32_t n,
1497                                  uint32_t max,
1498                                  bool strata,
1499                                  uint32_t mult) :
1500         HitSinkPerThread(sink, max, n),
1501         mult_(mult), strata_(strata), cutoff_(0xffff)
1502         {
1503                 hs_ = NULL;
1504                 hsISz_ = 0;
1505         }
1506
1507         /**
1508          * Return true iff we're allowed to span strata.
1509          */
1510         virtual bool spanStrata() { return !strata_; }
1511
1512         /**
1513          * true -> we report best hits
1514          */
1515         virtual bool best() { return true; }
1516
1517         /**
1518          * Report and then return false if we've already reported N.
1519          */
1520         virtual bool reportHit(const Hit& h, int stratum) {
1521                 HitSinkPerThread::reportHit(h, stratum);
1522                 assert(hs_->sorted());
1523                 assert(hs_ != NULL);
1524                 assert_eq(h.stratum, stratum);
1525                 assert_eq(1, mult_);
1526                 assert(consistentStrata());
1527                 assert(!irrelevantCost(h.cost));
1528
1529                 if(!hs_->empty() && strata_ && stratum < hs_->front().stratum) {
1530                         hs_->clear();
1531                         _bufferedHits.clear();
1532                         hitsForThisRead_ = 0;
1533                 }
1534                 assert(consistentStrata());
1535
1536                 size_t replPos = 0;
1537                 if(!hs_->empty() && hs_->tryReplacing(h.h, h.fw, h.cost, replPos)) {
1538                         if(replPos != 0xffffffff) {
1539                                 // Replaced an existing hit
1540                                 assert_lt(replPos, _bufferedHits.size());
1541                                 _bufferedHits[replPos] = h;
1542                                 hs_->sort();
1543                         }
1544                         // Size didn't change, so no need to check against _max and _n
1545                         assert(hs_->sorted());
1546                         assert(consistentStrata());
1547                 } else {
1548                         // Added a new hit
1549                         hs_->expand();
1550                         hs_->back().h = h.h;
1551                         hs_->back().fw = h.fw;
1552                         hs_->back().stratum = h.stratum;
1553                         hs_->back().cost = h.cost;
1554                         hitsForThisRead_++;
1555                         if(hs_->size() > _max) {
1556                                 assert_eq(hs_->size(), _bufferedHits.size());
1557                                 return true; // done - report nothing
1558                         }
1559                         _bufferedHits.push_back(h);
1560                         if(hsISz_ == 0 &&
1561                            hs_->size() == _n &&
1562                            (_max == 0xffffffff || _max < _n))
1563                         {
1564                                 assert_eq(hs_->size(), _bufferedHits.size());
1565                                 return true; // already reported N good hits; stop!
1566                         }
1567                         hs_->sort();
1568                         assert(consistentStrata());
1569                 }
1570                 assert_eq(hs_->size(), _bufferedHits.size());
1571                 updateCutoff();
1572                 return false; // not at N yet; keep going
1573         }
1574
1575         /**
1576          * Finalize current read by reporting any buffered hits from best
1577          * to worst until they're all reported or until we've reported all
1578          * N
1579          */
1580         virtual uint32_t finishReadImpl() {
1581                 assert_eq(1, mult_);
1582                 assert(hs_ != NULL);
1583                 assert(consistentStrata());
1584                 uint32_t ret = hitsForThisRead_;
1585                 hitsForThisRead_ = 0;
1586                 if(!hs_->empty() && hs_->size() < _n) {
1587                         const size_t sz = _bufferedHits.size();
1588                         for(size_t i = 0; i < sz; i++) {
1589                                 // Set 'oms' according to the number of other alignments
1590                                 // at this stratum
1591                                 _bufferedHits[i].oms = (sz / mult_) - 1;
1592                         }
1593                 }
1594                 std::sort(_bufferedHits.begin(), _bufferedHits.end(), HitCostCompare());
1595                 if(hs_->size() > _n) {
1596                         _bufferedHits.resize(_n);
1597                 }
1598                 assert(consistentStrata());
1599                 // Make sure that a chained, maxed-out read gets treated as
1600                 // maxed-out and not as unaligned
1601                 if(hs_->empty() && hs_->maxedStratum != -1) {
1602                         assert(_bufferedHits.empty());
1603                         // Boy, this is stupid.  Need to switch to just using HitSet
1604                         // internally all the time.
1605                         _bufferedHits.resize(_max+1);
1606                         for(size_t i = 0; i < _max+1; i++) {
1607                                 _bufferedHits[i].stratum = hs_->maxedStratum;
1608                         }
1609                         ret = _max+1;
1610                 } else if(!hs_->empty() && hs_->maxedStratum != -1) {
1611                         assert_lt(hs_->front().stratum, hs_->maxedStratum);
1612                 }
1613                 return ret;
1614         }
1615
1616         /**
1617          * Set the initial set of Hits.
1618          */
1619         virtual bool setHits(HitSet& hs) {
1620                 hs_ = &hs;
1621                 assert(hs_ != NULL);
1622                 hsISz_ = hs.size();
1623                 cutoff_ = 0xffff;
1624                 hitsForThisRead_ = hs.size();
1625                 assert(_bufferedHits.empty());
1626                 assert_geq(hs.maxedStratum, -1);
1627                 assert_lt(hs.maxedStratum, 4);
1628                 if(!hs.empty()) {
1629                         assert_eq(-1, hs.maxedStratum);
1630                         hs.sort();
1631                         Hit::fromHitSet(_bufferedHits, hs);
1632                         assert(!_bufferedHits.empty());
1633                         assert_leq(_bufferedHits.size(), _max);
1634                         assert(consistentStrata());
1635                 } else if(hs.maxedStratum != -1) {
1636                         if(hs.maxedStratum == 0) {
1637                                 cutoff_ = 0;
1638                                 // Already done
1639                                 return true;
1640                         }
1641                         cutoff_ = (hs.maxedStratum << 14);
1642                 }
1643                 assert_eq(hs_->size(), _bufferedHits.size());
1644                 updateCutoff();
1645                 return false;
1646         }
1647
1648         /**
1649          * If we had any alignments at all and we're now moving on to a new
1650          * stratum, then we're done.
1651          */
1652         virtual bool finishedWithStratumImpl(int stratum) {
1653                 assert(false);
1654                 return false;
1655         }
1656
1657         /**
1658          * If there have been any hits reported so far, classify any
1659          * subsequent alignments with higher strata as irrelevant.
1660          */
1661         virtual bool irrelevantCost(uint16_t cost) {
1662                 if(cutoff_ == 0) return false;
1663                 return cost > cutoff_;
1664         }
1665
1666 protected:
1667
1668 #ifndef NDEBUG
1669         /**
1670          * Sanity check that, if we're in strata_ mode, all 'stratum's
1671          * should be the same among hits.
1672          */
1673         bool consistentStrata() {
1674                 if(hs_->empty() || !strata_) return true;
1675                 int stratum = hs_->front().stratum;
1676                 for(size_t i = 1; i < hs_->size(); i++) {
1677                         assert_eq(stratum, (*hs_)[i].stratum);
1678                 }
1679                 return true;
1680         }
1681 #endif
1682
1683         /**
1684          * Update the lowest relevant cost.
1685          */
1686         void updateCutoff() {
1687                 ASSERT_ONLY(uint16_t origCutoff = cutoff_);
1688                 assert(hs_->sorted());
1689                 assert(hs_->empty() || hs_->back().cost >= hs_->front().cost);
1690                 bool atCapacity = (hs_->size() >= _n);
1691                 if(atCapacity && (_max == 0xffffffff || _max < _n)) {
1692                         cutoff_ = min(hs_->back().cost, cutoff_);
1693                 }
1694                 if(strata_ && !hs_->empty()) {
1695                         uint16_t sc = hs_->back().cost;
1696                         sc = ((sc >> 14) + 1) << 14;
1697                         cutoff_ = min(cutoff_, sc);
1698                         assert_leq(cutoff_, origCutoff);
1699                 }
1700         }
1701
1702         HitSet *hs_;
1703         size_t hsISz_;
1704         uint32_t mult_;
1705         bool strata_; /// true -> reporting is stratified
1706         uint16_t cutoff_; /// the smallest irrelevant cost
1707 };
1708
1709 /**
1710  * Concrete factory for ChainingHitSinkPerThread.
1711  */
1712 class ChainingHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1713 public:
1714         ChainingHitSinkPerThreadFactory(
1715                         HitSink& sink,
1716                         uint32_t n,
1717                         uint32_t max,
1718                         bool strata) :
1719                         sink_(sink),
1720                         n_(n),
1721                         max_(max),
1722                         strata_(strata)
1723         { }
1724
1725         /**
1726          * Allocate a new NGoodHitSinkPerThread object on the heap,
1727          * using the parameters given in the constructor.
1728          */
1729         virtual HitSinkPerThread* create() const {
1730                 return new ChainingHitSinkPerThread(sink_, n_, max_, strata_, 1);
1731         }
1732         virtual HitSinkPerThread* createMult(uint32_t m) const {
1733                 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1734                 uint32_t n = n_ * (n_ == 0xffffffff ? 1 : m);
1735                 return new ChainingHitSinkPerThread(sink_, n, max, strata_, m);
1736         }
1737
1738 private:
1739         HitSink& sink_;
1740         uint32_t n_;
1741         uint32_t max_;
1742         bool strata_;
1743 };
1744
1745 /**
1746  * Report all valid alignments.
1747  */
1748 class AllHitSinkPerThread : public HitSinkPerThread {
1749
1750 public:
1751         AllHitSinkPerThread(
1752                         HitSink& sink,
1753                 uint32_t max) :
1754                     HitSinkPerThread(sink, max, 0xffffffff) { }
1755
1756         virtual bool spanStrata() {
1757                 return true; // we span strata
1758         }
1759
1760         virtual bool best() {
1761                 return true; // we report "best" hits
1762         }
1763
1764         /**
1765          * Report and always return true; we're finiding all hits so that
1766          * search routine should always continue.
1767          */
1768         virtual bool reportHit(const Hit& h, int stratum) {
1769                 HitSinkPerThread::reportHit(h, stratum);
1770                 hitsForThisRead_++;
1771                 if(hitsForThisRead_ > _max) {
1772                         return true; // done - report nothing
1773                 }
1774                 bufferHit(h, stratum);
1775                 return false; // reporting all; always keep going
1776         }
1777
1778         /**
1779          * Finalize; do nothing because we haven't buffered anything
1780          */
1781         virtual uint32_t finishReadImpl() {
1782                 uint32_t ret = hitsForThisRead_;
1783                 hitsForThisRead_ = 0;
1784                 return ret;
1785         }
1786
1787         /**
1788          * Always return false; search routine should not stop.
1789          */
1790         virtual bool finishedWithStratumImpl(int stratum) { return false; }
1791 };
1792
1793 /**
1794  * Concrete factory for AllHitSinkPerThread.
1795  */
1796 class AllHitSinkPerThreadFactory : public HitSinkPerThreadFactory {
1797 public:
1798         AllHitSinkPerThreadFactory(
1799                         HitSink& sink,
1800                         uint32_t max) :
1801                         sink_(sink),
1802                         max_(max)
1803         { }
1804
1805         /**
1806          * Allocate a new NGoodHitSinkPerThread object on the heap,
1807          * using the parameters given in the constructor.
1808          */
1809         virtual HitSinkPerThread* create() const {
1810                 return new AllHitSinkPerThread(sink_, max_);
1811         }
1812         virtual HitSinkPerThread* createMult(uint32_t m) const {
1813                 uint32_t max = max_ * (max_ == 0xffffffff ? 1 : m);
1814                 return new AllHitSinkPerThread(sink_, max);
1815         }
1816
1817 private:
1818         HitSink& sink_;
1819         uint32_t max_;
1820 };
1821
1822 /**
1823  * Sink that prints lines like this:
1824  * (pat-id)[-|+]:<hit1-text-id,hit2-text-offset>,<hit2-text-id...
1825  *
1826  * Activated with --concise
1827  */
1828 class ConciseHitSink : public HitSink {
1829 public:
1830         /**
1831          * Construct a single-stream ConciseHitSink (default)
1832          */
1833         ConciseHitSink(OutFileBuf* out,
1834                                int offBase,
1835                        DECL_HIT_DUMPS2,
1836                        bool reportOpps = false) :
1837                 HitSink(out, PASS_HIT_DUMPS2),
1838                 _reportOpps(reportOpps),
1839                 offBase_(offBase) { }
1840
1841         /**
1842          * Construct a multi-stream ConciseHitSink with one stream per
1843          * reference string (see --refout)
1844          */
1845         ConciseHitSink(size_t numOuts,
1846                        int offBase,
1847                        DECL_HIT_DUMPS2,
1848                        bool reportOpps = false) :
1849                 HitSink(numOuts, PASS_HIT_DUMPS2),
1850                 _reportOpps(reportOpps),
1851                 offBase_(offBase) { }
1852
1853         /**
1854          * Append a verbose, readable hit to the given output stream.
1855          */
1856         static void append(ostream& ss, const Hit& h, int offBase, bool reportOpps) {
1857                 ss << h.patId;
1858                 if(h.mate > 0) {
1859                         assert(h.mate == 1 || h.mate == 2);
1860                         ss << '/' << (int)h.mate;
1861                 }
1862                 ss << (h.fw? "+" : "-") << ":";
1863                 // .first is text id, .second is offset
1864                 ss << "<" << h.h.first << "," << (h.h.second + offBase) << "," << h.mms.count();
1865                 if(reportOpps) ss << "," << h.oms;
1866                 ss << ">" << endl;
1867         }
1868
1869         /**
1870          * Append a verbose, readable hit to the given output stream.
1871          */
1872         void append(ostream& ss, const Hit& h) {
1873                 ConciseHitSink::append(ss, h, this->offBase_, this->_reportOpps);
1874         }
1875
1876 protected:
1877
1878         /**
1879          * Report a concise alignment to the appropriate output stream.
1880          */
1881         virtual void reportHit(const Hit& h) {
1882                 HitSink::reportHit(h);
1883                 ostringstream ss;
1884                 append(ss, h);
1885                 lock(h.h.first);
1886                 out(h.h.first).writeString(ss.str());
1887                 unlock(h.h.first);
1888         }
1889
1890 private:
1891         bool _reportOpps;
1892         int  offBase_;     /// Add this to reference offsets before outputting.
1893                            /// (An easy way to make things 1-based instead of
1894                            /// 0-based)
1895 };
1896
1897 /**
1898  * Print the given string.  If ws = true, print only up to and not
1899  * including the first space or tab.  Useful for printing reference
1900  * names.
1901  */
1902 inline void printUptoWs(std::ostream& os, const std::string& str, bool ws) {
1903         if(!ws) {
1904                 os << str;
1905         } else {
1906                 size_t pos = str.find_first_of(" \t");
1907                 if(pos != string::npos) {
1908                         os << str.substr(0, pos);
1909                 } else {
1910                         os << str;
1911                 }
1912         }
1913 }
1914
1915 /**
1916  * Sink that prints lines like this:
1917  * pat-name \t [-|+] \t ref-name \t ref-off \t pat \t qual \t #-alt-hits \t mm-list
1918  */
1919 class VerboseHitSink : public HitSink {
1920 public:
1921         /**
1922          * Construct a single-stream VerboseHitSink (default)
1923          */
1924         VerboseHitSink(OutFileBuf* out,
1925                        int offBase,
1926                        bool colorSeq,
1927                        bool colorQual,
1928                        bool printCost,
1929                        const Bitset& suppressOuts,
1930                        ReferenceMap *rmap,
1931                        AnnotationMap *amap,
1932                        bool fullRef,
1933                        DECL_HIT_DUMPS2,
1934                                    int partition = 0) :
1935         HitSink(out, PASS_HIT_DUMPS2),
1936         partition_(partition),
1937         offBase_(offBase),
1938         colorSeq_(colorSeq),
1939         colorQual_(colorQual),
1940         cost_(printCost),
1941         suppress_(suppressOuts),
1942         fullRef_(fullRef),
1943         rmap_(rmap), amap_(amap)
1944         { }
1945
1946         /**
1947          * Construct a multi-stream VerboseHitSink with one stream per
1948          * reference string (see --refout)
1949          */
1950         VerboseHitSink(size_t numOuts,
1951                        int offBase,
1952                        bool colorSeq,
1953                        bool colorQual,
1954                        bool printCost,
1955                        const Bitset& suppressOuts,
1956                        ReferenceMap *rmap,
1957                        AnnotationMap *amap,
1958                        bool fullRef,
1959                        DECL_HIT_DUMPS2,
1960                                    int partition = 0) :
1961         HitSink(numOuts, PASS_HIT_DUMPS2),
1962         partition_(partition),
1963         offBase_(offBase),
1964         colorSeq_(colorSeq),
1965         colorQual_(colorQual),
1966         cost_(printCost),
1967         suppress_(64),
1968         fullRef_(fullRef),
1969         rmap_(rmap),
1970         amap_(amap)
1971         { }
1972
1973         // In hit.cpp
1974         static void append(ostream& ss,
1975                            const Hit& h,
1976                            const vector<string>* refnames,
1977                            ReferenceMap *rmap,
1978                            AnnotationMap *amap,
1979                            bool fullRef,
1980                            int partition,
1981                            int offBase,
1982                            bool colorSeq,
1983                            bool colorQual,
1984                            bool cost,
1985                            const Bitset& suppress);
1986
1987         /**
1988          * Append a verbose, readable hit to the output stream
1989          * corresponding to the hit.
1990          */
1991         virtual void append(ostream& ss, const Hit& h) {
1992                 VerboseHitSink::append(ss, h, _refnames, rmap_, amap_,
1993                                        fullRef_, partition_, offBase_,
1994                                        colorSeq_, colorQual_, cost_,
1995                                        suppress_);
1996         }
1997
1998         /**
1999          * See hit.cpp
2000          */
2001         virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p);
2002
2003 protected:
2004
2005         /**
2006          * Report a verbose, human-readable alignment to the appropriate
2007          * output stream.
2008          */
2009         virtual void reportHit(const Hit& h) {
2010                 reportHit(h, true);
2011         }
2012
2013         /**
2014          * Report a verbose, human-readable alignment to the appropriate
2015          * output stream.
2016          */
2017         virtual void reportHit(const Hit& h, bool count) {
2018                 if(count) HitSink::reportHit(h);
2019                 ostringstream ss;
2020                 append(ss, h);
2021                 // Make sure to grab lock before writing to output stream
2022                 lock(h.h.first);
2023                 out(h.h.first).writeString(ss.str());
2024                 unlock(h.h.first);
2025         }
2026
2027 private:
2028         int      partition_;   /// partition size, or 0 if partitioning is disabled
2029         int      offBase_;     /// Add this to reference offsets before outputting.
2030                                /// (An easy way to make things 1-based instead of
2031                                /// 0-based)
2032         bool     colorSeq_;    /// true -> print colorspace alignment sequence in colors
2033         bool     colorQual_;   /// true -> print colorspace quals as originals, not decoded
2034         bool     cost_;        /// true -> print statum and cost
2035         Bitset   suppress_;    /// output fields to suppress
2036         bool fullRef_;         /// print full reference name
2037         ReferenceMap *rmap_;   /// mapping to reference coordinate system.
2038         AnnotationMap *amap_;  ///
2039 };
2040
2041 /**
2042  * Sink for outputting alignments in a binary format.
2043  */
2044 class ChainingHitSink : public HitSink {
2045 public:
2046
2047         /**
2048          * Construct a single-stream BinaryHitSink (default)
2049          */
2050         ChainingHitSink(OutFileBuf* out, bool strata, AnnotationMap *amap, DECL_HIT_DUMPS2) :
2051         HitSink(out, PASS_HIT_DUMPS2), amap_(amap), strata_(strata)
2052         {
2053                 ssmode_ |= ios_base::binary;
2054         }
2055
2056         /**
2057          * Report a batch of hits.
2058          */
2059         virtual void reportHits(vector<Hit>& hs);
2060
2061         /**
2062          * Append a binary alignment to the output stream corresponding to
2063          * the reference sequence involved.
2064          */
2065         virtual void append(ostream& o, const Hit& h) {
2066                 cerr << "Error: ChainingHitSink::append() not implemented" << endl;
2067                 throw 1;
2068         }
2069
2070         /**
2071          * See hit.cpp
2072          */
2073         virtual void reportMaxed(vector<Hit>& hs, PatternSourcePerThread& p);
2074
2075         /**
2076          * See hit.cpp
2077          */
2078         virtual void reportUnaligned(PatternSourcePerThread& p);
2079
2080 protected:
2081         AnnotationMap *amap_;
2082         bool strata_;
2083 };
2084
2085 /**
2086  * Sink that does nothing.
2087  */
2088 class StubHitSink : public HitSink {
2089 public:
2090         StubHitSink() : HitSink(new OutFileBuf(".tmp"), "", "", "", false, false, NULL) { }
2091         virtual void append(ostream& o, const Hit& h) { }
2092 };
2093
2094 #endif /*HIT_H_*/
2095