Commit patch to not break on spaces.
[bowtie.git] / ebwt_search.cpp
1 #include <stdlib.h>
2 #include <iostream>
3 #include <fstream>
4 #include <string>
5 #include <algorithm>
6 #include <cassert>
7 #include <stdexcept>
8 #include <seqan/find.h>
9 #include <getopt.h>
10 #include <vector>
11 #include <pthread.h>
12 #include "alphabet.h"
13 #include "assert_helpers.h"
14 #include "endian_swap.h"
15 #include "ebwt.h"
16 #include "formats.h"
17 #include "sequence_io.h"
18 #include "tokenize.h"
19 #include "hit.h"
20 #include "pat.h"
21 #include "bitset.h"
22 #include "threading.h"
23 #include "range_cache.h"
24 #include "refmap.h"
25 #include "annot.h"
26 #include "aligner.h"
27 #include "aligner_0mm.h"
28 #include "aligner_1mm.h"
29 #include "aligner_23mm.h"
30 #include "aligner_seed_mm.h"
31 #include "aligner_metrics.h"
32 #include "sam.h"
33 #ifdef CHUD_PROFILING
34 #include <CHUD/CHUD.h>
35 #endif
36
37 using namespace std;
38 using namespace seqan;
39
40 static vector<string> mates1;  // mated reads (first mate)
41 static vector<string> mates2;  // mated reads (second mate)
42 static vector<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
43 static string adjustedEbwtFileBase;
44 static bool verbose;      // be talkative
45 static bool startVerbose; // be talkative at startup
46 bool quiet;        // print nothing but the alignments
47 static int sanityCheck;   // enable expensive sanity checks
48 static int format;        // default read format is FASTQ
49 static string origString; // reference text, or filename(s)
50 static int seed;          // srandom() seed
51 static int timing;        // whether to report basic timing data
52 static bool allHits;      // for multihits, report just one
53 static bool rangeMode;    // report BWT ranges instead of ref locs
54 static int showVersion;   // just print version and quit?
55 static int ipause;        // pause before maching?
56 static uint32_t qUpto;    // max # of queries to read
57 static int trim5;         // amount to trim from 5' end
58 static int trim3;         // amount to trim from 3' end
59 static int reportOpps;    // whether to report # of other mappings
60 static int offRate;       // keep default offRate
61 static int isaRate;       // keep default isaRate
62 static int mismatches;    // allow 0 mismatches by default
63 static char *patDumpfile; // filename to dump patterns to
64 static bool solexaQuals;  // quality strings are solexa quals, not phred, and subtract 64 (not 33)
65 static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
66 static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
67 static int maqLike;       // do maq-like searching
68 static int seedLen;       // seed length (changed in Maq 0.6.4 from 24)
69 static int seedMms;       // # mismatches allowed in seed (maq's -n)
70 static int qualThresh;    // max qual-weighted hamming dist (maq's -e)
71 static int maxBtsBetter;  // max # backtracks allowed in half-and-half mode
72 static int maxBts;        // max # backtracks allowed in half-and-half mode
73 static int nthreads;      // number of pthreads operating concurrently
74 static output_types outType;  // style of output
75 static bool randReadsNoSync;  // true -> generate reads from per-thread random source
76 static int numRandomReads;    // # random reads (see Random*PatternSource in pat.h)
77 static int lenRandomReads;    // len of random reads (see Random*PatternSource in pat.h)
78 static bool noRefNames;       // true -> print reference indexes; not names
79 static string dumpAlBase;     // basename of same-format files to dump aligned reads to
80 static string dumpUnalBase;   // basename of same-format files to dump unaligned reads to
81 static string dumpMaxBase;    // basename of same-format files to dump reads with more than -m valid alignments to
82 static uint32_t khits;  // number of hits per read; >1 is much slower
83 static uint32_t mhits;  // don't report any hits if there are > mhits
84 static bool better;     // true -> guarantee alignments from best possible stratum
85 static bool strata;     // true -> don't stop at stratum boundaries
86 static bool refOut;     // if true, alignments go to per-ref files
87 static int partitionSz; // output a partitioning key in first field
88 static bool noMaqRound; // true -> don't round quals to nearest 10 like maq
89 static bool useSpinlock;  // false -> don't use of spinlocks even if they're #defines
90 static bool fileParallel; // separate threads read separate input files in parallel
91 static bool useShmem;     // use shared memory to hold the index
92 static bool useMm;        // use memory-mapped files to hold the index
93 static bool mmSweep;      // sweep through memory-mapped files immediately after mapping
94 static bool stateful;     // use stateful aligners
95 static uint32_t prefetchWidth; // number of reads to process in parallel w/ --stateful
96 static uint32_t minInsert;     // minimum insert size (Maq = 0, SOAP = 400)
97 static uint32_t maxInsert;     // maximum insert size (Maq = 250, SOAP = 600)
98 static bool mate1fw;           // -1 mate aligns in fw orientation on fw strand
99 static bool mate2fw;           // -2 mate aligns in rc orientation on fw strand
100 static bool mateFwSet;         // true -> user set --ff/--fr/--rf
101 static uint32_t mixedThresh;   // threshold for when to switch to paired-end mixed mode (see aligner.h)
102 static uint32_t mixedAttemptLim; // number of attempts to make in "mixed mode" before giving up on orientation
103 static bool dontReconcileMates;  // suppress pairwise all-versus-all way of resolving mates
104 static uint32_t cacheLimit;      // ranges w/ size > limit will be cached
105 static uint32_t cacheSize;       // # words per range cache
106 static int offBase;              // offsets are 0-based by default, but configurable
107 static bool tryHard;             // set very high maxBts, mixedAttemptLim
108 static uint32_t skipReads;       // # reads/read pairs to skip
109 static bool nofw; // don't align fw orientation of read
110 static bool norc; // don't align rc orientation of read
111 static bool strandFix;  // attempt to fix strand bias
112 static bool randomizeQuals; // randomize quality values
113 static bool stats; // print performance stats
114 static int chunkPoolMegabytes;    // max MB to dedicate to best-first search frames per thread
115 static int chunkSz;    // size of single chunk disbursed by ChunkPool
116 static bool chunkVerbose; // have chunk allocator output status messages?
117 static bool recal;
118 static int recalMaxCycle;
119 static int recalMaxQual;
120 static int recalQualShift;
121 static bool useV1;
122 static bool reportSe;
123 static const char * refMapFile;  // file containing a map from index coordinates to another coordinate system
124 static const char * annotMapFile;  // file containing a map from reference coordinates to annotations
125 static size_t fastaContLen;
126 static size_t fastaContFreq;
127 static bool hadoopOut; // print Hadoop status and summary messages
128 static bool fuzzy;
129 static bool fullRef;
130 static bool samNoHead; // don't print any header lines in SAM output
131 static bool samNoSQ;   // don't print @SQ header lines
132 bool color;     // true -> inputs are colorspace
133 bool colorExEnds; // true -> nucleotides on either end of decoded cspace alignment should be excluded
134 static string rgs; // SAM outputs for @RG header line
135 int snpPhred; // probability of SNP, for scoring colorspace alignments
136 static Bitset suppressOuts(64); // output fields to suppress
137 static bool sampleMax; // whether to report a random alignment when maxed-out via -m/-M
138 static int defaultMapq; // default mapping quality to print in SAM mode
139 bool colorSeq; // true -> show colorspace alignments as colors, not decoded bases
140 bool colorQual; // true -> show colorspace qualities as original quals, not decoded quals
141 static bool printCost; // true -> print stratum and cost
142 bool showSeed;
143 static vector<string> qualities;
144 static vector<string> qualities1;
145 static vector<string> qualities2;
146 MUTEX_T gLock;
147
148 static void resetOptions() {
149         mates1.clear();
150         mates2.clear();
151         mates12.clear();
152         adjustedEbwtFileBase    = "";
153         verbose                                 = 0;
154         startVerbose                    = 0;
155         quiet                                   = false;
156         sanityCheck                             = 0;  // enable expensive sanity checks
157         format                                  = FASTQ; // default read format is FASTQ
158         origString                              = ""; // reference text, or filename(s)
159         seed                                    = 0; // srandom() seed
160         timing                                  = 0; // whether to report basic timing data
161         allHits                                 = false; // for multihits, report just one
162         rangeMode                               = false; // report BWT ranges instead of ref locs
163         showVersion                             = 0; // just print version and quit?
164         ipause                                  = 0; // pause before maching?
165         qUpto                                   = 0xffffffff; // max # of queries to read
166         trim5                                   = 0; // amount to trim from 5' end
167         trim3                                   = 0; // amount to trim from 3' end
168         reportOpps                              = 0; // whether to report # of other mappings
169         offRate                                 = -1; // keep default offRate
170         isaRate                                 = -1; // keep default isaRate
171         mismatches                              = 0; // allow 0 mismatches by default
172         patDumpfile                             = NULL; // filename to dump patterns to
173         solexaQuals                             = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
174         phred64Quals                    = false; // quality chars are phred, but must subtract 64 (not 33)
175         integerQuals                    = false; // quality strings are space-separated strings of integers, not ASCII
176         maqLike                                 = 1;   // do maq-like searching
177         seedLen                                 = 28;  // seed length (changed in Maq 0.6.4 from 24)
178         seedMms                                 = 2;   // # mismatches allowed in seed (maq's -n)
179         qualThresh                              = 70;  // max qual-weighted hamming dist (maq's -e)
180         maxBtsBetter                    = 125; // max # backtracks allowed in half-and-half mode
181         maxBts                                  = 800; // max # backtracks allowed in half-and-half mode
182         nthreads                                = 1;     // number of pthreads operating concurrently
183         outType                                 = OUTPUT_FULL;  // style of output
184         randReadsNoSync                 = false; // true -> generate reads from per-thread random source
185         numRandomReads                  = 50000000; // # random reads (see Random*PatternSource in pat.h)
186         lenRandomReads                  = 35;    // len of random reads (see Random*PatternSource in pat.h)
187         noRefNames                              = false; // true -> print reference indexes; not names
188         dumpAlBase                              = "";    // basename of same-format files to dump aligned reads to
189         dumpUnalBase                    = "";    // basename of same-format files to dump unaligned reads to
190         dumpMaxBase                             = "";    // basename of same-format files to dump reads with more than -m valid alignments to
191         khits                                   = 1;     // number of hits per read; >1 is much slower
192         mhits                                   = 0xffffffff; // don't report any hits if there are > mhits
193         better                                  = false; // true -> guarantee alignments from best possible stratum
194         strata                                  = false; // true -> don't stop at stratum boundaries
195         refOut                                  = false; // if true, alignments go to per-ref files
196         partitionSz                             = 0;     // output a partitioning key in first field
197         noMaqRound                              = false; // true -> don't round quals to nearest 10 like maq
198         useSpinlock                             = true;  // false -> don't use of spinlocks even if they're #defines
199         fileParallel                    = false; // separate threads read separate input files in parallel
200         useShmem                                = false; // use shared memory to hold the index
201         useMm                                   = false; // use memory-mapped files to hold the index
202         mmSweep                                 = false; // sweep through memory-mapped files immediately after mapping
203         stateful                                = false; // use stateful aligners
204         prefetchWidth                   = 1;     // number of reads to process in parallel w/ --stateful
205         minInsert                               = 0;     // minimum insert size (Maq = 0, SOAP = 400)
206         maxInsert                               = 250;   // maximum insert size (Maq = 250, SOAP = 600)
207         mate1fw                                 = true;  // -1 mate aligns in fw orientation on fw strand
208         mate2fw                                 = false; // -2 mate aligns in rc orientation on fw strand
209         mateFwSet                               = false; // true -> user set mate1fw/mate2fw with --ff/--fr/--rf
210         mixedThresh                             = 4;     // threshold for when to switch to paired-end mixed mode (see aligner.h)
211         mixedAttemptLim                 = 100;   // number of attempts to make in "mixed mode" before giving up on orientation
212         dontReconcileMates              = true;  // suppress pairwise all-versus-all way of resolving mates
213         cacheLimit                              = 5;     // ranges w/ size > limit will be cached
214         cacheSize                               = 0;     // # words per range cache
215         offBase                                 = 0;     // offsets are 0-based by default, but configurable
216         tryHard                                 = false; // set very high maxBts, mixedAttemptLim
217         skipReads                               = 0;     // # reads/read pairs to skip
218         nofw                                    = false; // don't align fw orientation of read
219         norc                                    = false; // don't align rc orientation of read
220         strandFix                               = true;  // attempt to fix strand bias
221         randomizeQuals                  = false; // randomize quality values
222         stats                                   = false; // print performance stats
223         chunkPoolMegabytes              = 64;    // max MB to dedicate to best-first search frames per thread
224         chunkSz                                 = 256;   // size of single chunk disbursed by ChunkPool (in KB)
225         chunkVerbose                    = false; // have chunk allocator output status messages?
226         recal                                   = false;
227         recalMaxCycle                   = 64;
228         recalMaxQual                    = 40;
229         recalQualShift                  = 2;
230         useV1                                   = true;
231         reportSe                                = false;
232         refMapFile                              = NULL;  // file containing a map from index coordinates to another coordinate system
233         annotMapFile                    = NULL;  // file containing a map from reference coordinates to annotations
234         fastaContLen                    = 0;
235         fastaContFreq                   = 0;
236         hadoopOut                               = false; // print Hadoop status and summary messages
237         fuzzy                                   = false; // reads will have alternate basecalls w/ qualities
238         fullRef                                 = false; // print entire reference name instead of just up to 1st space
239         samNoHead                               = false; // don't print any header lines in SAM output
240         samNoSQ                                 = false; // don't print @SQ header lines
241         color                                   = false; // don't align in colorspace by default
242         colorExEnds                             = true;  // true -> nucleotides on either end of decoded cspace alignment should be excluded
243         rgs                                             = "";    // SAM outputs for @RG header line
244         snpPhred                                = 30;    // probability of SNP, for scoring colorspace alignments
245         suppressOuts.clear();            // output fields to suppress
246         sampleMax                               = false;
247         defaultMapq                             = 255;
248         colorSeq                                = false; // true -> show colorspace alignments as colors, not decoded bases
249         colorQual                               = false; // true -> show colorspace qualities as original quals, not decoded quals
250         printCost                               = false; // true -> print cost and stratum
251         showSeed                                = false; // true -> print per-read pseudo-random seed
252         qualities.clear();
253         qualities1.clear();
254         qualities2.clear();
255         MUTEX_INIT(gLock);
256 }
257
258 // mating constraints
259
260 static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:M:1:2:I:X:x:B:ySCQ:";
261
262 enum {
263         ARG_ORIG = 256,
264         ARG_SEED,
265         ARG_DUMP_PATS,
266         ARG_RANGE,
267         ARG_CONCISE,
268         ARG_SOLEXA_QUALS,
269         ARG_MAXBTS,
270         ARG_VERBOSE,
271         ARG_STARTVERBOSE,
272         ARG_QUIET,
273         ARG_RANDOM_READS,
274         ARG_RANDOM_READS_NOSYNC,
275         ARG_NOOUT,
276         ARG_FAST,
277         ARG_AL,
278         ARG_UN,
279         ARG_MAXDUMP,
280         ARG_REFIDX,
281         ARG_SANITY,
282         ARG_OLDBEST,
283         ARG_BETTER,
284         ARG_BEST,
285         ARG_REFOUT,
286         ARG_ISARATE,
287         ARG_PARTITION,
288         ARG_integerQuals,
289         ARG_NOMAQROUND,
290         ARG_USE_SPINLOCK,
291         ARG_FILEPAR,
292         ARG_SHMEM,
293         ARG_MM,
294         ARG_MMSWEEP,
295         ARG_STATEFUL,
296         ARG_PREFETCH_WIDTH,
297         ARG_FF,
298         ARG_FR,
299         ARG_RF,
300         ARG_MIXED_ATTEMPTS,
301         ARG_NO_RECONCILE,
302         ARG_CACHE_LIM,
303         ARG_CACHE_SZ,
304         ARG_NO_FW,
305         ARG_NO_RC,
306         ARG_SKIP,
307         ARG_STRAND_FIX,
308         ARG_RANDOMIZE_QUALS,
309         ARG_STATS,
310         ARG_ONETWO,
311         ARG_PHRED64,
312         ARG_PHRED33,
313         ARG_CHUNKMBS,
314         ARG_CHUNKSZ,
315         ARG_CHUNKVERBOSE,
316         ARG_RECAL,
317         ARG_STRATA,
318         ARG_PEV2,
319         ARG_CHAINOUT,
320         ARG_CHAININ,
321         ARG_REFMAP,
322         ARG_ANNOTMAP,
323         ARG_REPORTSE,
324         ARG_HADOOPOUT,
325         ARG_FUZZY,
326         ARG_FULLREF,
327         ARG_USAGE,
328         ARG_SNPPHRED,
329         ARG_SNPFRAC,
330         ARG_SAM_NOHEAD,
331         ARG_SAM_NOSQ,
332         ARG_SAM_RG,
333         ARG_SUPPRESS_FIELDS,
334         ARG_DEFAULT_MAPQ,
335         ARG_COLOR_SEQ,
336         ARG_COLOR_QUAL,
337         ARG_COST,
338         ARG_COLOR_KEEP_ENDS,
339         ARG_SHOWSEED,
340         ARG_QUALS1,
341         ARG_QUALS2
342 };
343
344 static struct option long_options[] = {
345         {(char*)"verbose",      no_argument,       0,            ARG_VERBOSE},
346         {(char*)"startverbose", no_argument,       0,            ARG_STARTVERBOSE},
347         {(char*)"quiet",        no_argument,       0,            ARG_QUIET},
348         {(char*)"sanity",       no_argument,       0,            ARG_SANITY},
349         {(char*)"pause",        no_argument,       &ipause,      1},
350         {(char*)"orig",         required_argument, 0,            ARG_ORIG},
351         {(char*)"all",          no_argument,       0,            'a'},
352         {(char*)"concise",      no_argument,       0,            ARG_CONCISE},
353         {(char*)"noout",        no_argument,       0,            ARG_NOOUT},
354         {(char*)"solexa-quals", no_argument,       0,            ARG_SOLEXA_QUALS},
355         {(char*)"integer-quals",no_argument,       0,            ARG_integerQuals},
356         {(char*)"time",         no_argument,       0,            't'},
357         {(char*)"trim3",        required_argument, 0,            '3'},
358         {(char*)"trim5",        required_argument, 0,            '5'},
359         {(char*)"seed",         required_argument, 0,            ARG_SEED},
360         {(char*)"qupto",        required_argument, 0,            'u'},
361         {(char*)"al",           required_argument, 0,            ARG_AL},
362         {(char*)"un",           required_argument, 0,            ARG_UN},
363         {(char*)"max",          required_argument, 0,            ARG_MAXDUMP},
364         {(char*)"offrate",      required_argument, 0,            'o'},
365         {(char*)"isarate",      required_argument, 0,            ARG_ISARATE},
366         {(char*)"reportopps",   no_argument,       &reportOpps,  1},
367         {(char*)"version",      no_argument,       &showVersion, 1},
368         {(char*)"dumppats",     required_argument, 0,            ARG_DUMP_PATS},
369         {(char*)"maqerr",       required_argument, 0,            'e'},
370         {(char*)"seedlen",      required_argument, 0,            'l'},
371         {(char*)"seedmms",      required_argument, 0,            'n'},
372         {(char*)"filepar",      no_argument,       0,            ARG_FILEPAR},
373         {(char*)"help",         no_argument,       0,            'h'},
374         {(char*)"threads",      required_argument, 0,            'p'},
375         {(char*)"khits",        required_argument, 0,            'k'},
376         {(char*)"mhits",        required_argument, 0,            'm'},
377         {(char*)"minins",       required_argument, 0,            'I'},
378         {(char*)"maxins",       required_argument, 0,            'X'},
379         {(char*)"quals",        required_argument, 0,            'Q'},
380         {(char*)"Q1",           required_argument, 0,            ARG_QUALS1},
381         {(char*)"Q2",           required_argument, 0,            ARG_QUALS2},
382         {(char*)"best",         no_argument,       0,            ARG_BEST},
383         {(char*)"better",       no_argument,       0,            ARG_BETTER},
384         {(char*)"oldbest",      no_argument,       0,            ARG_OLDBEST},
385         {(char*)"strata",       no_argument,       0,            ARG_STRATA},
386         {(char*)"nomaqround",   no_argument,       0,            ARG_NOMAQROUND},
387         {(char*)"refidx",       no_argument,       0,            ARG_REFIDX},
388         {(char*)"range",        no_argument,       0,            ARG_RANGE},
389         {(char*)"maxbts",       required_argument, 0,            ARG_MAXBTS},
390         {(char*)"randread",     no_argument,       0,            ARG_RANDOM_READS},
391         {(char*)"randreadnosync", no_argument,     0,            ARG_RANDOM_READS_NOSYNC},
392         {(char*)"phased",       no_argument,       0,            'z'},
393         {(char*)"refout",       no_argument,       0,            ARG_REFOUT},
394         {(char*)"partition",    required_argument, 0,            ARG_PARTITION},
395         {(char*)"nospin",       no_argument,       0,            ARG_USE_SPINLOCK},
396         {(char*)"stateful",     no_argument,       0,            ARG_STATEFUL},
397         {(char*)"prewidth",     required_argument, 0,            ARG_PREFETCH_WIDTH},
398         {(char*)"ff",           no_argument,       0,            ARG_FF},
399         {(char*)"fr",           no_argument,       0,            ARG_FR},
400         {(char*)"rf",           no_argument,       0,            ARG_RF},
401         {(char*)"mixthresh",    required_argument, 0,            'x'},
402         {(char*)"pairtries",    required_argument, 0,            ARG_MIXED_ATTEMPTS},
403         {(char*)"noreconcile",  no_argument,       0,            ARG_NO_RECONCILE},
404         {(char*)"cachelim",     required_argument, 0,            ARG_CACHE_LIM},
405         {(char*)"cachesz",      required_argument, 0,            ARG_CACHE_SZ},
406         {(char*)"nofw",         no_argument,       0,            ARG_NO_FW},
407         {(char*)"norc",         no_argument,       0,            ARG_NO_RC},
408         {(char*)"offbase",      required_argument, 0,            'B'},
409         {(char*)"tryhard",      no_argument,       0,            'y'},
410         {(char*)"skip",         required_argument, 0,            's'},
411         {(char*)"strandfix",    no_argument,       0,            ARG_STRAND_FIX},
412         {(char*)"randquals",    no_argument,       0,            ARG_RANDOMIZE_QUALS},
413         {(char*)"stats",        no_argument,       0,            ARG_STATS},
414         {(char*)"12",           required_argument, 0,            ARG_ONETWO},
415         {(char*)"phred33-quals", no_argument,      0,            ARG_PHRED33},
416         {(char*)"phred64-quals", no_argument,      0,            ARG_PHRED64},
417         {(char*)"solexa1.3-quals", no_argument,    0,            ARG_PHRED64},
418         {(char*)"chunkmbs",     required_argument, 0,            ARG_CHUNKMBS},
419         {(char*)"chunksz",      required_argument, 0,            ARG_CHUNKSZ},
420         {(char*)"chunkverbose", no_argument,       0,            ARG_CHUNKVERBOSE},
421         {(char*)"mm",           no_argument,       0,            ARG_MM},
422         {(char*)"shmem",        no_argument,       0,            ARG_SHMEM},
423         {(char*)"mmsweep",      no_argument,       0,            ARG_MMSWEEP},
424         {(char*)"recal",        no_argument,       0,            ARG_RECAL},
425         {(char*)"pev2",         no_argument,       0,            ARG_PEV2},
426         {(char*)"chainout",     no_argument,       0,            ARG_CHAINOUT},
427         {(char*)"chainin",      no_argument,       0,            ARG_CHAININ},
428         {(char*)"refmap",       required_argument, 0,            ARG_REFMAP},
429         {(char*)"annotmap",     required_argument, 0,            ARG_ANNOTMAP},
430         {(char*)"reportse",     no_argument,       0,            ARG_REPORTSE},
431         {(char*)"hadoopout",    no_argument,       0,            ARG_HADOOPOUT},
432         {(char*)"fuzzy",        no_argument,       0,            ARG_FUZZY},
433         {(char*)"fullref",      no_argument,       0,            ARG_FULLREF},
434         {(char*)"usage",        no_argument,       0,            ARG_USAGE},
435         {(char*)"sam",          no_argument,       0,            'S'},
436         {(char*)"sam-nohead",   no_argument,       0,            ARG_SAM_NOHEAD},
437         {(char*)"sam-nosq",     no_argument,       0,            ARG_SAM_NOSQ},
438         {(char*)"sam-noSQ",     no_argument,       0,            ARG_SAM_NOSQ},
439         {(char*)"color",        no_argument,       0,            'C'},
440         {(char*)"sam-RG",       required_argument, 0,            ARG_SAM_RG},
441         {(char*)"snpphred",     required_argument, 0,            ARG_SNPPHRED},
442         {(char*)"snpfrac",      required_argument, 0,            ARG_SNPFRAC},
443         {(char*)"suppress",     required_argument, 0,            ARG_SUPPRESS_FIELDS},
444         {(char*)"mapq",         required_argument, 0,            ARG_DEFAULT_MAPQ},
445         {(char*)"col-cseq",     no_argument,       0,            ARG_COLOR_SEQ},
446         {(char*)"col-cqual",    no_argument,       0,            ARG_COLOR_QUAL},
447         {(char*)"col-keepends", no_argument,       0,            ARG_COLOR_KEEP_ENDS},
448         {(char*)"cost",         no_argument,       0,            ARG_COST},
449         {(char*)"showseed",     no_argument,       0,            ARG_SHOWSEED},
450         {(char*)0, 0, 0, 0} // terminator
451 };
452
453 /**
454  * Print a summary usage message to the provided output stream.
455  */
456 static void printUsage(ostream& out) {
457         out << "Usage: " << endl
458         << "  bowtie [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]" << endl
459         << endl
460             << "  <m1>    Comma-separated list of files containing upstream mates (or the" << endl
461             << "          sequences themselves, if -c is set) paired with mates in <m2>" << endl
462             << "  <m2>    Comma-separated list of files containing downstream mates (or the" << endl
463             << "          sequences themselves if -c is set) paired with mates in <m1>" << endl
464             << "  <r>     Comma-separated list of files containing Crossbow-style reads.  Can be" << endl
465             << "          a mixture of paired and unpaired.  Specify \"-\" for stdin." << endl
466             << "  <s>     Comma-separated list of files containing unpaired reads, or the" << endl
467             << "          sequences themselves, if -c is set.  Specify \"-\" for stdin." << endl
468             << "  <hit>   File to write hits to (default: stdout)" << endl
469             << "Input:" << endl
470             << "  -q                 query input files are FASTQ .fq/.fastq (default)" << endl
471             << "  -f                 query input files are (multi-)FASTA .fa/.mfa" << endl
472             << "  -r                 query input files are raw one-sequence-per-line" << endl
473             << "  -c                 query sequences given on cmd line (as <mates>, <singles>)" << endl
474             << "  -C                 reads and index are in colorspace" << endl
475             << "  -Q/--quals <file>  QV file(s) corresponding to CSFASTA inputs; use with -f -C" << endl
476             << "  --Q1/--Q2 <file>   same as -Q, but for mate files 1 and 2 respectively" << endl
477             << "  -s/--skip <int>    skip the first <int> reads/pairs in the input" << endl
478             << "  -u/--qupto <int>   stop after first <int> reads/pairs (excl. skipped reads)" << endl
479             << "  -5/--trim5 <int>   trim <int> bases from 5' (left) end of reads" << endl
480             << "  -3/--trim3 <int>   trim <int> bases from 3' (right) end of reads" << endl
481                 << "  --phred33-quals    input quals are Phred+33 (default)" << endl
482                 << "  --phred64-quals    input quals are Phred+64 (same as --solexa1.3-quals)" << endl
483                 << "  --solexa-quals     input quals are from GA Pipeline ver. < 1.3" << endl
484                 << "  --solexa1.3-quals  input quals are from GA Pipeline ver. >= 1.3" << endl
485                 << "  --integer-quals    qualities are given as space-separated integers (not ASCII)" << endl
486             << "Alignment:" << endl
487             << "  -v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities" << endl
488             << "    or" << endl
489             << "  -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)" << endl
490             << "  -e/--maqerr <int>  max sum of mismatch quals across alignment for -n (def: 70)" << endl
491             << "  -l/--seedlen <int> seed length for -n (default: 28)" << endl
492                 << "  --nomaqround       disable Maq-like quality rounding for -n (nearest 10 <= 30)" << endl
493             << "  -I/--minins <int>  minimum insert size for paired-end alignment (default: 0)" << endl
494             << "  -X/--maxins <int>  maximum insert size for paired-end alignment (default: 250)" << endl
495             << "  --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)" << endl
496             << "  --nofw/--norc      do not align to forward/reverse-complement reference strand" << endl
497             << "  --maxbts <int>     max # backtracks for -n 2/3 (default: 125, 800 for --best)" << endl
498             << "  --pairtries <int>  max # attempts to find mate for anchor hit (default: 100)" << endl
499             << "  -y/--tryhard       try hard to find valid alignments, at the expense of speed" << endl
500             << "  --chunkmbs <int>   max megabytes of RAM for best-first search frames (def: 64)" << endl
501             << "Reporting:" << endl
502             << "  -k <int>           report up to <int> good alignments per read (default: 1)" << endl
503             << "  -a/--all           report all alignments per read (much slower than low -k)" << endl
504             << "  -m <int>           suppress all alignments if > <int> exist (def: no limit)" << endl
505             << "  -M <int>           like -m, but reports 1 random hit (MAPQ=0); requires --best" << endl
506             << "  --best             hits guaranteed best stratum; ties broken by quality" << endl
507             << "  --strata           hits in sub-optimal strata aren't reported (requires --best)" << endl
508             << "Output:" << endl
509             << "  -t/--time          print wall-clock time taken by search phases" << endl
510             << "  -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)" << endl
511             << "  --quiet            print nothing but the alignments" << endl
512             << "  --refout           write alignments to files refXXXXX.map, 1 map per reference" << endl
513             << "  --refidx           refer to ref. seqs by 0-based index rather than name" << endl
514             << "  --al <fname>       write aligned reads/pairs to file(s) <fname>" << endl
515             << "  --un <fname>       write unaligned reads/pairs to file(s) <fname>" << endl
516             << "  --max <fname>      write reads/pairs over -m limit to file(s) <fname>" << endl
517             << "  --suppress <cols>  suppresses given columns (comma-delim'ed) in default output" << endl
518             << "  --fullref          write entire ref name (default: only up to 1st space)" << endl
519             << "Colorspace:" << endl
520             << "  --snpphred <int>   Phred penalty for SNP when decoding colorspace (def: 30)" << endl
521             << "     or" << endl
522             << "  --snpfrac <dec>    approx. fraction of SNP bases (e.g. 0.001); sets --snpphred" << endl
523             << "  --col-cseq         print aligned colorspace seqs as colors, not decoded bases" << endl
524             << "  --col-cqual        print original colorspace quals, not decoded quals" << endl
525             << "  --col-keepends     keep nucleotides at extreme ends of decoded alignment" << endl
526             << "SAM:" << endl
527             << "  -S/--sam           write hits in SAM format" << endl
528             << "  --mapq <int>       default mapping quality (MAPQ) to print for SAM alignments" << endl
529             << "  --sam-nohead       supppress header lines (starting with @) for SAM output" << endl
530             << "  --sam-nosq         supppress @SQ header lines for SAM output" << endl
531             << "  --sam-RG <text>    add <text> (usually \"lab=value\") to @RG line of SAM header" << endl
532             << "Performance:" << endl
533             << "  -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
534 #ifdef BOWTIE_PTHREADS
535             << "  -p/--threads <int> number of alignment threads to launch (default: 1)" << endl
536 #endif
537 #ifdef BOWTIE_MM
538             << "  --mm               use memory-mapped I/O for index; many 'bowtie's can share" << endl
539 #endif
540 #ifdef BOWTIE_SHARED_MEM
541             << "  --shmem            use shared mem for index; many 'bowtie's can share" << endl
542 #endif
543             << "Other:" << endl
544             << "  --seed <int>       seed for random number generator" << endl
545             << "  --verbose          verbose output (for debugging)" << endl
546             << "  --version          print version information and quit" << endl
547             << "  -h/--help          print this usage message" << endl
548             ;
549 }
550
551 /**
552  * Parse an int out of optarg and enforce that it be at least 'lower';
553  * if it is less than 'lower', than output the given error message and
554  * exit with an error and a usage message.
555  */
556 static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
557         long l;
558         char *endPtr= NULL;
559         l = strtol(arg, &endPtr, 10);
560         if (endPtr != NULL) {
561                 if (l < lower || l > upper) {
562                         cerr << errmsg << endl;
563                         printUsage(cerr);
564                         throw 1;
565                 }
566                 return (int32_t)l;
567         }
568         cerr << errmsg << endl;
569         printUsage(cerr);
570         throw 1;
571         return -1;
572 }
573
574 /**
575  * Parse from optarg by default.
576  */
577 static int parseInt(int lower, const char *errmsg) {
578         return parseInt(lower, INT_MAX, errmsg, optarg);
579 }
580
581 /**
582  * Upper is INT_MAX by default.
583  */
584 static int parseInt(int lower, const char *errmsg, const char *arg) {
585         return parseInt(lower, INT_MAX, errmsg, arg);
586 }
587
588 /**
589  * Upper is INT_MAX, parse from optarg by default.
590  */
591 static int parseInt(int lower, int upper, const char *errmsg) {
592         return parseInt(lower, upper, errmsg, optarg);
593 }
594
595 /**
596  * Parse a T string 'str'.
597  */
598 template<typename T>
599 T parse(const char *s) {
600         T tmp;
601         stringstream ss(s);
602         ss >> tmp;
603         return tmp;
604 }
605
606 /**
607  * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
608  */
609 template<typename T>
610 pair<T, T> parsePair(const char *str, char delim) {
611         string s(str);
612         vector<string> ss;
613         tokenize(s, delim, ss);
614         pair<T, T> ret;
615         ret.first = parse<T>(ss[0].c_str());
616         ret.second = parse<T>(ss[1].c_str());
617         return ret;
618 }
619
620 /**
621  * Read command-line arguments
622  */
623 static void parseOptions(int argc, const char **argv) {
624         int option_index = 0;
625         int next_option;
626         if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
627         do {
628                 next_option = getopt_long(
629                         argc, const_cast<char**>(argv),
630                         short_options, long_options, &option_index);
631                 switch (next_option) {
632                         case '1': tokenize(optarg, ",", mates1); break;
633                         case '2': tokenize(optarg, ",", mates2); break;
634                         case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
635                         case 'f': format = FASTA; break;
636                         case 'F': {
637                                 format = FASTA_CONT;
638                                 pair<size_t, size_t> p = parsePair<size_t>(optarg, ',');
639                                 fastaContLen = p.first;
640                                 fastaContFreq = p.second;
641                                 break;
642                         }
643                         case 'q': format = FASTQ; break;
644                         case 'r': format = RAW; break;
645                         case 'c': format = CMDLINE; break;
646                         case 'C': color = true; break;
647                         case ARG_CHAININ: format = INPUT_CHAIN; break;
648                         case 'I':
649                                 minInsert = (uint32_t)parseInt(0, "-I arg must be positive");
650                                 break;
651                         case 'X':
652                                 maxInsert = (uint32_t)parseInt(1, "-X arg must be at least 1");
653                                 break;
654                         case 's':
655                                 skipReads = (uint32_t)parseInt(0, "-s arg must be positive");
656                                 break;
657                         case ARG_FF: mate1fw = true;  mate2fw = true;  mateFwSet = true; break;
658                         case ARG_RF: mate1fw = false; mate2fw = true;  mateFwSet = true; break;
659                         case ARG_FR: mate1fw = true;  mate2fw = false; mateFwSet = true; break;
660                         case ARG_RANDOM_READS: format = RANDOM; break;
661                         case ARG_RANDOM_READS_NOSYNC:
662                                 format = RANDOM;
663                                 randReadsNoSync = true;
664                                 break;
665                         case ARG_RANGE: rangeMode = true; break;
666                         case ARG_CONCISE: outType = OUTPUT_CONCISE; break;
667                         case ARG_CHAINOUT: outType = OUTPUT_CHAIN; break;
668                         case 'S': outType = OUTPUT_SAM; break;
669                         case ARG_REFOUT: refOut = true; break;
670                         case ARG_NOOUT: outType = OUTPUT_NONE; break;
671                         case ARG_REFMAP: refMapFile = optarg; break;
672                         case ARG_ANNOTMAP: annotMapFile = optarg; break;
673                         case ARG_USE_SPINLOCK: useSpinlock = false; break;
674                         case ARG_SHMEM: useShmem = true; break;
675                         case ARG_COLOR_SEQ: colorSeq = true; break;
676                         case ARG_COLOR_QUAL: colorQual = true; break;
677                         case ARG_SHOWSEED: showSeed = true; break;
678                         case ARG_SUPPRESS_FIELDS: {
679                                 vector<string> supp;
680                                 tokenize(optarg, ",", supp);
681                                 for(size_t i = 0; i < supp.size(); i++) {
682                                         int ii = parseInt(1, "--suppress arg must be at least 1", supp[i].c_str());
683                                         suppressOuts.set(ii-1);
684                                 }
685                                 break;
686                         }
687                         case ARG_MM: {
688 #ifdef BOWTIE_MM
689                                 useMm = true;
690                                 break;
691 #else
692                                 cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
693                                      << "BOWTIE_MM defined.  Memory-mapped I/O is not supported under Windows.  If you" << endl
694                                      << "would like to use memory-mapped I/O on a platform that supports it, please" << endl
695                                      << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
696                                 throw 1;
697 #endif
698                         }
699                         case ARG_MMSWEEP: mmSweep = true; break;
700                         case ARG_HADOOPOUT: hadoopOut = true; break;
701                         case ARG_AL: dumpAlBase = optarg; break;
702                         case ARG_UN: dumpUnalBase = optarg; break;
703                         case ARG_MAXDUMP: dumpMaxBase = optarg; break;
704                         case ARG_SOLEXA_QUALS: solexaQuals = true; break;
705                         case ARG_integerQuals: integerQuals = true; break;
706                         case ARG_PHRED64: phred64Quals = true; break;
707                         case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
708                         case ARG_NOMAQROUND: noMaqRound = true; break;
709                         case ARG_COLOR_KEEP_ENDS: colorExEnds = false; break;
710                         case ARG_SNPPHRED: snpPhred = parseInt(0, "--snpphred must be at least 0"); break;
711                         case ARG_SNPFRAC: {
712                                 double p = parse<double>(optarg);
713                                 if(p <= 0.0) {
714                                         cerr << "Error: --snpfrac parameter must be > 0.0" << endl;
715                                         throw 1;
716                                 }
717                                 p = (log10(p) * -10);
718                                 snpPhred = (int)(p + 0.5);
719                                 if(snpPhred < 10)
720                                 cout << "snpPhred: " << snpPhred << endl;
721                                 break;
722                         }
723                         case 'z': {
724                                 cerr << "Error: -z/--phased mode is no longer supported" << endl;
725                                 throw 1;
726                         }
727                         case ARG_REFIDX: noRefNames = true; break;
728                         case ARG_STATEFUL: stateful = true; break;
729                         case ARG_FUZZY: fuzzy = true; break;
730                         case ARG_REPORTSE: reportSe = true; break;
731                         case ARG_FULLREF: fullRef = true; break;
732                         case ARG_PREFETCH_WIDTH:
733                                 prefetchWidth = parseInt(1, "--prewidth must be at least 1");
734                                 break;
735                         case 'B':
736                                 offBase = parseInt(-999999, "-B/--offbase cannot be a large negative number");
737                                 break;
738                         case ARG_SEED:
739                                 seed = parseInt(0, "--seed arg must be at least 0");
740                                 break;
741                         case 'u':
742                                 qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1");
743                                 break;
744                         case 'k':
745                                 khits = (uint32_t)parseInt(1, "-k arg must be at least 1");
746                                 break;
747                         case 'Q':
748                                 tokenize(optarg, ",", qualities);
749                                 integerQuals = true;
750                                 break;
751                         case ARG_QUALS1:
752                                 tokenize(optarg, ",", qualities1);
753                                 integerQuals = true;
754                                 break;
755                         case ARG_QUALS2:
756                                 tokenize(optarg, ",", qualities2);
757                                 integerQuals = true;
758                                 break;
759                         case 'M':
760                                 sampleMax = true;
761                         case 'm':
762                                 mhits = (uint32_t)parseInt(1, "-m arg must be at least 1");
763                                 break;
764                         case 'x':
765                                 mixedThresh = (uint32_t)parseInt(0, "-x arg must be at least 0");
766                                 break;
767                         case ARG_MIXED_ATTEMPTS:
768                                 mixedAttemptLim = (uint32_t)parseInt(1, "--mixatt arg must be at least 1");
769                                 break;
770                         case ARG_CACHE_LIM:
771                                 cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1");
772                                 break;
773                         case ARG_CACHE_SZ:
774                                 cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1");
775                                 cacheSize *= (1024 * 1024); // convert from MB to B
776                                 break;
777                         case ARG_NO_RECONCILE:
778                                 dontReconcileMates = true;
779                                 break;
780                         case 'p':
781 #ifndef BOWTIE_PTHREADS
782                                 cerr << "-p/--threads is disabled because bowtie was not compiled with pthreads support" << endl;
783                                 throw 1;
784 #endif
785                                 nthreads = parseInt(1, "-p/--threads arg must be at least 1");
786                                 break;
787                         case ARG_FILEPAR:
788 #ifndef BOWTIE_PTHREADS
789                                 cerr << "--filepar is disabled because bowtie was not compiled with pthreads support" << endl;
790                                 throw 1;
791 #endif
792                                 fileParallel = true;
793                                 break;
794                         case 'v':
795                                 maqLike = 0;
796                                 mismatches = parseInt(0, 3, "-v arg must be at least 0 and at most 3");
797                                 break;
798                         case '3': trim3 = parseInt(0, "-3/--trim3 arg must be at least 0"); break;
799                         case '5': trim5 = parseInt(0, "-5/--trim5 arg must be at least 0"); break;
800                         case 'o': offRate = parseInt(1, "-o/--offrate arg must be at least 1"); break;
801                         case ARG_ISARATE: isaRate = parseInt(0, "--isarate arg must be at least 0"); break;
802                         case 'e': qualThresh = parseInt(1, "-e/--err arg must be at least 1"); break;
803                         case 'n': seedMms = parseInt(0, 3, "-n/--seedmms arg must be at least 0 and at most 3"); maqLike = 1; break;
804                         case 'l': seedLen = parseInt(5, "-l/--seedlen arg must be at least 5"); break;
805                         case 'h': printUsage(cout); throw 0; break;
806                         case ARG_USAGE: printUsage(cout); throw 0; break;
807                         case 'a': allHits = true; break;
808                         case 'y': tryHard = true; break;
809                         case ARG_RECAL: recal = true; break;
810                         case ARG_CHUNKMBS: chunkPoolMegabytes = parseInt(1, "--chunkmbs arg must be at least 1"); break;
811                         case ARG_CHUNKSZ: chunkSz = parseInt(1, "--chunksz arg must be at least 1"); break;
812                         case ARG_CHUNKVERBOSE: chunkVerbose = true; break;
813                         case ARG_BETTER: stateful = true; better = true; break;
814                         case ARG_BEST: stateful = true; useV1 = false; break;
815                         case ARG_STRATA: strata = true; break;
816                         case ARG_VERBOSE: verbose = true; break;
817                         case ARG_STARTVERBOSE: startVerbose = true; break;
818                         case ARG_QUIET: quiet = true; break;
819                         case ARG_SANITY: sanityCheck = true; break;
820                         case 't': timing = true; break;
821                         case ARG_NO_FW: nofw = true; break;
822                         case ARG_NO_RC: norc = true; break;
823                         case ARG_STATS: stats = true; break;
824                         case ARG_PEV2: useV1 = false; break;
825                         case ARG_SAM_NOHEAD: samNoHead = true; break;
826                         case ARG_SAM_NOSQ: samNoSQ = true; break;
827                         case ARG_SAM_RG: {
828                                 if(!rgs.empty()) rgs += '\t';
829                                 rgs += optarg;
830                                 break;
831                         }
832                         case ARG_COST: printCost = true; break;
833                         case ARG_DEFAULT_MAPQ:
834                                 defaultMapq = parseInt(0, "--mapq must be positive");
835                                 break;
836                         case ARG_MAXBTS: {
837                                 maxBts  = parseInt(0, "--maxbts must be positive");
838                                 maxBtsBetter = maxBts;
839                                 break;
840                         }
841                         case ARG_DUMP_PATS: patDumpfile = optarg; break;
842                         case ARG_STRAND_FIX: strandFix = true; break;
843                         case ARG_RANDOMIZE_QUALS: randomizeQuals = true; break;
844                         case ARG_PARTITION: partitionSz = parse<int>(optarg); break;
845                         case ARG_ORIG:
846                                 if(optarg == NULL || strlen(optarg) == 0) {
847                                         cerr << "--orig arg must be followed by a string" << endl;
848                                         printUsage(cerr);
849                                         throw 1;
850                                 }
851                                 origString = optarg;
852                                 break;
853                         case -1: break; /* Done with options. */
854                         case 0:
855                                 if (long_options[option_index].flag != 0)
856                                         break;
857                         default:
858                                 printUsage(cerr);
859                                 throw 1;
860                 }
861         } while(next_option != -1);
862         bool paired = mates1.size() > 0 || mates2.size() > 0 || mates12.size() > 0;
863         if(rangeMode) {
864                 // Tell the Ebwt loader to ignore the suffix-array portion of
865                 // the index.  We don't need it because the user isn't asking
866                 // for bowtie to report reference positions (just matrix
867                 // ranges).
868                 offRate = 32;
869         }
870         if(!maqLike && mismatches == 3) {
871                 // Much faster than normal 3-mismatch mode
872                 stateful = true;
873         }
874         if(mates1.size() != mates2.size()) {
875                 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
876                      << "mate files/sequences were specified with -2.  The same number of mate files/" << endl
877                      << "sequences must be specified with -1 and -2." << endl;
878                 throw 1;
879         }
880         if(qualities.size() && format != FASTA) {
881                 cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
882                      << "enabled.  -Q works only in combination with -f and -C." << endl;
883                 throw 1;
884         }
885         if(qualities.size() && !color) {
886                 cerr << "Error: one or more quality files were specified with -Q but -C was not" << endl
887                      << "enabled.  -Q works only in combination with -f and -C." << endl;
888                 throw 1;
889         }
890         if(qualities1.size() && format != FASTA) {
891                 cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
892                      << "enabled.  --Q1 works only in combination with -f and -C." << endl;
893                 throw 1;
894         }
895         if(qualities1.size() && !color) {
896                 cerr << "Error: one or more quality files were specified with --Q1 but -C was not" << endl
897                      << "enabled.  --Q1 works only in combination with -f and -C." << endl;
898                 throw 1;
899         }
900         if(qualities2.size() && format != FASTA) {
901                 cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
902                      << "enabled.  --Q2 works only in combination with -f and -C." << endl;
903                 throw 1;
904         }
905         if(qualities2.size() && !color) {
906                 cerr << "Error: one or more quality files were specified with --Q2 but -C was not" << endl
907                      << "enabled.  --Q2 works only in combination with -f and -C." << endl;
908                 throw 1;
909         }
910         if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
911                 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
912                      << "quality files were specified with --Q1.  The same number of mate and quality" << endl
913                      << "files must sequences must be specified with -1 and --Q1." << endl;
914                 throw 1;
915         }
916         if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
917                 cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
918                      << "quality files were specified with --Q2.  The same number of mate and quality" << endl
919                      << "files must sequences must be specified with -2 and --Q2." << endl;
920                 throw 1;
921         }
922         // Check for duplicate mate input files
923         if(format != CMDLINE) {
924                 for(size_t i = 0; i < mates1.size(); i++) {
925                         for(size_t j = 0; j < mates2.size(); j++) {
926                                 if(mates1[i] == mates2[j] && !quiet) {
927                                         cerr << "Warning: Same mate file \"" << mates1[i] << "\" appears as argument to both -1 and -2" << endl;
928                                 }
929                         }
930                 }
931         }
932         if(tryHard) {
933                 // Increase backtracking limit to huge number
934                 maxBts = maxBtsBetter = INT_MAX;
935                 // Increase number of paired-end scan attempts to huge number
936                 mixedAttemptLim = UINT_MAX;
937         }
938         if(!stateful && sampleMax) {
939                 if(!quiet) {
940                         cerr << "Warning: -M was specified w/o --best; automatically enabling --best" << endl;
941                 }
942                 stateful = true;
943         }
944         if(strata && !stateful) {
945                 cerr << "--strata must be combined with --best" << endl;
946                 throw 1;
947         }
948         if(strata && !allHits && khits == 1 && mhits == 0xffffffff) {
949                 cerr << "--strata has no effect unless combined with -k, -m or -a" << endl;
950                 throw 1;
951         }
952         if(fuzzy && (!stateful && !paired)) {
953                 cerr << "--fuzzy must be combined with --best or paired-end alignment" << endl;
954                 throw 1;
955         }
956         // If both -s and -u are used, we need to adjust qUpto accordingly
957         // since it uses patid to know if we've reached the -u limit (and
958         // patids are all shifted up by skipReads characters)
959         if(qUpto + skipReads > qUpto) {
960                 qUpto += skipReads;
961         }
962         if(useShmem && useMm && !quiet) {
963                 cerr << "Warning: --shmem overrides --mm..." << endl;
964                 useMm = false;
965         }
966         if(snpPhred <= 10 && color && !quiet) {
967                 cerr << "Warning: the colorspace SNP penalty (--snpphred) is very low: " << snpPhred << endl;
968         }
969         if(format == INPUT_CHAIN) {
970                 bool error = false;
971                 if(!stateful) {
972                         cerr << "Error: --chainin must be combined with --best; aborting..." << endl;
973                         error = true;
974                 }
975                 if(paired) {
976                         cerr << "Error: --chainin cannot be combined with paired-end alignment; aborting..." << endl;
977                         error = true;
978                 }
979                 if(error) throw 1;
980         }
981
982         if(outType == OUTPUT_CHAIN) {
983                 bool error = false;
984                 if(refOut) {
985                         cerr << "Error: --chainout is not compatible with --refout; aborting..." << endl;
986                         error = true;
987                 }
988                 if(!stateful) {
989                         cerr << "Error: --chainout must be combined with --best; aborting..." << endl;
990                         error = true;
991                 }
992                 if(paired) {
993                         cerr << "Error: --chainout cannot be combined with paired-end alignment; aborting..." << endl;
994                         error = true;
995                 }
996                 if(error) throw 1;
997         }
998         if(outType == OUTPUT_SAM && refOut) {
999                 cerr << "Error: --refout cannot be combined with -S/--sam" << endl;
1000                 throw 1;
1001         }
1002         if(!mateFwSet) {
1003                 if(color) {
1004                         // Set colorspace default (--ff)
1005                         mate1fw = true;
1006                         mate2fw = true;
1007                 } else {
1008                         // Set nucleotide space default (--fr)
1009                         mate1fw = true;
1010                         mate2fw = false;
1011                 }
1012         }
1013         if(outType != OUTPUT_FULL && suppressOuts.count() > 0 && !quiet) {
1014                 cerr << "Warning: Ignoring --suppress because output type is not default." << endl;
1015                 cerr << "         --suppress is only available for the default output type." << endl;
1016                 suppressOuts.clear();
1017         }
1018 }
1019
1020 static const char *argv0 = NULL;
1021
1022 #define FINISH_READ(p) \
1023         /* Don't do finishRead if the read isn't legit or if the read was skipped by the doneMask */ \
1024         if(!p->empty()) { \
1025                 sink->finishRead(*p, true, !skipped); \
1026         } \
1027         skipped = false;
1028
1029 static inline void finishReadWithHitmask(PatternSourcePerThread* p,
1030                                          HitSinkPerThread* sink,
1031                                          SyncBitset& hitMask,
1032                                          bool r,
1033                                          bool& skipped)
1034 {
1035         /* Don't do finishRead if the read isn't legit */
1036         if(!p->empty()) {
1037                 /* r = whether to consider reporting the read as unaligned */
1038                 bool reportUnAl = r;
1039                 if(reportUnAl) {
1040                         /* If the done-mask already shows the read as done, */
1041                         /* then we already reported the unaligned read and */
1042                         /* should refrain from re-reporting*/
1043                         reportUnAl = !skipped;
1044                         if(reportUnAl) {
1045                                 /* If there hasn't been a hit reported, then report */
1046                                 /* read as unaligned */
1047                                 reportUnAl = !hitMask.test(p->patid());
1048                         }
1049                 }
1050                 if(sink->finishRead(*p, true, reportUnAl) > 0) {
1051                         /* We reported a hit for the read, so we set the */
1052                         /* appropriate bit in the hitMask to prevent it from */
1053                         /* being reported as unaligned. */
1054                         if(!reportUnAl && sink->dumpsReads()) {
1055                                 hitMask.setOver(p->patid());
1056                         }
1057                 }
1058         }
1059         skipped = false;
1060 }
1061
1062 /// Macro for getting the next read, possibly aborting depending on
1063 /// whether the result is empty or the patid exceeds the limit, and
1064 /// marshaling the read into convenient variables.
1065 #define GET_READ(p) \
1066         p->nextReadPair(); \
1067         if(p->empty() || p->patid() >= qUpto) { \
1068                 p->bufa().clearAll(); \
1069                 break; \
1070         } \
1071         assert(!empty(p->bufa().patFw)); \
1072         String<Dna5>& patFw  = p->bufa().patFw;  \
1073         patFw.data_begin += 0; /* suppress "unused" compiler warning */ \
1074         String<Dna5>& patRc  = p->bufa().patRc;  \
1075         patRc.data_begin += 0; /* suppress "unused" compiler warning */ \
1076         String<char>& qual = p->bufa().qual; \
1077         qual.data_begin += 0; /* suppress "unused" compiler warning */ \
1078         String<char>& qualRev = p->bufa().qualRev; \
1079         qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1080         String<Dna5>& patFwRev  = p->bufa().patFwRev;  \
1081         patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1082         String<Dna5>& patRcRev  = p->bufa().patRcRev;  \
1083         patRcRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1084         String<char>& name   = p->bufa().name;   \
1085         name.data_begin += 0; /* suppress "unused" compiler warning */ \
1086         uint32_t      patid  = p->patid();       \
1087         params.setPatId(patid);
1088
1089 /// Macro for getting the forward oriented version of next read,
1090 /// possibly aborting depending on whether the result is empty or the
1091 /// patid exceeds the limit, and marshaling the read into convenient
1092 /// variables.
1093 #define GET_READ_FW(p) \
1094         p->nextReadPair(); \
1095         if(p->empty() || p->patid() >= qUpto) { \
1096                 p->bufa().clearAll(); \
1097                 break; \
1098         } \
1099         params.setPatId(p->patid()); \
1100         assert(!empty(p->bufa().patFw)); \
1101         String<Dna5>& patFw  = p->bufa().patFw;  \
1102         patFw.data_begin += 0; /* suppress "unused" compiler warning */ \
1103         String<char>& qual = p->bufa().qual; \
1104         qual.data_begin += 0; /* suppress "unused" compiler warning */ \
1105         String<Dna5>& patFwRev  = p->bufa().patFwRev;  \
1106         patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1107         String<char>& qualRev = p->bufa().qualRev; \
1108         qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1109         String<char>& name   = p->bufa().name;   \
1110         name.data_begin += 0; /* suppress "unused" compiler warning */ \
1111         uint32_t      patid  = p->patid();
1112
1113 #ifdef BOWTIE_PTHREADS
1114 #define WORKER_EXIT() \
1115         patsrcFact->destroy(patsrc); \
1116         delete patsrcFact; \
1117         sinkFact->destroy(sink); \
1118         delete sinkFact; \
1119         if(tid > 0) { \
1120                 pthread_exit(NULL); \
1121         } \
1122         return NULL;
1123 #else
1124 #define WORKER_EXIT() \
1125         patsrcFact->destroy(patsrc); \
1126         delete patsrcFact; \
1127         sinkFact->destroy(sink); \
1128         delete sinkFact; \
1129         return NULL;
1130 #endif
1131
1132 #ifdef CHUD_PROFILING
1133 #define CHUD_START() chudStartRemotePerfMonitor("Bowtie");
1134 #define CHUD_STOP()  chudStopRemotePerfMonitor();
1135 #else
1136 #define CHUD_START()
1137 #define CHUD_STOP()
1138 #endif
1139
1140 /// Create a PatternSourcePerThread for the current thread according
1141 /// to the global params and return a pointer to it
1142 static PatternSourcePerThreadFactory*
1143 createPatsrcFactory(PairedPatternSource& _patsrc, int tid) {
1144         PatternSourcePerThreadFactory *patsrcFact;
1145         if(randReadsNoSync) {
1146                 patsrcFact = new RandomPatternSourcePerThreadFactory(numRandomReads, lenRandomReads, nthreads, tid);
1147         } else {
1148                 patsrcFact = new WrappedPatternSourcePerThreadFactory(_patsrc);
1149         }
1150         assert(patsrcFact != NULL);
1151         return patsrcFact;
1152 }
1153
1154 /**
1155  * Allocate a HitSinkPerThreadFactory on the heap according to the
1156  * global params and return a pointer to it.
1157  */
1158 static HitSinkPerThreadFactory*
1159 createSinkFactory(HitSink& _sink) {
1160         HitSinkPerThreadFactory *sink = NULL;
1161         if(format == INPUT_CHAIN) {
1162                 assert(stateful);
1163                 sink = new ChainingHitSinkPerThreadFactory(_sink, allHits ? 0xffffffff : khits, mhits, strata);
1164         } else if(!strata) {
1165                 // Unstratified
1166                 if(!allHits) {
1167                         // First N good; "good" inherently ignores strata
1168                         sink = new NGoodHitSinkPerThreadFactory(_sink, khits, mhits);
1169                 } else {
1170                         // All hits, spanning strata
1171                         sink = new AllHitSinkPerThreadFactory(_sink, mhits);
1172                 }
1173         } else {
1174                 // Stratified
1175                 assert(stateful);
1176                 if(!allHits) {
1177                         assert(stateful);
1178                         // Buffer best hits, assuming they're arriving in best-
1179                         // to-worst order
1180                         sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, khits, mhits);
1181                 } else {
1182                         assert(stateful);
1183                         // Buffer best hits, assuming they're arriving in best-
1184                         // to-worst order
1185                         sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, 0xffffffff/2, mhits);
1186                 }
1187         }
1188         assert(sink != NULL);
1189         return sink;
1190 }
1191
1192 /**
1193  * Search through a single (forward) Ebwt index for exact end-to-end
1194  * hits.  Assumes that index is already loaded into memory.
1195  */
1196 static PairedPatternSource*   exactSearch_patsrc;
1197 static HitSink*               exactSearch_sink;
1198 static Ebwt<String<Dna> >*    exactSearch_ebwt;
1199 static vector<String<Dna5> >* exactSearch_os;
1200 static BitPairReference*      exactSearch_refs;
1201 static void *exactSearchWorker(void *vp) {
1202         int tid = *((int*)vp);
1203         PairedPatternSource& _patsrc = *exactSearch_patsrc;
1204         HitSink& _sink               = *exactSearch_sink;
1205         Ebwt<String<Dna> >& ebwt     = *exactSearch_ebwt;
1206         vector<String<Dna5> >& os    = *exactSearch_os;
1207         const BitPairReference* refs =  exactSearch_refs;
1208
1209         // Per-thread initialization
1210         PatternSourcePerThreadFactory *patsrcFact = createPatsrcFactory(_patsrc, tid);
1211         PatternSourcePerThread *patsrc = patsrcFact->create();
1212         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1213         HitSinkPerThread* sink = sinkFact->create();
1214         EbwtSearchParams<String<Dna> > params(
1215                 *sink,      // HitSink
1216                 os,         // reference sequences
1217                 true,       // read is forward
1218                 true);       // index is forward
1219         GreedyDFSRangeSource bt(
1220                 &ebwt, params,
1221                 refs,           // reference sequence (for colorspace)
1222                 0xffffffff,     // qualThresh
1223                 0xffffffff,     // max backtracks (no max)
1224                 0,              // reportPartials (don't)
1225                 true,           // reportExacts
1226                 rangeMode,      // reportRanges
1227                 NULL,           // seedlings
1228                 NULL,           // mutations
1229                 verbose,        // verbose
1230                 &os,
1231                 false);         // considerQuals
1232         bool skipped = false;
1233         while(true) {
1234                 FINISH_READ(patsrc);
1235                 GET_READ(patsrc);
1236                 #include "search_exact.c"
1237         }
1238         FINISH_READ(patsrc);
1239         WORKER_EXIT();
1240 }
1241
1242 /**
1243  * A statefulness-aware worker driver.  Uses UnpairedExactAlignerV1.
1244  */
1245 static void *exactSearchWorkerStateful(void *vp) {
1246         int tid = *((int*)vp);
1247         PairedPatternSource& _patsrc = *exactSearch_patsrc;
1248         HitSink& _sink               = *exactSearch_sink;
1249         Ebwt<String<Dna> >& ebwt     = *exactSearch_ebwt;
1250         vector<String<Dna5> >& os    = *exactSearch_os;
1251         BitPairReference* refs       =  exactSearch_refs;
1252
1253         // Global initialization
1254         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1255         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1256
1257         ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1258         UnpairedExactAlignerV1Factory alSEfact(
1259                         ebwt,
1260                         NULL,
1261                         !nofw,
1262                         !norc,
1263                         _sink,
1264                         *sinkFact,
1265                         NULL, //&cacheFw,
1266                         NULL, //&cacheBw,
1267                         cacheLimit,
1268                         pool,
1269                         refs,
1270                         os,
1271                         !noMaqRound,
1272                         !better,
1273                         strandFix,
1274                         rangeMode,
1275                         verbose,
1276                         quiet,
1277                         seed);
1278         PairedExactAlignerV1Factory alPEfact(
1279                         ebwt,
1280                         NULL,
1281                         color,
1282                         !nofw,
1283                         !norc,
1284                         useV1,
1285                         _sink,
1286                         *sinkFact,
1287                         mate1fw,
1288                         mate2fw,
1289                         minInsert,
1290                         maxInsert,
1291                         dontReconcileMates,
1292                         mhits,       // for symCeiling
1293                         mixedThresh,
1294                         mixedAttemptLim,
1295                         NULL, //&cacheFw,
1296                         NULL, //&cacheBw,
1297                         cacheLimit,
1298                         pool,
1299                         refs, os,
1300                         reportSe,
1301                         !noMaqRound,
1302                         strandFix,
1303                         !better,
1304                         rangeMode,
1305                         verbose,
1306                         quiet,
1307                         seed);
1308         {
1309                 MixedMultiAligner multi(
1310                                 prefetchWidth,
1311                                 qUpto,
1312                                 alSEfact,
1313                                 alPEfact,
1314                                 *patsrcFact);
1315                 // Run that mother
1316                 multi.run();
1317                 // MultiAligner must be destroyed before patsrcFact
1318         }
1319
1320         delete patsrcFact;
1321         delete sinkFact;
1322         delete pool;
1323 #ifdef BOWTIE_PTHREADS
1324         if(tid > 0) pthread_exit(NULL);
1325 #endif
1326         return NULL;
1327 }
1328
1329 #define SET_A_FW(bt, p, params) \
1330         bt.setQuery(&p->bufa().patFw, &p->bufa().qual, &p->bufa().name); \
1331         params.setFw(true);
1332 #define SET_A_RC(bt, p, params) \
1333         bt.setQuery(&p->bufa().patRc, &p->bufa().qualRev, &p->bufa().name); \
1334         params.setFw(false);
1335 #define SET_B_FW(bt, p, params) \
1336         bt.setQuery(&p->bufb().patFw, &p->bufb().qual, &p->bufb().name); \
1337         params.setFw(true);
1338 #define SET_B_RC(bt, p, params) \
1339         bt.setQuery(&p->bufb().patRc, &p->bufb().qualRev, &p->bufb().name); \
1340         params.setFw(false);
1341
1342 #ifdef BOWTIE_PTHREADS
1343 #define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED)
1344 #endif
1345
1346 /**
1347  * Search through a single (forward) Ebwt index for exact end-to-end
1348  * hits.  Assumes that index is already loaded into memory.
1349  */
1350 static void exactSearch(PairedPatternSource& _patsrc,
1351                         HitSink& _sink,
1352                         Ebwt<String<Dna> >& ebwt,
1353                         vector<String<Dna5> >& os)
1354 {
1355         exactSearch_patsrc = &_patsrc;
1356         exactSearch_sink   = &_sink;
1357         exactSearch_ebwt   = &ebwt;
1358         exactSearch_os     = &os;
1359
1360         assert(!ebwt.isInMemory());
1361         {
1362                 // Load the rest of (vast majority of) the backward Ebwt into
1363                 // memory
1364                 Timer _t(cerr, "Time loading forward index: ", timing);
1365                 ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1366         }
1367
1368         BitPairReference *refs = NULL;
1369         bool pair = mates1.size() > 0 || mates12.size() > 0;
1370         if(color || (pair && mixedThresh < 0xffffffff)) {
1371                 Timer _t(cerr, "Time loading reference: ", timing);
1372                 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1373                 if(!refs->loaded()) throw 1;
1374         }
1375         exactSearch_refs   = refs;
1376
1377 #ifdef BOWTIE_PTHREADS
1378         AutoArray<pthread_t> threads(nthreads-1);
1379         AutoArray<int> tids(nthreads-1);
1380 #endif
1381         CHUD_START();
1382         {
1383                 Timer _t(cerr, "Time for 0-mismatch search: ", timing);
1384 #ifdef BOWTIE_PTHREADS
1385                 for(int i = 0; i < nthreads-1; i++) {
1386                         tids[i] = i+1;
1387                         if(stateful) {
1388                                 createThread(&threads[i],
1389                                              exactSearchWorkerStateful,
1390                                              (void *)&tids[i]);
1391                         } else {
1392                                 createThread(&threads[i],
1393                                              exactSearchWorker,
1394                                              (void *)&tids[i]);
1395                         }
1396                 }
1397 #endif
1398                 int tmp = 0;
1399                 if(stateful) exactSearchWorkerStateful((void*)&tmp);
1400                 else         exactSearchWorker((void*)&tmp);
1401 #ifdef BOWTIE_PTHREADS
1402                 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1403 #endif
1404         }
1405         if(refs != NULL) delete refs;
1406 }
1407
1408 /**
1409  * Search through a pair of Ebwt indexes, one for the forward direction
1410  * and one for the backward direction, for exact end-to-end hits and 1-
1411  * mismatch end-to-end hits.  In my experience, this is slightly faster
1412  * than Maq (default) mode with the -n 1 option.
1413  *
1414  * Forward Ebwt (ebwtFw) is already loaded into memory and backward
1415  * Ebwt (ebwtBw) is not loaded into memory.
1416  */
1417 static PairedPatternSource*           mismatchSearch_patsrc;
1418 static HitSink*                       mismatchSearch_sink;
1419 static Ebwt<String<Dna> >*            mismatchSearch_ebwtFw;
1420 static Ebwt<String<Dna> >*            mismatchSearch_ebwtBw;
1421 static vector<String<Dna5> >*         mismatchSearch_os;
1422 static SyncBitset*                    mismatchSearch_doneMask;
1423 static SyncBitset*                    mismatchSearch_hitMask;
1424 static BitPairReference*              mismatchSearch_refs;
1425
1426 /**
1427  * A statefulness-aware worker driver.  Uses Unpaired/Paired1mmAlignerV1.
1428  */
1429 static void *mismatchSearchWorkerFullStateful(void *vp) {
1430         int tid = *((int*)vp);
1431         PairedPatternSource&   _patsrc = *mismatchSearch_patsrc;
1432         HitSink&               _sink   = *mismatchSearch_sink;
1433         Ebwt<String<Dna> >&    ebwtFw  = *mismatchSearch_ebwtFw;
1434         Ebwt<String<Dna> >&    ebwtBw  = *mismatchSearch_ebwtBw;
1435         vector<String<Dna5> >& os      = *mismatchSearch_os;
1436         BitPairReference*      refs    =  mismatchSearch_refs;
1437
1438         // Global initialization
1439         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1440         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1441         ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1442
1443         Unpaired1mmAlignerV1Factory alSEfact(
1444                         ebwtFw,
1445                         &ebwtBw,
1446                         !nofw,
1447                         !norc,
1448                         _sink,
1449                         *sinkFact,
1450                         NULL, //&cacheFw,
1451                         NULL, //&cacheBw,
1452                         cacheLimit,
1453                         pool,
1454                         refs,
1455                         os,
1456                         !noMaqRound,
1457                         !better,
1458                         strandFix,
1459                         rangeMode,
1460                         verbose,
1461                         quiet,
1462                         seed);
1463         Paired1mmAlignerV1Factory alPEfact(
1464                         ebwtFw,
1465                         &ebwtBw,
1466                         color,
1467                         !nofw,
1468                         !norc,
1469                         useV1,
1470                         _sink,
1471                         *sinkFact,
1472                         mate1fw,
1473                         mate2fw,
1474                         minInsert,
1475                         maxInsert,
1476                         dontReconcileMates,
1477                         mhits,     // for symCeiling
1478                         mixedThresh,
1479                         mixedAttemptLim,
1480                         NULL, //&cacheFw,
1481                         NULL, //&cacheBw,
1482                         cacheLimit,
1483                         pool,
1484                         refs, os,
1485                         reportSe,
1486                         !noMaqRound,
1487                         !better,
1488                         strandFix,
1489                         rangeMode,
1490                         verbose,
1491                         quiet,
1492                         seed);
1493         {
1494                 MixedMultiAligner multi(
1495                                 prefetchWidth,
1496                                 qUpto,
1497                                 alSEfact,
1498                                 alPEfact,
1499                                 *patsrcFact);
1500                 // Run that mother
1501                 multi.run();
1502                 // MultiAligner must be destroyed before patsrcFact
1503         }
1504
1505         delete patsrcFact;
1506         delete sinkFact;
1507         delete pool;
1508 #ifdef BOWTIE_PTHREADS
1509         if(tid > 0) pthread_exit(NULL);
1510 #endif
1511         return NULL;
1512 }
1513
1514 static void* mismatchSearchWorkerFull(void *vp){
1515         int tid = *((int*)vp);
1516         PairedPatternSource&   _patsrc   = *mismatchSearch_patsrc;
1517         HitSink&               _sink     = *mismatchSearch_sink;
1518         Ebwt<String<Dna> >&    ebwtFw    = *mismatchSearch_ebwtFw;
1519         Ebwt<String<Dna> >&    ebwtBw    = *mismatchSearch_ebwtBw;
1520         vector<String<Dna5> >& os        = *mismatchSearch_os;
1521         const BitPairReference* refs     =  mismatchSearch_refs;
1522
1523         // Per-thread initialization
1524         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1525         PatternSourcePerThread* patsrc = patsrcFact->create();
1526         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1527         HitSinkPerThread* sink = sinkFact->create();
1528         EbwtSearchParams<String<Dna> > params(
1529                 *sink,      // HitSinkPerThread
1530                 os,         // reference sequences
1531                 true,       // read is forward
1532                 false);     // index is mirror index
1533         GreedyDFSRangeSource bt(
1534                 &ebwtFw, params,
1535                 refs,           // reference sequence (for colorspace)
1536                 0xffffffff,     // qualThresh
1537                 0xffffffff,     // max backtracks (no max)
1538                 0,              // reportPartials (don't)
1539                 true,           // reportExacts
1540                 rangeMode,      // reportRanges
1541                 NULL,           // seedlings
1542                 NULL,           // mutations
1543                 verbose,        // verbose
1544                 &os,
1545                 false);         // considerQuals
1546         bool skipped = false;
1547         while(true) {
1548                 FINISH_READ(patsrc);
1549                 GET_READ(patsrc);
1550                 uint32_t plen = length(patFw);
1551                 uint32_t s = plen;
1552                 uint32_t s3 = s >> 1; // length of 3' half of seed
1553                 uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
1554                 #define DONEMASK_SET(p)
1555                 #include "search_1mm_phase1.c"
1556                 #include "search_1mm_phase2.c"
1557                 #undef DONEMASK_SET
1558         } // End read loop
1559         FINISH_READ(patsrc);
1560     WORKER_EXIT();
1561 }
1562
1563 /**
1564  * Search through a single (forward) Ebwt index for exact end-to-end
1565  * hits.  Assumes that index is already loaded into memory.
1566  */
1567 static void mismatchSearchFull(PairedPatternSource& _patsrc,
1568                                HitSink& _sink,
1569                                Ebwt<String<Dna> >& ebwtFw,
1570                                Ebwt<String<Dna> >& ebwtBw,
1571                                vector<String<Dna5> >& os)
1572 {
1573         mismatchSearch_patsrc       = &_patsrc;
1574         mismatchSearch_sink         = &_sink;
1575         mismatchSearch_ebwtFw       = &ebwtFw;
1576         mismatchSearch_ebwtBw       = &ebwtBw;
1577         mismatchSearch_doneMask     = NULL;
1578         mismatchSearch_hitMask      = NULL;
1579         mismatchSearch_os           = &os;
1580
1581         assert(!ebwtFw.isInMemory());
1582         assert(!ebwtBw.isInMemory());
1583         {
1584                 // Load the other half of the index into memory
1585                 Timer _t(cerr, "Time loading forward index: ", timing);
1586                 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1587         }
1588         {
1589                 // Load the other half of the index into memory
1590                 Timer _t(cerr, "Time loading mirror index: ", timing);
1591                 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1592         }
1593         // Create range caches, which are shared among all aligners
1594         BitPairReference *refs = NULL;
1595         bool pair = mates1.size() > 0 || mates12.size() > 0;
1596         if(color || (pair && mixedThresh < 0xffffffff)) {
1597                 Timer _t(cerr, "Time loading reference: ", timing);
1598                 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1599                 if(!refs->loaded()) throw 1;
1600         }
1601         mismatchSearch_refs = refs;
1602
1603 #ifdef BOWTIE_PTHREADS
1604         // Allocate structures for threads
1605         AutoArray<pthread_t> threads(nthreads-1);
1606         AutoArray<int> tids(nthreads-1);
1607 #endif
1608     CHUD_START();
1609     {
1610                 Timer _t(cerr, "Time for 1-mismatch full-index search: ", timing);
1611 #ifdef BOWTIE_PTHREADS
1612                 for(int i = 0; i < nthreads-1; i++) {
1613                         tids[i] = i+1;
1614                         if(stateful)
1615                                 createThread(&threads[i], mismatchSearchWorkerFullStateful, (void *)&tids[i]);
1616                         else
1617                                 createThread(&threads[i], mismatchSearchWorkerFull, (void *)&tids[i]);
1618                 }
1619 #endif
1620                 // Go to town
1621                 int tmp = 0;
1622                 if(stateful) mismatchSearchWorkerFullStateful((void*)&tmp);
1623                 else         mismatchSearchWorkerFull((void*)&tmp);
1624 #ifdef BOWTIE_PTHREADS
1625                 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1626 #endif
1627         }
1628         if(refs != NULL) delete refs;
1629 }
1630
1631 #define SWITCH_TO_FW_INDEX() { \
1632         /* Evict the mirror index from memory if necessary */ \
1633         if(ebwtBw.isInMemory()) ebwtBw.evictFromMemory(); \
1634         assert(!ebwtBw.isInMemory()); \
1635         /* Load the forward index into memory if necessary */ \
1636         if(!ebwtFw.isInMemory()) { \
1637                 Timer _t(cerr, "Time loading forward index: ", timing); \
1638                 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose); \
1639         } \
1640         assert(ebwtFw.isInMemory()); \
1641         _patsrc.reset(); /* rewind pattern source to first pattern */ \
1642 }
1643
1644 #define SWITCH_TO_BW_INDEX() { \
1645         /* Evict the forward index from memory if necessary */ \
1646         if(ebwtFw.isInMemory()) ebwtFw.evictFromMemory(); \
1647         assert(!ebwtFw.isInMemory()); \
1648         /* Load the forward index into memory if necessary */ \
1649         if(!ebwtBw.isInMemory()) { \
1650                 Timer _t(cerr, "Time loading mirror index: ", timing); \
1651                 ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); \
1652         } \
1653         assert(ebwtBw.isInMemory()); \
1654         _patsrc.reset(); /* rewind pattern source to first pattern */ \
1655 }
1656
1657 #define ASSERT_NO_HITS_FW(ebwtfw) \
1658         if(sanityCheck && os.size() > 0) { \
1659                 vector<Hit> hits; \
1660                 uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
1661                 uint32_t twoRevOff   = (seedMms <= 2) ? s : 0; \
1662                 uint32_t oneRevOff   = (seedMms <= 1) ? s : 0; \
1663                 uint32_t unrevOff    = (seedMms == 0) ? s : 0; \
1664                 if(hits.size() > 0) { \
1665                         /* Print offending hit obtained by oracle */ \
1666                         ::printHit( \
1667                                 os, \
1668                                 hits[0], \
1669                                 patFw, \
1670                                 plen, \
1671                                 unrevOff, \
1672                                 oneRevOff, \
1673                                 twoRevOff, \
1674                                 threeRevOff, \
1675                                 ebwtfw);  /* ebwtFw */ \
1676                 } \
1677                 assert_eq(0, hits.size()); \
1678         }
1679
1680 #define ASSERT_NO_HITS_RC(ebwtfw) \
1681         if(sanityCheck && os.size() > 0) { \
1682                 vector<Hit> hits; \
1683                 uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
1684                 uint32_t twoRevOff   = (seedMms <= 2) ? s : 0; \
1685                 uint32_t oneRevOff   = (seedMms <= 1) ? s : 0; \
1686                 uint32_t unrevOff    = (seedMms == 0) ? s : 0; \
1687                 if(hits.size() > 0) { \
1688                         /* Print offending hit obtained by oracle */ \
1689                         ::printHit( \
1690                                 os, \
1691                                 hits[0], \
1692                                 patRc, \
1693                                 plen, \
1694                                 unrevOff, \
1695                                 oneRevOff, \
1696                                 twoRevOff, \
1697                                 threeRevOff, \
1698                                 ebwtfw);  /* ebwtFw */ \
1699                 } \
1700                 assert_eq(0, hits.size()); \
1701         }
1702
1703 static PairedPatternSource*           twoOrThreeMismatchSearch_patsrc;
1704 static HitSink*                       twoOrThreeMismatchSearch_sink;
1705 static Ebwt<String<Dna> >*            twoOrThreeMismatchSearch_ebwtFw;
1706 static Ebwt<String<Dna> >*            twoOrThreeMismatchSearch_ebwtBw;
1707 static vector<String<Dna5> >*         twoOrThreeMismatchSearch_os;
1708 static SyncBitset*                    twoOrThreeMismatchSearch_doneMask;
1709 static SyncBitset*                    twoOrThreeMismatchSearch_hitMask;
1710 static bool                           twoOrThreeMismatchSearch_two;
1711 static BitPairReference*              twoOrThreeMismatchSearch_refs;
1712
1713 #define TWOTHREE_WORKER_SETUP() \
1714         int tid = *((int*)vp); \
1715         PairedPatternSource&           _patsrc  = *twoOrThreeMismatchSearch_patsrc;   \
1716         HitSink&                       _sink    = *twoOrThreeMismatchSearch_sink;     \
1717         vector<String<Dna5> >&         os       = *twoOrThreeMismatchSearch_os;       \
1718         bool                           two      = twoOrThreeMismatchSearch_two; \
1719     PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); \
1720         PatternSourcePerThread* patsrc = patsrcFact->create(); \
1721         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink); \
1722         HitSinkPerThread* sink = sinkFact->create(); \
1723         /* Per-thread initialization */ \
1724         EbwtSearchParams<String<Dna> > params( \
1725                         *sink,       /* HitSink */ \
1726                 os,          /* reference sequences */ \
1727                 true,        /* read is forward */ \
1728                 true);       /* index is forward */
1729
1730 /**
1731  * A statefulness-aware worker driver.  Uses UnpairedExactAlignerV1.
1732  */
1733 static void *twoOrThreeMismatchSearchWorkerStateful(void *vp) {
1734         int tid = *((int*)vp);
1735         PairedPatternSource&   _patsrc = *twoOrThreeMismatchSearch_patsrc;
1736         HitSink&               _sink   = *twoOrThreeMismatchSearch_sink;
1737         Ebwt<String<Dna> >&    ebwtFw  = *twoOrThreeMismatchSearch_ebwtFw;
1738         Ebwt<String<Dna> >&    ebwtBw  = *twoOrThreeMismatchSearch_ebwtBw;
1739         vector<String<Dna5> >& os      = *twoOrThreeMismatchSearch_os;
1740         BitPairReference*      refs    =  twoOrThreeMismatchSearch_refs;
1741         static bool            two     =  twoOrThreeMismatchSearch_two;
1742
1743         // Global initialization
1744         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1745         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1746
1747         ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1748         Unpaired23mmAlignerV1Factory alSEfact(
1749                         ebwtFw,
1750                         &ebwtBw,
1751                         two,
1752                         !nofw,
1753                         !norc,
1754                         _sink,
1755                         *sinkFact,
1756                         NULL, //&cacheFw,
1757                         NULL, //&cacheBw,
1758                         cacheLimit,
1759                         pool,
1760                         refs,
1761                         os,
1762                         !noMaqRound,
1763                         !better,
1764                         strandFix,
1765                         rangeMode,
1766                         verbose,
1767                         quiet,
1768                         seed);
1769         Paired23mmAlignerV1Factory alPEfact(
1770                         ebwtFw,
1771                         &ebwtBw,
1772                         color,
1773                         !nofw,
1774                         !norc,
1775                         useV1,
1776                         two,
1777                         _sink,
1778                         *sinkFact,
1779                         mate1fw,
1780                         mate2fw,
1781                         minInsert,
1782                         maxInsert,
1783                         dontReconcileMates,
1784                         mhits,       // for symCeiling
1785                         mixedThresh,
1786                         mixedAttemptLim,
1787                         NULL, //&cacheFw,
1788                         NULL, //&cacheBw,
1789                         cacheLimit,
1790                         pool,
1791                         refs, os,
1792                         reportSe,
1793                         !noMaqRound,
1794                         !better,
1795                         strandFix,
1796                         rangeMode,
1797                         verbose,
1798                         quiet,
1799                         seed);
1800         {
1801                 MixedMultiAligner multi(
1802                                 prefetchWidth,
1803                                 qUpto,
1804                                 alSEfact,
1805                                 alPEfact,
1806                                 *patsrcFact);
1807                 // Run that mother
1808                 multi.run();
1809                 // MultiAligner must be destroyed before patsrcFact
1810         }
1811
1812         delete patsrcFact;
1813         delete sinkFact;
1814         delete pool;
1815 #ifdef BOWTIE_PTHREADS
1816         if(tid > 0) pthread_exit(NULL);
1817 #endif
1818         return NULL;
1819 }
1820
1821 static void* twoOrThreeMismatchSearchWorkerFull(void *vp) {
1822         TWOTHREE_WORKER_SETUP();
1823         Ebwt<String<Dna> >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw;
1824         Ebwt<String<Dna> >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw;
1825         const BitPairReference* refs = twoOrThreeMismatchSearch_refs;
1826         GreedyDFSRangeSource btr1(
1827                 &ebwtFw, params,
1828                 refs,           // reference sequence (for colorspace)
1829                 0xffffffff,     // qualThresh
1830                 // Do not impose maximums in 2/3-mismatch mode
1831                 0xffffffff,     // max backtracks (no limit)
1832                 0,              // reportPartials (don't)
1833                 true,           // reportExacts
1834                 rangeMode,      // reportRanges
1835                 NULL,           // seedlings
1836                 NULL,           // mutations
1837                 verbose,        // verbose
1838                 &os,
1839                 false);         // considerQuals
1840         GreedyDFSRangeSource bt2(
1841                 &ebwtBw, params,
1842                 refs,           // reference sequence (for colorspace)
1843                 0xffffffff,     // qualThresh
1844                 // Do not impose maximums in 2/3-mismatch mode
1845                 0xffffffff,     // max backtracks (no limit)
1846                 0,              // reportPartials (no)
1847                 true,           // reportExacts
1848                 rangeMode,      // reportRanges
1849                 NULL,           // seedlings
1850                 NULL,           // mutations
1851                 verbose,        // verbose
1852                 &os,
1853                 false);         // considerQuals
1854         GreedyDFSRangeSource bt3(
1855                 &ebwtFw, params,
1856                 refs,           // reference sequence (for colorspace)
1857                 0xffffffff,     // qualThresh (none)
1858                 // Do not impose maximums in 2/3-mismatch mode
1859                 0xffffffff,     // max backtracks (no limit)
1860                 0,              // reportPartials (don't)
1861                 true,           // reportExacts
1862                 rangeMode,      // reportRanges
1863                 NULL,           // seedlings
1864                 NULL,           // mutations
1865                 verbose,        // verbose
1866                 &os,
1867                 false);         // considerQuals
1868         GreedyDFSRangeSource bthh3(
1869                 &ebwtFw, params,
1870                 refs,           // reference sequence (for colorspace)
1871                 0xffffffff,     // qualThresh
1872                 // Do not impose maximums in 2/3-mismatch mode
1873                 0xffffffff,     // max backtracks (no limit)
1874                 0,              // reportPartials (don't)
1875                 true,           // reportExacts
1876                 rangeMode,      // reportRanges
1877                 NULL,           // seedlings
1878                 NULL,           // mutations
1879                 verbose,        // verbose
1880                 &os,
1881                 false,          // considerQuals
1882                 true);          // halfAndHalf
1883         bool skipped = false;
1884         while(true) { // Read read-in loop
1885                 FINISH_READ(patsrc);
1886                 GET_READ(patsrc);
1887                 patid += 0; // kill unused variable warning
1888                 uint32_t plen = length(patFw);
1889                 uint32_t s = plen;
1890                 uint32_t s3 = s >> 1; // length of 3' half of seed
1891                 uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
1892                 #define DONEMASK_SET(p)
1893                 #include "search_23mm_phase1.c"
1894                 #include "search_23mm_phase2.c"
1895                 #include "search_23mm_phase3.c"
1896                 #undef DONEMASK_SET
1897         }
1898         FINISH_READ(patsrc);
1899         // Threads join at end of Phase 1
1900         WORKER_EXIT();
1901 }
1902
1903 template<typename TStr>
1904 static void twoOrThreeMismatchSearchFull(
1905                 PairedPatternSource& _patsrc,   /// pattern source
1906                 HitSink& _sink,                 /// hit sink
1907                 Ebwt<TStr>& ebwtFw,             /// index of original text
1908                 Ebwt<TStr>& ebwtBw,             /// index of mirror text
1909                 vector<String<Dna5> >& os,      /// text strings, if available (empty otherwise)
1910                 bool two = true)                /// true -> 2, false -> 3
1911 {
1912         // Global initialization
1913         assert(!ebwtFw.isInMemory());
1914         assert(!ebwtBw.isInMemory());
1915         {
1916                 // Load the other half of the index into memory
1917                 Timer _t(cerr, "Time loading forward index: ", timing);
1918                 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1919         }
1920         {
1921                 // Load the other half of the index into memory
1922                 Timer _t(cerr, "Time loading mirror index: ", timing);
1923                 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1924         }
1925         // Create range caches, which are shared among all aligners
1926         BitPairReference *refs = NULL;
1927         bool pair = mates1.size() > 0 || mates12.size() > 0;
1928         if(color || (pair && mixedThresh < 0xffffffff)) {
1929                 Timer _t(cerr, "Time loading reference: ", timing);
1930                 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1931                 if(!refs->loaded()) throw 1;
1932         }
1933         twoOrThreeMismatchSearch_refs     = refs;
1934         twoOrThreeMismatchSearch_patsrc   = &_patsrc;
1935         twoOrThreeMismatchSearch_sink     = &_sink;
1936         twoOrThreeMismatchSearch_ebwtFw   = &ebwtFw;
1937         twoOrThreeMismatchSearch_ebwtBw   = &ebwtBw;
1938         twoOrThreeMismatchSearch_os       = &os;
1939         twoOrThreeMismatchSearch_doneMask = NULL;
1940         twoOrThreeMismatchSearch_hitMask  = NULL;
1941         twoOrThreeMismatchSearch_two      = two;
1942
1943 #ifdef BOWTIE_PTHREADS
1944         AutoArray<pthread_t> threads(nthreads-1);
1945         AutoArray<int> tids(nthreads-1);
1946 #endif
1947     CHUD_START();
1948     {
1949                 Timer _t(cerr, "End-to-end 2/3-mismatch full-index search: ", timing);
1950 #ifdef BOWTIE_PTHREADS
1951                 for(int i = 0; i < nthreads-1; i++) {
1952                         tids[i] = i+1;
1953                         if(stateful)
1954                                 createThread(&threads[i], twoOrThreeMismatchSearchWorkerStateful, (void *)&tids[i]);
1955                         else
1956                                 createThread(&threads[i], twoOrThreeMismatchSearchWorkerFull, (void *)&tids[i]);
1957                 }
1958 #endif
1959                 int tmp = 0;
1960                 if(stateful) twoOrThreeMismatchSearchWorkerStateful((void*)&tmp);
1961                 else         twoOrThreeMismatchSearchWorkerFull((void*)&tmp);
1962 #ifdef BOWTIE_PTHREADS
1963                 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1964 #endif
1965     }
1966         if(refs != NULL) delete refs;
1967         return;
1968 }
1969
1970 static PairedPatternSource*     seededQualSearch_patsrc;
1971 static HitSink*                 seededQualSearch_sink;
1972 static Ebwt<String<Dna> >*      seededQualSearch_ebwtFw;
1973 static Ebwt<String<Dna> >*      seededQualSearch_ebwtBw;
1974 static vector<String<Dna5> >*   seededQualSearch_os;
1975 static SyncBitset*              seededQualSearch_doneMask;
1976 static SyncBitset*              seededQualSearch_hitMask;
1977 static PartialAlignmentManager* seededQualSearch_pamFw;
1978 static PartialAlignmentManager* seededQualSearch_pamRc;
1979 static int                      seededQualSearch_qualCutoff;
1980 static BitPairReference*        seededQualSearch_refs;
1981
1982 #define SEEDEDQUAL_WORKER_SETUP() \
1983         int tid = *((int*)vp); \
1984         PairedPatternSource&     _patsrc    = *seededQualSearch_patsrc;    \
1985         HitSink&                 _sink      = *seededQualSearch_sink;      \
1986         vector<String<Dna5> >&   os         = *seededQualSearch_os;        \
1987         int                      qualCutoff = seededQualSearch_qualCutoff; \
1988         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); \
1989         PatternSourcePerThread* patsrc = patsrcFact->create(); \
1990         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink); \
1991         HitSinkPerThread* sink = sinkFact->create(); \
1992         /* Per-thread initialization */ \
1993         EbwtSearchParams<String<Dna> > params( \
1994                 *sink,       /* HitSink */ \
1995                 os,          /* reference sequences */ \
1996                 true,        /* read is forward */ \
1997                 true);       /* index is forward */
1998
1999 static void* seededQualSearchWorkerFull(void *vp) {
2000         SEEDEDQUAL_WORKER_SETUP();
2001         Ebwt<String<Dna> >& ebwtFw = *seededQualSearch_ebwtFw;
2002         Ebwt<String<Dna> >& ebwtBw = *seededQualSearch_ebwtBw;
2003         PartialAlignmentManager * pamRc = NULL;
2004         PartialAlignmentManager * pamFw = NULL;
2005         if(seedMms > 0) {
2006                 pamRc = new PartialAlignmentManager(64);
2007                 pamFw = new PartialAlignmentManager(64);
2008         }
2009         vector<PartialAlignment> pals;
2010         const BitPairReference* refs = seededQualSearch_refs;
2011         // GreedyDFSRangeSource for finding exact hits for the forward-
2012         // oriented read
2013         GreedyDFSRangeSource btf1(
2014                 &ebwtFw, params,
2015                 refs,           // reference sequence (for colorspace)
2016                 qualCutoff,            // qualThresh
2017                 maxBtsBetter,          // max backtracks
2018                 0,                     // reportPartials (don't)
2019                 true,                  // reportExacts
2020                 rangeMode,             // reportRanges
2021                 NULL,                  // seedlings
2022                 NULL,                  // mutations
2023                 verbose,               // verbose
2024                 &os,
2025                 false);                // considerQuals
2026         GreedyDFSRangeSource bt1(
2027                 &ebwtFw, params,
2028                 refs,           // reference sequence (for colorspace)
2029                 qualCutoff,            // qualThresh
2030                 maxBtsBetter,          // max backtracks
2031                 0,                     // reportPartials (don't)
2032                 true,                  // reportExacts
2033                 rangeMode,             // reportRanges
2034                 NULL,                  // seedlings
2035                 NULL,                  // mutations
2036                 verbose,               // verbose
2037                 &os,                   // reference sequences
2038                 true,                  // considerQuals
2039                 false, !noMaqRound);
2040         // GreedyDFSRangeSource to search for hits for cases 1F, 2F, 3F
2041         GreedyDFSRangeSource btf2(
2042                 &ebwtBw, params,
2043                 refs,           // reference sequence (for colorspace)
2044                 qualCutoff,            // qualThresh
2045                 maxBtsBetter,          // max backtracks
2046                 0,                     // reportPartials (no)
2047                 true,                  // reportExacts
2048                 rangeMode,             // reportRanges
2049                 NULL,                  // partial alignment manager
2050                 NULL,                  // mutations
2051                 verbose,               // verbose
2052                 &os,                   // reference sequences
2053                 true,                  // considerQuals
2054                 false, !noMaqRound);
2055         // GreedyDFSRangeSource to search for partial alignments for case 4R
2056         GreedyDFSRangeSource btr2(
2057                 &ebwtBw, params,
2058                 refs,           // reference sequence (for colorspace)
2059                 qualCutoff,            // qualThresh (none)
2060                 maxBtsBetter,          // max backtracks
2061                 seedMms,               // report partials (up to seedMms mms)
2062                 true,                  // reportExacts
2063                 rangeMode,             // reportRanges
2064                 pamRc,                 // partial alignment manager
2065                 NULL,                  // mutations
2066                 verbose,               // verbose
2067                 &os,                   // reference sequences
2068                 true,                  // considerQuals
2069                 false, !noMaqRound);
2070         // GreedyDFSRangeSource to search for seedlings for case 4F
2071         GreedyDFSRangeSource btf3(
2072                 &ebwtFw, params,
2073                 refs,           // reference sequence (for colorspace)
2074                 qualCutoff,            // qualThresh (none)
2075                 maxBtsBetter,          // max backtracks
2076                 seedMms,               // reportPartials (do)
2077                 true,                  // reportExacts
2078                 rangeMode,             // reportRanges
2079                 pamFw,                 // seedlings
2080                 NULL,                  // mutations
2081                 verbose,               // verbose
2082                 &os,                   // reference sequences
2083                 true,                  // considerQuals
2084                 false, !noMaqRound);
2085         // GreedyDFSRangeSource to search for hits for case 4R by extending
2086         // the partial alignments found in Phase 2
2087         GreedyDFSRangeSource btr3(
2088                 &ebwtFw, params,
2089                 refs,           // reference sequence (for colorspace)
2090                 qualCutoff, // qualThresh
2091                 maxBtsBetter,          // max backtracks
2092                 0,       // reportPartials (don't)
2093                 true,    // reportExacts
2094                 rangeMode,// reportRanges
2095                 NULL,    // seedlings
2096                 NULL,    // mutations
2097                 verbose, // verbose
2098                 &os,     // reference sequences
2099                 true,    // considerQuals
2100                 false, !noMaqRound);
2101         // The half-and-half GreedyDFSRangeSource
2102         GreedyDFSRangeSource btr23(
2103                 &ebwtFw, params,
2104                 refs,           // reference sequence (for colorspace)
2105                 qualCutoff, // qualThresh
2106                 maxBtsBetter,          // max backtracks
2107                 0,       // reportPartials (don't)
2108                 true,    // reportExacts
2109                 rangeMode,// reportRanges
2110                 NULL,    // seedlings
2111                 NULL,    // mutations
2112                 verbose, // verbose
2113                 &os,
2114                 true,    // considerQuals
2115                 true,    // halfAndHalf
2116                 !noMaqRound);
2117         // GreedyDFSRangeSource to search for hits for case 4F by extending
2118         // the partial alignments found in Phase 3
2119         GreedyDFSRangeSource btf4(
2120                 &ebwtBw, params,
2121                 refs,           // reference sequence (for colorspace)
2122                 qualCutoff, // qualThresh
2123                 maxBtsBetter,          // max backtracks
2124                 0,       // reportPartials (don't)
2125                 true,    // reportExacts
2126                 rangeMode,// reportRanges
2127                 NULL,    // seedlings
2128                 NULL,    // mutations
2129                 verbose, // verbose
2130                 &os,     // reference sequences
2131                 true,    // considerQuals
2132                 false, !noMaqRound);
2133         // Half-and-half GreedyDFSRangeSource for forward read
2134         GreedyDFSRangeSource btf24(
2135                 &ebwtBw, params,
2136                 refs,           // reference sequence (for colorspace)
2137                 qualCutoff, // qualThresh
2138                 maxBtsBetter,          // max backtracks
2139                 0,       // reportPartials (don't)
2140                 true,    // reportExacts
2141                 rangeMode,// reportRanges
2142                 NULL,    // seedlings
2143                 NULL,    // mutations
2144                 verbose, // verbose
2145                 &os,
2146                 true,    // considerQuals
2147                 true,    // halfAndHalf
2148                 !noMaqRound);
2149         String<QueryMutation> muts;
2150         bool skipped = false;
2151         while(true) {
2152                 FINISH_READ(patsrc);
2153                 GET_READ(patsrc);
2154                 size_t plen = length(patFw);
2155                 uint32_t s = seedLen;
2156                 uint32_t s3 = (s >> 1); /* length of 3' half of seed */
2157                 uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */
2158                 uint32_t qs = min<uint32_t>(plen, s);
2159                 uint32_t qs3 = qs >> 1;
2160                 uint32_t qs5 = (qs >> 1) + (qs & 1);
2161                 #define DONEMASK_SET(p)
2162                 #include "search_seeded_phase1.c"
2163                 #include "search_seeded_phase2.c"
2164                 #include "search_seeded_phase3.c"
2165                 #include "search_seeded_phase4.c"
2166                 #undef DONEMASK_SET
2167         }
2168         FINISH_READ(patsrc);
2169         if(seedMms > 0) {
2170                 delete pamRc;
2171                 delete pamFw;
2172         }
2173         WORKER_EXIT();
2174 }
2175
2176 static void* seededQualSearchWorkerFullStateful(void *vp) {
2177         int tid = *((int*)vp);
2178         PairedPatternSource&     _patsrc    = *seededQualSearch_patsrc;
2179         HitSink&                 _sink      = *seededQualSearch_sink;
2180         Ebwt<String<Dna> >&      ebwtFw     = *seededQualSearch_ebwtFw;
2181         Ebwt<String<Dna> >&      ebwtBw     = *seededQualSearch_ebwtBw;
2182         vector<String<Dna5> >&   os         = *seededQualSearch_os;
2183         int                      qualCutoff = seededQualSearch_qualCutoff;
2184         BitPairReference*        refs       = seededQualSearch_refs;
2185
2186         // Global initialization
2187         PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
2188         HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
2189         ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
2190
2191         AlignerMetrics *metrics = NULL;
2192         if(stats) {
2193                 metrics = new AlignerMetrics();
2194         }
2195         UnpairedSeedAlignerFactory alSEfact(
2196                         ebwtFw,
2197                         &ebwtBw,
2198                         !nofw,
2199                         !norc,
2200                         seedMms,
2201                         seedLen,
2202                         qualCutoff,
2203                         maxBts,
2204                         _sink,
2205                         *sinkFact,
2206                         NULL, //&cacheFw,
2207                         NULL, //&cacheBw,
2208                         cacheLimit,
2209                         pool,
2210                         refs,
2211                         os,
2212                         !noMaqRound,
2213                         !better,
2214                         strandFix,
2215                         rangeMode,
2216                         verbose,
2217                         quiet,
2218                         seed,
2219                         metrics);
2220         PairedSeedAlignerFactory alPEfact(
2221                         ebwtFw,
2222                         &ebwtBw,
2223                         color,
2224                         useV1,
2225                         !nofw,
2226                         !norc,
2227                         seedMms,
2228                         seedLen,
2229                         qualCutoff,
2230                         maxBts,
2231                         _sink,
2232                         *sinkFact,
2233                         mate1fw,
2234                         mate2fw,
2235                         minInsert,
2236                         maxInsert,
2237                         dontReconcileMates,
2238                         mhits,       // for symCeiling
2239                         mixedThresh,
2240                         mixedAttemptLim,
2241                         NULL, //&cacheFw,
2242                         NULL, //&cacheBw,
2243                         cacheLimit,
2244                         pool,
2245                         refs,
2246                         os,
2247                         reportSe,
2248                         !noMaqRound,
2249                         !better,
2250                         strandFix,
2251                         rangeMode,
2252                         verbose,
2253                         quiet,
2254                         seed);
2255         {
2256                 MixedMultiAligner multi(
2257                                 prefetchWidth,
2258                                 qUpto,
2259                                 alSEfact,
2260                                 alPEfact,
2261                                 *patsrcFact);
2262                 // Run that mother
2263                 multi.run();
2264                 // MultiAligner must be destroyed before patsrcFact
2265         }
2266         if(metrics != NULL) {
2267                 metrics->printSummary();
2268                 delete metrics;
2269         }
2270
2271         delete patsrcFact;
2272         delete sinkFact;
2273         delete pool;
2274 #ifdef BOWTIE_PTHREADS
2275         if(tid > 0) {
2276                 pthread_exit(NULL);
2277         }
2278 #endif
2279         return NULL;
2280 }
2281
2282 /**
2283  * Search for a good alignments for each read using criteria that
2284  * correspond somewhat faithfully to Maq's.  Search is aided by a pair
2285  * of Ebwt indexes, one for the original references, and one for the
2286  * transpose of the references.  Neither index should be loaded upon
2287  * entry to this function.
2288  *
2289  * Like Maq, we treat the first 24 base pairs of the read (those
2290  * closest to the 5' end) differently from the remainder of the read.
2291  * We call the first 24 base pairs the "seed."
2292  */
2293 template<typename TStr>
2294 static void seededQualCutoffSearchFull(
2295         int seedLen,                    /// length of seed (not a maq option)
2296         int qualCutoff,                 /// maximum sum of mismatch qualities
2297                                         /// like maq map's -e option
2298                                         /// default: 70
2299         int seedMms,                    /// max # mismatches allowed in seed
2300                                         /// (like maq map's -n option)
2301                                         /// Can only be 1 or 2, default: 1
2302         PairedPatternSource& _patsrc,   /// pattern source
2303         HitSink& _sink,                 /// hit sink
2304         Ebwt<TStr>& ebwtFw,             /// index of original text
2305         Ebwt<TStr>& ebwtBw,             /// index of mirror text
2306         vector<String<Dna5> >& os)      /// text strings, if available (empty otherwise)
2307 {
2308         // Global intialization
2309         assert_leq(seedMms, 3);
2310
2311         seededQualSearch_patsrc   = &_patsrc;
2312         seededQualSearch_sink     = &_sink;
2313         seededQualSearch_ebwtFw   = &ebwtFw;
2314         seededQualSearch_ebwtBw   = &ebwtBw;
2315         seededQualSearch_os       = &os;
2316         seededQualSearch_doneMask = NULL;
2317         seededQualSearch_hitMask  = NULL;
2318         seededQualSearch_pamFw    = NULL;
2319         seededQualSearch_pamRc    = NULL;
2320         seededQualSearch_qualCutoff = qualCutoff;
2321
2322         // Create range caches, which are shared among all aligners
2323         BitPairReference *refs = NULL;
2324         bool pair = mates1.size() > 0 || mates12.size() > 0;
2325         if(color || (pair && mixedThresh < 0xffffffff)) {
2326                 Timer _t(cerr, "Time loading reference: ", timing);
2327                 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
2328                 if(!refs->loaded()) throw 1;
2329         }
2330         seededQualSearch_refs = refs;
2331
2332 #ifdef BOWTIE_PTHREADS
2333         AutoArray<pthread_t> threads(nthreads-1);
2334         AutoArray<int> tids(nthreads-1);
2335 #endif
2336
2337         SWITCH_TO_FW_INDEX();
2338         assert(!ebwtBw.isInMemory());
2339         {
2340                 // Load the other half of the index into memory
2341                 Timer _t(cerr, "Time loading mirror index: ", timing);
2342                 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
2343         }
2344         CHUD_START();
2345         {
2346                 // Phase 1: Consider cases 1R and 2R
2347                 Timer _t(cerr, "Seeded quality full-index search: ", timing);
2348 #ifdef BOWTIE_PTHREADS
2349                 for(int i = 0; i < nthreads-1; i++) {
2350                         tids[i] = i+1;
2351                         if(stateful) createThread(&threads[i],
2352                                                   seededQualSearchWorkerFullStateful,
2353                                                   (void*)&tids[i]);
2354                         else         createThread(&threads[i],
2355                                                   seededQualSearchWorkerFull,
2356                                                   (void*)&tids[i]);
2357                 }
2358 #endif
2359                 int tmp = 0;
2360                 if(stateful) seededQualSearchWorkerFullStateful((void*)&tmp);
2361                 else         seededQualSearchWorkerFull((void*)&tmp);
2362 #ifdef BOWTIE_PTHREADS
2363                 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
2364 #endif
2365         }
2366         if(refs != NULL) {
2367                 delete refs;
2368         }
2369         ebwtBw.evictFromMemory();
2370 }
2371
2372 /**
2373  * Return a new dynamically allocated PatternSource for the given
2374  * format, using the given list of strings as the filenames to read
2375  * from or as the sequences themselves (i.e. if -c was used).
2376  */
2377 static PatternSource*
2378 patsrcFromStrings(int format,
2379                   const vector<string>& reads,
2380                   const vector<string>* quals)
2381 {
2382         switch(format) {
2383                 case FASTA:
2384                         return new FastaPatternSource (seed, reads, quals, color,
2385                                                        randomizeQuals,
2386                                                        useSpinlock,
2387                                                        patDumpfile, verbose,
2388                                                        trim3, trim5,
2389                                                        solexaQuals, phred64Quals,
2390                                                        integerQuals,
2391                                                        skipReads);
2392                 case FASTA_CONT:
2393                         return new FastaContinuousPatternSource (
2394                                                        seed, reads, fastaContLen,
2395                                                        fastaContFreq,
2396                                                        useSpinlock,
2397                                                        patDumpfile, verbose,
2398                                                        skipReads);
2399                 case RAW:
2400                         return new RawPatternSource   (seed, reads, color,
2401                                                        randomizeQuals,
2402                                                        useSpinlock,
2403                                                        patDumpfile, verbose,
2404                                                        trim3, trim5,
2405                                                        skipReads);
2406                 case FASTQ:
2407                         return new FastqPatternSource (seed, reads, color,
2408                                                        randomizeQuals,
2409                                                        useSpinlock,
2410                                                        patDumpfile, verbose,
2411                                                        trim3, trim5,
2412                                                        solexaQuals, phred64Quals,
2413                                                        integerQuals, fuzzy,
2414                                                        skipReads);
2415                 case TAB_MATE:
2416                         return new TabbedPatternSource(seed, reads, color,
2417                                                        randomizeQuals,
2418                                                        useSpinlock,
2419                                                        patDumpfile, verbose,
2420                                                        trim3, trim5,
2421                                                        skipReads);
2422                 case CMDLINE:
2423                         return new VectorPatternSource(seed, reads, color,
2424                                                        randomizeQuals,
2425                                                        useSpinlock,
2426                                                        patDumpfile, verbose,
2427                                                        trim3, trim5,
2428                                                        skipReads);
2429                 case INPUT_CHAIN:
2430                         return new ChainPatternSource (seed, reads, useSpinlock, patDumpfile,
2431                                                        verbose, skipReads);
2432                 case RANDOM:
2433                         return new RandomPatternSource(seed, 2000000, lenRandomReads,
2434                                                        useSpinlock, patDumpfile,
2435                                                        verbose);
2436                 default: {
2437                         cerr << "Internal error; bad patsrc format: " << format << endl;
2438                         throw 1;
2439                 }
2440         }
2441 }
2442
2443 #define PASS_DUMP_FILES dumpAlBase, dumpUnalBase, dumpMaxBase
2444
2445 static string argstr;
2446
2447 template<typename TStr>
2448 static void driver(const char * type,
2449                    const string& ebwtFileBase,
2450                    const string& query,
2451                    const vector<string>& queries,
2452                    const vector<string>& qualities,
2453                    const string& outfile)
2454 {
2455         if(verbose || startVerbose)  {
2456                 cerr << "Entered driver(): "; logTime(cerr, true);
2457         }
2458         // Vector of the reference sequences; used for sanity-checking
2459         vector<String<Dna5> > os;
2460         // Read reference sequences from the command-line or from a FASTA file
2461         if(!origString.empty()) {
2462                 // Determine if it's a file by looking at whether it has a FASTA-like
2463                 // extension
2464                 size_t len = origString.length();
2465                 if((len >= 6 && origString.substr(len-6) == ".fasta") ||
2466                    (len >= 4 && origString.substr(len-4) == ".mfa")   ||
2467                    (len >= 4 && origString.substr(len-4) == ".fas")   ||
2468                    (len >= 4 && origString.substr(len-4) == ".fna")   ||
2469                    (len >= 3 && origString.substr(len-3) == ".fa"))
2470                 {
2471                         // Read fasta file
2472                         vector<string> origFiles;
2473                         tokenize(origString, ",", origFiles);
2474                         readSequenceFiles<String<Dna5>, Fasta>(origFiles, os);
2475                 } else {
2476                         // Read sequence
2477                         readSequenceString(origString, os);
2478                 }
2479         }
2480         // Adjust
2481         adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
2482
2483         vector<PatternSource*> patsrcs_a;
2484         vector<PatternSource*> patsrcs_b;
2485         vector<PatternSource*> patsrcs_ab;
2486
2487         // If there were any first-mates specified, we will operate in
2488         // stateful mode
2489         bool paired = mates1.size() > 0 || mates12.size() > 0;
2490         if(paired) stateful = true;
2491
2492         // Create list of pattern sources for paired reads appearing
2493         // interleaved in a single file
2494         if(verbose || startVerbose) {
2495                 cerr << "Creating paired-end patsrcs: "; logTime(cerr, true);
2496         }
2497         for(size_t i = 0; i < mates12.size(); i++) {
2498                 const vector<string>* qs = &mates12;
2499                 vector<string> tmp;
2500                 if(fileParallel) {
2501                         // Feed query files one to each PatternSource
2502                         qs = &tmp;
2503                         tmp.push_back(mates12[i]);
2504                         assert_eq(1, tmp.size());
2505                 }
2506                 patsrcs_ab.push_back(patsrcFromStrings(format, *qs, NULL));
2507                 if(!fileParallel) {
2508                         break;
2509                 }
2510         }
2511
2512         // Create list of pattern sources for paired reads
2513         for(size_t i = 0; i < mates1.size(); i++) {
2514                 const vector<string>* qs = &mates1;
2515                 const vector<string>* quals = &qualities1;
2516                 vector<string> tmpSeq;
2517                 vector<string> tmpQual;
2518                 if(fileParallel) {
2519                         // Feed query files one to each PatternSource
2520                         qs = &tmpSeq;
2521                         tmpSeq.push_back(mates1[i]);
2522                         quals = &tmpSeq;
2523                         tmpQual.push_back(qualities1[i]);
2524                         assert_eq(1, tmpSeq.size());
2525                 }
2526                 if(quals->empty()) quals = NULL;
2527                 patsrcs_a.push_back(patsrcFromStrings(format, *qs, quals));
2528                 if(!fileParallel) {
2529                         break;
2530                 }
2531         }
2532
2533         // Create list of pattern sources for paired reads
2534         for(size_t i = 0; i < mates2.size(); i++) {
2535                 const vector<string>* qs = &mates2;
2536                 const vector<string>* quals = &qualities2;
2537                 vector<string> tmpSeq;
2538                 vector<string> tmpQual;
2539                 if(fileParallel) {
2540                         // Feed query files one to each PatternSource
2541                         qs = &tmpSeq;
2542                         tmpSeq.push_back(mates2[i]);
2543                         quals = &tmpQual;
2544                         tmpQual.push_back(qualities2[i]);
2545                         assert_eq(1, tmpSeq.size());
2546                 }
2547                 if(quals->empty()) quals = NULL;
2548                 patsrcs_b.push_back(patsrcFromStrings(format, *qs, quals));
2549                 if(!fileParallel) {
2550                         break;
2551                 }
2552         }
2553         // All mates/mate files must be paired
2554         assert_eq(patsrcs_a.size(), patsrcs_b.size());
2555
2556         // Create list of pattern sources for the unpaired reads
2557         if(verbose || startVerbose) {
2558                 cerr << "Creating single-end patsrcs: "; logTime(cerr, true);
2559         }
2560         for(size_t i = 0; i < queries.size(); i++) {
2561                 const vector<string>* qs = &queries;
2562                 const vector<string>* quals = &qualities;
2563                 PatternSource* patsrc = NULL;
2564                 vector<string> tmpSeq;
2565                 vector<string> tmpQual;
2566                 if(fileParallel) {
2567                         // Feed query files one to each PatternSource
2568                         qs = &tmpSeq;
2569                         tmpSeq.push_back(queries[i]);
2570                         quals = &tmpQual;
2571                         tmpQual.push_back(qualities[i]);
2572                         assert_eq(1, tmpSeq.size());
2573                 }
2574                 if(quals->empty()) quals = NULL;
2575                 patsrc = patsrcFromStrings(format, *qs, quals);
2576                 assert(patsrc != NULL);
2577                 patsrcs_a.push_back(patsrc);
2578                 patsrcs_b.push_back(NULL);
2579                 if(!fileParallel) {
2580                         break;
2581                 }
2582         }
2583
2584         if(verbose || startVerbose) {
2585                 cerr << "Creating PatternSource: "; logTime(cerr, true);
2586         }
2587         PairedPatternSource *patsrc = NULL;
2588         if(mates12.size() > 0) {
2589                 patsrc = new PairedSoloPatternSource(patsrcs_ab, seed);
2590         } else {
2591                 patsrc = new PairedDualPatternSource(patsrcs_a, patsrcs_b, seed);
2592         }
2593
2594         // Open hit output file
2595         if(verbose || startVerbose) {
2596                 cerr << "Opening hit output file: "; logTime(cerr, true);
2597         }
2598         OutFileBuf *fout;
2599         if(!outfile.empty()) {
2600                 if(refOut) {
2601                         fout = NULL;
2602                         if(!quiet) {
2603                                 cerr << "Warning: ignoring alignment output file " << outfile << " because --refout was specified" << endl;
2604                         }
2605                 } else {
2606                         fout = new OutFileBuf(outfile.c_str(), false);
2607                 }
2608         } else {
2609                 if(outType == OUTPUT_CHAIN && !refOut) {
2610                         cerr << "Error: Must specify an output file when output mode is --chain" << endl;
2611                         throw 1;
2612                 }
2613                 fout = new OutFileBuf();
2614         }
2615         ReferenceMap* rmap = NULL;
2616         if(refMapFile != NULL) {
2617                 if(verbose || startVerbose) {
2618                         cerr << "About to load in a reference map file with name "
2619                              << refMapFile << ": "; logTime(cerr, true);
2620                 }
2621                 rmap = new ReferenceMap(refMapFile, !noRefNames);
2622         }
2623         AnnotationMap* amap = NULL;
2624         if(annotMapFile != NULL) {
2625                 if(verbose || startVerbose) {
2626                         cerr << "About to load in an annotation map file with name "
2627                              << annotMapFile << ": "; logTime(cerr, true);
2628                 }
2629                 amap = new AnnotationMap(annotMapFile);
2630         }
2631         // Initialize Ebwt object and read in header
2632         if(verbose || startVerbose) {
2633                 cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
2634         }
2635         Ebwt<TStr> ebwt(adjustedEbwtFileBase,
2636                         color,  // index is colorspace
2637                         -1,     // don't care about entireReverse
2638                         true,     // index is for the forward direction
2639                         /* overriding: */ offRate,
2640                         /* overriding: */ isaRate,
2641                         useMm,    // whether to use memory-mapped files
2642                         useShmem, // whether to use shared memory
2643                         mmSweep,  // sweep memory-mapped files
2644                         !noRefNames, // load names?
2645                         rmap,     // reference map, or NULL if none is needed
2646                         verbose, // whether to be talkative
2647                         startVerbose, // talkative during initialization
2648                         false /*passMemExc*/,
2649                         sanityCheck);
2650         Ebwt<TStr>* ebwtBw = NULL;
2651         // We need the mirror index if mismatches are allowed
2652         if(mismatches > 0 || maqLike) {
2653                 if(verbose || startVerbose) {
2654                         cerr << "About to initialize rev Ebwt: "; logTime(cerr, true);
2655                 }
2656                 ebwtBw = new Ebwt<TStr>(
2657                         adjustedEbwtFileBase + ".rev",
2658                         color,  // index is colorspace
2659                         -1,     // don't care about entireReverse
2660                         false, // index is for the reverse direction
2661                         /* overriding: */ offRate,
2662                         /* overriding: */ isaRate,
2663                         useMm,    // whether to use memory-mapped files
2664                         useShmem, // whether to use shared memory
2665                         mmSweep,  // sweep memory-mapped files
2666                         !noRefNames, // load names?
2667                         rmap,     // reference map, or NULL if none is needed
2668                         verbose,  // whether to be talkative
2669                         startVerbose, // talkative during initialization
2670                         false /*passMemExc*/,
2671                         sanityCheck);
2672         }
2673         if(!os.empty()) {
2674                 for(size_t i = 0; i < os.size(); i++) {
2675                         size_t olen = seqan::length(os[i]);
2676                         int longestStretch = 0;
2677                         int curStretch = 0;
2678                         for(size_t j = 0; j < olen; j++) {
2679                                 if((int)os[i][j] < 4) {
2680                                         curStretch++;
2681                                         if(curStretch > longestStretch) longestStretch = curStretch;
2682                                 } else {
2683                                         curStretch = 0;
2684                                 }
2685                         }
2686                         if(longestStretch < (color ? 2 : 1)) {
2687                                 os.erase(os.begin() + i);
2688                                 i--;
2689                         }
2690                 }
2691         }
2692         if(sanityCheck && !os.empty()) {
2693                 // Sanity check number of patterns and pattern lengths in Ebwt
2694                 // against original strings
2695                 assert_eq(os.size(), ebwt.nPat());
2696                 for(size_t i = 0; i < os.size(); i++) {
2697                         assert_eq(length(os[i]), ebwt.plen()[i] + (color ? 1 : 0));
2698                 }
2699                 ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
2700                 ebwt.checkOrigs(os, color, false);
2701                 ebwt.evictFromMemory();
2702         }
2703         {
2704                 Timer _t(cerr, "Time searching: ", timing);
2705                 if(verbose || startVerbose) {
2706                         cerr << "Creating HitSink: "; logTime(cerr, true);
2707                 }
2708                 // Set up hit sink; if sanityCheck && !os.empty() is true,
2709                 // then instruct the sink to "retain" hits in a vector in
2710                 // memory so that we can easily sanity check them later on
2711                 HitSink *sink;
2712                 RecalTable *table = NULL;
2713                 if(recal) {
2714                         table = new RecalTable(recalMaxCycle, recalMaxQual, recalQualShift);
2715                 }
2716                 vector<string>* refnames = &ebwt.refnames();
2717                 if(noRefNames) refnames = NULL;
2718                 switch(outType) {
2719                         case OUTPUT_FULL:
2720                                 if(refOut) {
2721                                         sink = new VerboseHitSink(
2722                                                         ebwt.nPat(), offBase,
2723                                                         colorSeq, colorQual, printCost,
2724                                                         suppressOuts, rmap, amap,
2725                                                         fullRef, PASS_DUMP_FILES,
2726                                                         format == TAB_MATE, sampleMax,
2727                                                         table, refnames, partitionSz);
2728                                 } else {
2729                                         sink = new VerboseHitSink(
2730                                                         fout, offBase,
2731                                                         colorSeq, colorQual, printCost,
2732                                                         suppressOuts, rmap, amap,
2733                                                         fullRef, PASS_DUMP_FILES,
2734                                                         format == TAB_MATE, sampleMax,
2735                                                         table, refnames, partitionSz);
2736                                 }
2737                                 break;
2738                         case OUTPUT_SAM:
2739                                 if(refOut) {
2740                                         throw 1;
2741                                 } else {
2742                                         SAMHitSink *sam = new SAMHitSink(
2743                                                         fout, 1, rmap, amap,
2744                                                         fullRef, defaultMapq,
2745                                                         PASS_DUMP_FILES,
2746                                                         format == TAB_MATE, sampleMax,
2747                                                         table, refnames);
2748                                         if(!samNoHead) {
2749                                                 vector<string> refnames;
2750                                                 if(!samNoSQ) {
2751                                                         readEbwtRefnames(adjustedEbwtFileBase, refnames);
2752                                                 }
2753                                                 sam->appendHeaders(
2754                                                                 sam->out(0), ebwt.nPat(),
2755                                                                 refnames, color, samNoSQ, rmap,
2756                                                                 ebwt.plen(), fullRef,
2757                                                                 argstr.c_str(),
2758                                                                 rgs.empty() ? NULL : rgs.c_str());
2759                                         }
2760                                         sink = sam;
2761                                 }
2762                                 break;
2763                         case OUTPUT_CONCISE:
2764                                 if(refOut) {
2765                                         sink = new ConciseHitSink(
2766                                                         ebwt.nPat(), offBase,
2767                                                         PASS_DUMP_FILES,
2768                                                         format == TAB_MATE,  sampleMax,
2769                                                         table, refnames, reportOpps);
2770                                 } else {
2771                                         sink = new ConciseHitSink(
2772                                                         fout, offBase,
2773                                                         PASS_DUMP_FILES,
2774                                                         format == TAB_MATE,  sampleMax,
2775                                                         table, refnames, reportOpps);
2776                                 }
2777                                 break;
2778                         case OUTPUT_CHAIN:
2779                                 assert(stateful);
2780                                 sink = new ChainingHitSink(
2781                                                 fout, strata, amap,
2782                                                 PASS_DUMP_FILES,
2783                                                 true, sampleMax,
2784                                                 table, refnames);
2785                                 break;
2786                         case OUTPUT_NONE:
2787                                 sink = new StubHitSink();
2788                                 break;
2789                         default:
2790                                 cerr << "Invalid output type: " << outType << endl;
2791                                 throw 1;
2792                 }
2793                 if(verbose || startVerbose) {
2794                         cerr << "Dispatching to search driver: "; logTime(cerr, true);
2795                 }
2796                 if(maqLike) {
2797                         seededQualCutoffSearchFull(seedLen,
2798                                                                            qualThresh,
2799                                                                            seedMms,
2800                                                                            *patsrc,
2801                                                                            *sink,
2802                                                                            ebwt,    // forward index
2803                                                                            *ebwtBw, // mirror index (not optional)
2804                                                                            os);     // references, if available
2805                 }
2806                 else if(mismatches > 0) {
2807                         if(mismatches == 1) {
2808                                 assert(ebwtBw != NULL);
2809                                 mismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os);
2810                         } else if(mismatches == 2 || mismatches == 3) {
2811                                 twoOrThreeMismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os, mismatches == 2);
2812                         } else {
2813                                 cerr << "Error: " << mismatches << " is not a supported number of mismatches" << endl;
2814                                 throw 1;
2815                         }
2816                 } else {
2817                         // Search without mismatches
2818                         // Note that --fast doesn't make a difference here because
2819                         // we're only loading half of the index anyway
2820                         exactSearch(*patsrc, *sink, ebwt, os);
2821                 }
2822                 // Evict any loaded indexes from memory
2823                 if(ebwt.isInMemory()) {
2824                         ebwt.evictFromMemory();
2825                 }
2826                 if(ebwtBw != NULL) {
2827                         delete ebwtBw;
2828                 }
2829                 if(!quiet) {
2830                         sink->finish(hadoopOut); // end the hits section of the hit file
2831                 }
2832                 for(size_t i = 0; i < patsrcs_a.size(); i++) {
2833                         assert(patsrcs_a[i] != NULL);
2834                         delete patsrcs_a[i];
2835                 }
2836                 for(size_t i = 0; i < patsrcs_b.size(); i++) {
2837                         if(patsrcs_b[i] != NULL) {
2838                                 delete patsrcs_b[i];
2839                         }
2840                 }
2841                 for(size_t i = 0; i < patsrcs_ab.size(); i++) {
2842                         if(patsrcs_ab[i] != NULL) {
2843                                 delete patsrcs_ab[i];
2844                         }
2845                 }
2846                 delete patsrc;
2847                 delete sink;
2848                 delete amap;
2849                 delete rmap;
2850                 if(fout != NULL) delete fout;
2851         }
2852 }
2853
2854 // C++ name mangling is disabled for the bowtie() function to make it
2855 // easier to use Bowtie as a library.
2856 extern "C" {
2857
2858 /**
2859  * Main bowtie entry function.  Parses argc/argv style command-line
2860  * options, sets global configuration variables, and calls the driver()
2861  * function.
2862  */
2863 int bowtie(int argc, const char **argv) {
2864         try {
2865                 // Reset all global state, including getopt state
2866                 opterr = optind = 1;
2867                 resetOptions();
2868                 for(int i = 0; i < argc; i++) {
2869                         argstr += argv[i];
2870                         if(i < argc-1) argstr += " ";
2871                 }
2872                 string ebwtFile;  // read serialized Ebwt from this file
2873                 string query;   // read query string(s) from this file
2874                 vector<string> queries;
2875                 string outfile; // write query results to this file
2876                 if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
2877                 parseOptions(argc, argv);
2878                 argv0 = argv[0];
2879                 if(showVersion) {
2880                         cout << argv0 << " version " << BOWTIE_VERSION << endl;
2881                         if(sizeof(void*) == 4) {
2882                                 cout << "32-bit" << endl;
2883                         } else if(sizeof(void*) == 8) {
2884                                 cout << "64-bit" << endl;
2885                         } else {
2886                                 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
2887                         }
2888                         cout << "Built on " << BUILD_HOST << endl;
2889                         cout << BUILD_TIME << endl;
2890                         cout << "Compiler: " << COMPILER_VERSION << endl;
2891                         cout << "Options: " << COMPILER_OPTIONS << endl;
2892                         cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
2893                                  << sizeof(int)
2894                                  << ", " << sizeof(long) << ", " << sizeof(long long)
2895                                  << ", " << sizeof(void *) << ", " << sizeof(size_t)
2896                                  << ", " << sizeof(off_t) << "}" << endl;
2897                         return 0;
2898                 }
2899         #ifdef CHUD_PROFILING
2900                 chudInitialize();
2901                 chudAcquireRemoteAccess();
2902         #endif
2903                 {
2904                         Timer _t(cerr, "Overall time: ", timing);
2905                         if(startVerbose) {
2906                                 cerr << "Parsing index and read arguments: "; logTime(cerr, true);
2907                         }
2908
2909                         // Get index basename
2910                         if(optind >= argc) {
2911                                 cerr << "No index, query, or output file specified!" << endl;
2912                                 printUsage(cerr);
2913                                 return 1;
2914                         }
2915                         ebwtFile = argv[optind++];
2916
2917                         // Get query filename
2918                         if(optind >= argc) {
2919                                 if(mates1.size() > 0 || mates12.size() > 0) {
2920                                         query = "";
2921                                 } else {
2922                                         cerr << "No query or output file specified!" << endl;
2923                                         printUsage(cerr);
2924                                         return 1;
2925                                 }
2926                         } else if (mates1.size() == 0 && mates12.size() == 0) {
2927                                 query = argv[optind++];
2928                                 // Tokenize the list of query files
2929                                 tokenize(query, ",", queries);
2930                                 if(queries.size() < 1) {
2931                                         cerr << "Tokenized query file list was empty!" << endl;
2932                                         printUsage(cerr);
2933                                         return 1;
2934                                 }
2935                         }
2936
2937                         // Get output filename
2938                         if(optind < argc) {
2939                                 outfile = argv[optind++];
2940                         }
2941
2942                         // Extra parametesr?
2943                         if(optind < argc) {
2944                                 cerr << "Extra parameter(s) specified: ";
2945                                 for(int i = optind; i < argc; i++) {
2946                                         cerr << "\"" << argv[i] << "\"";
2947                                         if(i < argc-1) cerr << ", ";
2948                                 }
2949                                 cerr << endl;
2950                                 if(mates1.size() > 0) {
2951                                         cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
2952                                                  << "also be specified.  Please run bowtie separately for mates and singles." << endl;
2953                                 }
2954                                 throw 1;
2955                         }
2956
2957                         // Optionally summarize
2958                         if(verbose) {
2959                                 cout << "Input ebwt file: \"" << ebwtFile << "\"" << endl;
2960                                 cout << "Query inputs (DNA, " << file_format_names[format] << "):" << endl;
2961                                 for(size_t i = 0; i < queries.size(); i++) {
2962                                         cout << "  " << queries[i] << endl;
2963                                 }
2964                                 cout << "Quality inputs:" << endl;
2965                                 for(size_t i = 0; i < qualities.size(); i++) {
2966                                         cout << "  " << qualities[i] << endl;
2967                                 }
2968                                 cout << "Output file: \"" << outfile << "\"" << endl;
2969                                 cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
2970                                 cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
2971                         #ifdef NDEBUG
2972                                 cout << "Assertions: disabled" << endl;
2973                         #else
2974                                 cout << "Assertions: enabled" << endl;
2975                         #endif
2976                         }
2977                         if(ipause) {
2978                                 cout << "Press key to continue..." << endl;
2979                                 getchar();
2980                         }
2981                         driver<String<Dna, Alloc<> > >("DNA", ebwtFile, query, queries, qualities, outfile);
2982                         CHUD_STOP();
2983                 }
2984 #ifdef CHUD_PROFILING
2985                 chudReleaseRemoteAccess();
2986 #endif
2987                 return 0;
2988         } catch(exception& e) {
2989                 cerr << "Command: ";
2990                 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
2991                 cerr << endl;
2992                 return 1;
2993         } catch(int e) {
2994                 if(e != 0) {
2995                         cerr << "Command: ";
2996                         for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
2997                         cerr << endl;
2998                 }
2999                 return e;
3000         }
3001 } // bowtie()
3002 } // extern "C"