8 #include <seqan/find.h>
13 #include "assert_helpers.h"
14 #include "endian_swap.h"
17 #include "sequence_io.h"
22 #include "threading.h"
23 #include "range_cache.h"
27 #include "aligner_0mm.h"
28 #include "aligner_1mm.h"
29 #include "aligner_23mm.h"
30 #include "aligner_seed_mm.h"
31 #include "aligner_metrics.h"
34 #include <CHUD/CHUD.h>
38 using namespace seqan;
40 static vector<string> mates1; // mated reads (first mate)
41 static vector<string> mates2; // mated reads (second mate)
42 static vector<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
43 static string adjustedEbwtFileBase;
44 static bool verbose; // be talkative
45 static bool startVerbose; // be talkative at startup
46 bool quiet; // print nothing but the alignments
47 static int sanityCheck; // enable expensive sanity checks
48 static int format; // default read format is FASTQ
49 static string origString; // reference text, or filename(s)
50 static int seed; // srandom() seed
51 static int timing; // whether to report basic timing data
52 static bool allHits; // for multihits, report just one
53 static bool rangeMode; // report BWT ranges instead of ref locs
54 static int showVersion; // just print version and quit?
55 static int ipause; // pause before maching?
56 static uint32_t qUpto; // max # of queries to read
57 static int trim5; // amount to trim from 5' end
58 static int trim3; // amount to trim from 3' end
59 static int reportOpps; // whether to report # of other mappings
60 static int offRate; // keep default offRate
61 static int isaRate; // keep default isaRate
62 static int mismatches; // allow 0 mismatches by default
63 static char *patDumpfile; // filename to dump patterns to
64 static bool solexaQuals; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
65 static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
66 static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
67 static int maqLike; // do maq-like searching
68 static int seedLen; // seed length (changed in Maq 0.6.4 from 24)
69 static int seedMms; // # mismatches allowed in seed (maq's -n)
70 static int qualThresh; // max qual-weighted hamming dist (maq's -e)
71 static int maxBtsBetter; // max # backtracks allowed in half-and-half mode
72 static int maxBts; // max # backtracks allowed in half-and-half mode
73 static int nthreads; // number of pthreads operating concurrently
74 static output_types outType; // style of output
75 static bool randReadsNoSync; // true -> generate reads from per-thread random source
76 static int numRandomReads; // # random reads (see Random*PatternSource in pat.h)
77 static int lenRandomReads; // len of random reads (see Random*PatternSource in pat.h)
78 static bool noRefNames; // true -> print reference indexes; not names
79 static string dumpAlBase; // basename of same-format files to dump aligned reads to
80 static string dumpUnalBase; // basename of same-format files to dump unaligned reads to
81 static string dumpMaxBase; // basename of same-format files to dump reads with more than -m valid alignments to
82 static uint32_t khits; // number of hits per read; >1 is much slower
83 static uint32_t mhits; // don't report any hits if there are > mhits
84 static bool better; // true -> guarantee alignments from best possible stratum
85 static bool strata; // true -> don't stop at stratum boundaries
86 static bool refOut; // if true, alignments go to per-ref files
87 static int partitionSz; // output a partitioning key in first field
88 static bool noMaqRound; // true -> don't round quals to nearest 10 like maq
89 static bool useSpinlock; // false -> don't use of spinlocks even if they're #defines
90 static bool fileParallel; // separate threads read separate input files in parallel
91 static bool useShmem; // use shared memory to hold the index
92 static bool useMm; // use memory-mapped files to hold the index
93 static bool mmSweep; // sweep through memory-mapped files immediately after mapping
94 static bool stateful; // use stateful aligners
95 static uint32_t prefetchWidth; // number of reads to process in parallel w/ --stateful
96 static uint32_t minInsert; // minimum insert size (Maq = 0, SOAP = 400)
97 static uint32_t maxInsert; // maximum insert size (Maq = 250, SOAP = 600)
98 static bool mate1fw; // -1 mate aligns in fw orientation on fw strand
99 static bool mate2fw; // -2 mate aligns in rc orientation on fw strand
100 static bool mateFwSet; // true -> user set --ff/--fr/--rf
101 static uint32_t mixedThresh; // threshold for when to switch to paired-end mixed mode (see aligner.h)
102 static uint32_t mixedAttemptLim; // number of attempts to make in "mixed mode" before giving up on orientation
103 static bool dontReconcileMates; // suppress pairwise all-versus-all way of resolving mates
104 static uint32_t cacheLimit; // ranges w/ size > limit will be cached
105 static uint32_t cacheSize; // # words per range cache
106 static int offBase; // offsets are 0-based by default, but configurable
107 static bool tryHard; // set very high maxBts, mixedAttemptLim
108 static uint32_t skipReads; // # reads/read pairs to skip
109 static bool nofw; // don't align fw orientation of read
110 static bool norc; // don't align rc orientation of read
111 static bool strandFix; // attempt to fix strand bias
112 static bool randomizeQuals; // randomize quality values
113 static bool stats; // print performance stats
114 static int chunkPoolMegabytes; // max MB to dedicate to best-first search frames per thread
115 static int chunkSz; // size of single chunk disbursed by ChunkPool
116 static bool chunkVerbose; // have chunk allocator output status messages?
118 static int recalMaxCycle;
119 static int recalMaxQual;
120 static int recalQualShift;
122 static bool reportSe;
123 static const char * refMapFile; // file containing a map from index coordinates to another coordinate system
124 static const char * annotMapFile; // file containing a map from reference coordinates to annotations
125 static size_t fastaContLen;
126 static size_t fastaContFreq;
127 static bool hadoopOut; // print Hadoop status and summary messages
130 static bool samNoHead; // don't print any header lines in SAM output
131 static bool samNoSQ; // don't print @SQ header lines
132 bool color; // true -> inputs are colorspace
133 bool colorExEnds; // true -> nucleotides on either end of decoded cspace alignment should be excluded
134 static string rgs; // SAM outputs for @RG header line
135 int snpPhred; // probability of SNP, for scoring colorspace alignments
136 static Bitset suppressOuts(64); // output fields to suppress
137 static bool sampleMax; // whether to report a random alignment when maxed-out via -m/-M
138 static int defaultMapq; // default mapping quality to print in SAM mode
139 bool colorSeq; // true -> show colorspace alignments as colors, not decoded bases
140 bool colorQual; // true -> show colorspace qualities as original quals, not decoded quals
141 static bool printCost; // true -> print stratum and cost
143 static vector<string> qualities;
144 static vector<string> qualities1;
145 static vector<string> qualities2;
148 static void resetOptions() {
152 adjustedEbwtFileBase = "";
156 sanityCheck = 0; // enable expensive sanity checks
157 format = FASTQ; // default read format is FASTQ
158 origString = ""; // reference text, or filename(s)
159 seed = 0; // srandom() seed
160 timing = 0; // whether to report basic timing data
161 allHits = false; // for multihits, report just one
162 rangeMode = false; // report BWT ranges instead of ref locs
163 showVersion = 0; // just print version and quit?
164 ipause = 0; // pause before maching?
165 qUpto = 0xffffffff; // max # of queries to read
166 trim5 = 0; // amount to trim from 5' end
167 trim3 = 0; // amount to trim from 3' end
168 reportOpps = 0; // whether to report # of other mappings
169 offRate = -1; // keep default offRate
170 isaRate = -1; // keep default isaRate
171 mismatches = 0; // allow 0 mismatches by default
172 patDumpfile = NULL; // filename to dump patterns to
173 solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
174 phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33)
175 integerQuals = false; // quality strings are space-separated strings of integers, not ASCII
176 maqLike = 1; // do maq-like searching
177 seedLen = 28; // seed length (changed in Maq 0.6.4 from 24)
178 seedMms = 2; // # mismatches allowed in seed (maq's -n)
179 qualThresh = 70; // max qual-weighted hamming dist (maq's -e)
180 maxBtsBetter = 125; // max # backtracks allowed in half-and-half mode
181 maxBts = 800; // max # backtracks allowed in half-and-half mode
182 nthreads = 1; // number of pthreads operating concurrently
183 outType = OUTPUT_FULL; // style of output
184 randReadsNoSync = false; // true -> generate reads from per-thread random source
185 numRandomReads = 50000000; // # random reads (see Random*PatternSource in pat.h)
186 lenRandomReads = 35; // len of random reads (see Random*PatternSource in pat.h)
187 noRefNames = false; // true -> print reference indexes; not names
188 dumpAlBase = ""; // basename of same-format files to dump aligned reads to
189 dumpUnalBase = ""; // basename of same-format files to dump unaligned reads to
190 dumpMaxBase = ""; // basename of same-format files to dump reads with more than -m valid alignments to
191 khits = 1; // number of hits per read; >1 is much slower
192 mhits = 0xffffffff; // don't report any hits if there are > mhits
193 better = false; // true -> guarantee alignments from best possible stratum
194 strata = false; // true -> don't stop at stratum boundaries
195 refOut = false; // if true, alignments go to per-ref files
196 partitionSz = 0; // output a partitioning key in first field
197 noMaqRound = false; // true -> don't round quals to nearest 10 like maq
198 useSpinlock = true; // false -> don't use of spinlocks even if they're #defines
199 fileParallel = false; // separate threads read separate input files in parallel
200 useShmem = false; // use shared memory to hold the index
201 useMm = false; // use memory-mapped files to hold the index
202 mmSweep = false; // sweep through memory-mapped files immediately after mapping
203 stateful = false; // use stateful aligners
204 prefetchWidth = 1; // number of reads to process in parallel w/ --stateful
205 minInsert = 0; // minimum insert size (Maq = 0, SOAP = 400)
206 maxInsert = 250; // maximum insert size (Maq = 250, SOAP = 600)
207 mate1fw = true; // -1 mate aligns in fw orientation on fw strand
208 mate2fw = false; // -2 mate aligns in rc orientation on fw strand
209 mateFwSet = false; // true -> user set mate1fw/mate2fw with --ff/--fr/--rf
210 mixedThresh = 4; // threshold for when to switch to paired-end mixed mode (see aligner.h)
211 mixedAttemptLim = 100; // number of attempts to make in "mixed mode" before giving up on orientation
212 dontReconcileMates = true; // suppress pairwise all-versus-all way of resolving mates
213 cacheLimit = 5; // ranges w/ size > limit will be cached
214 cacheSize = 0; // # words per range cache
215 offBase = 0; // offsets are 0-based by default, but configurable
216 tryHard = false; // set very high maxBts, mixedAttemptLim
217 skipReads = 0; // # reads/read pairs to skip
218 nofw = false; // don't align fw orientation of read
219 norc = false; // don't align rc orientation of read
220 strandFix = true; // attempt to fix strand bias
221 randomizeQuals = false; // randomize quality values
222 stats = false; // print performance stats
223 chunkPoolMegabytes = 64; // max MB to dedicate to best-first search frames per thread
224 chunkSz = 256; // size of single chunk disbursed by ChunkPool (in KB)
225 chunkVerbose = false; // have chunk allocator output status messages?
232 refMapFile = NULL; // file containing a map from index coordinates to another coordinate system
233 annotMapFile = NULL; // file containing a map from reference coordinates to annotations
236 hadoopOut = false; // print Hadoop status and summary messages
237 fuzzy = false; // reads will have alternate basecalls w/ qualities
238 fullRef = false; // print entire reference name instead of just up to 1st space
239 samNoHead = false; // don't print any header lines in SAM output
240 samNoSQ = false; // don't print @SQ header lines
241 color = false; // don't align in colorspace by default
242 colorExEnds = true; // true -> nucleotides on either end of decoded cspace alignment should be excluded
243 rgs = ""; // SAM outputs for @RG header line
244 snpPhred = 30; // probability of SNP, for scoring colorspace alignments
245 suppressOuts.clear(); // output fields to suppress
248 colorSeq = false; // true -> show colorspace alignments as colors, not decoded bases
249 colorQual = false; // true -> show colorspace qualities as original quals, not decoded quals
250 printCost = false; // true -> print cost and stratum
251 showSeed = false; // true -> print per-read pseudo-random seed
258 // mating constraints
260 static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:M:1:2:I:X:x:B:ySCQ:";
274 ARG_RANDOM_READS_NOSYNC,
344 static struct option long_options[] = {
345 {(char*)"verbose", no_argument, 0, ARG_VERBOSE},
346 {(char*)"startverbose", no_argument, 0, ARG_STARTVERBOSE},
347 {(char*)"quiet", no_argument, 0, ARG_QUIET},
348 {(char*)"sanity", no_argument, 0, ARG_SANITY},
349 {(char*)"pause", no_argument, &ipause, 1},
350 {(char*)"orig", required_argument, 0, ARG_ORIG},
351 {(char*)"all", no_argument, 0, 'a'},
352 {(char*)"concise", no_argument, 0, ARG_CONCISE},
353 {(char*)"noout", no_argument, 0, ARG_NOOUT},
354 {(char*)"solexa-quals", no_argument, 0, ARG_SOLEXA_QUALS},
355 {(char*)"integer-quals",no_argument, 0, ARG_integerQuals},
356 {(char*)"time", no_argument, 0, 't'},
357 {(char*)"trim3", required_argument, 0, '3'},
358 {(char*)"trim5", required_argument, 0, '5'},
359 {(char*)"seed", required_argument, 0, ARG_SEED},
360 {(char*)"qupto", required_argument, 0, 'u'},
361 {(char*)"al", required_argument, 0, ARG_AL},
362 {(char*)"un", required_argument, 0, ARG_UN},
363 {(char*)"max", required_argument, 0, ARG_MAXDUMP},
364 {(char*)"offrate", required_argument, 0, 'o'},
365 {(char*)"isarate", required_argument, 0, ARG_ISARATE},
366 {(char*)"reportopps", no_argument, &reportOpps, 1},
367 {(char*)"version", no_argument, &showVersion, 1},
368 {(char*)"dumppats", required_argument, 0, ARG_DUMP_PATS},
369 {(char*)"maqerr", required_argument, 0, 'e'},
370 {(char*)"seedlen", required_argument, 0, 'l'},
371 {(char*)"seedmms", required_argument, 0, 'n'},
372 {(char*)"filepar", no_argument, 0, ARG_FILEPAR},
373 {(char*)"help", no_argument, 0, 'h'},
374 {(char*)"threads", required_argument, 0, 'p'},
375 {(char*)"khits", required_argument, 0, 'k'},
376 {(char*)"mhits", required_argument, 0, 'm'},
377 {(char*)"minins", required_argument, 0, 'I'},
378 {(char*)"maxins", required_argument, 0, 'X'},
379 {(char*)"quals", required_argument, 0, 'Q'},
380 {(char*)"Q1", required_argument, 0, ARG_QUALS1},
381 {(char*)"Q2", required_argument, 0, ARG_QUALS2},
382 {(char*)"best", no_argument, 0, ARG_BEST},
383 {(char*)"better", no_argument, 0, ARG_BETTER},
384 {(char*)"oldbest", no_argument, 0, ARG_OLDBEST},
385 {(char*)"strata", no_argument, 0, ARG_STRATA},
386 {(char*)"nomaqround", no_argument, 0, ARG_NOMAQROUND},
387 {(char*)"refidx", no_argument, 0, ARG_REFIDX},
388 {(char*)"range", no_argument, 0, ARG_RANGE},
389 {(char*)"maxbts", required_argument, 0, ARG_MAXBTS},
390 {(char*)"randread", no_argument, 0, ARG_RANDOM_READS},
391 {(char*)"randreadnosync", no_argument, 0, ARG_RANDOM_READS_NOSYNC},
392 {(char*)"phased", no_argument, 0, 'z'},
393 {(char*)"refout", no_argument, 0, ARG_REFOUT},
394 {(char*)"partition", required_argument, 0, ARG_PARTITION},
395 {(char*)"nospin", no_argument, 0, ARG_USE_SPINLOCK},
396 {(char*)"stateful", no_argument, 0, ARG_STATEFUL},
397 {(char*)"prewidth", required_argument, 0, ARG_PREFETCH_WIDTH},
398 {(char*)"ff", no_argument, 0, ARG_FF},
399 {(char*)"fr", no_argument, 0, ARG_FR},
400 {(char*)"rf", no_argument, 0, ARG_RF},
401 {(char*)"mixthresh", required_argument, 0, 'x'},
402 {(char*)"pairtries", required_argument, 0, ARG_MIXED_ATTEMPTS},
403 {(char*)"noreconcile", no_argument, 0, ARG_NO_RECONCILE},
404 {(char*)"cachelim", required_argument, 0, ARG_CACHE_LIM},
405 {(char*)"cachesz", required_argument, 0, ARG_CACHE_SZ},
406 {(char*)"nofw", no_argument, 0, ARG_NO_FW},
407 {(char*)"norc", no_argument, 0, ARG_NO_RC},
408 {(char*)"offbase", required_argument, 0, 'B'},
409 {(char*)"tryhard", no_argument, 0, 'y'},
410 {(char*)"skip", required_argument, 0, 's'},
411 {(char*)"strandfix", no_argument, 0, ARG_STRAND_FIX},
412 {(char*)"randquals", no_argument, 0, ARG_RANDOMIZE_QUALS},
413 {(char*)"stats", no_argument, 0, ARG_STATS},
414 {(char*)"12", required_argument, 0, ARG_ONETWO},
415 {(char*)"phred33-quals", no_argument, 0, ARG_PHRED33},
416 {(char*)"phred64-quals", no_argument, 0, ARG_PHRED64},
417 {(char*)"solexa1.3-quals", no_argument, 0, ARG_PHRED64},
418 {(char*)"chunkmbs", required_argument, 0, ARG_CHUNKMBS},
419 {(char*)"chunksz", required_argument, 0, ARG_CHUNKSZ},
420 {(char*)"chunkverbose", no_argument, 0, ARG_CHUNKVERBOSE},
421 {(char*)"mm", no_argument, 0, ARG_MM},
422 {(char*)"shmem", no_argument, 0, ARG_SHMEM},
423 {(char*)"mmsweep", no_argument, 0, ARG_MMSWEEP},
424 {(char*)"recal", no_argument, 0, ARG_RECAL},
425 {(char*)"pev2", no_argument, 0, ARG_PEV2},
426 {(char*)"chainout", no_argument, 0, ARG_CHAINOUT},
427 {(char*)"chainin", no_argument, 0, ARG_CHAININ},
428 {(char*)"refmap", required_argument, 0, ARG_REFMAP},
429 {(char*)"annotmap", required_argument, 0, ARG_ANNOTMAP},
430 {(char*)"reportse", no_argument, 0, ARG_REPORTSE},
431 {(char*)"hadoopout", no_argument, 0, ARG_HADOOPOUT},
432 {(char*)"fuzzy", no_argument, 0, ARG_FUZZY},
433 {(char*)"fullref", no_argument, 0, ARG_FULLREF},
434 {(char*)"usage", no_argument, 0, ARG_USAGE},
435 {(char*)"sam", no_argument, 0, 'S'},
436 {(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD},
437 {(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ},
438 {(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ},
439 {(char*)"color", no_argument, 0, 'C'},
440 {(char*)"sam-RG", required_argument, 0, ARG_SAM_RG},
441 {(char*)"snpphred", required_argument, 0, ARG_SNPPHRED},
442 {(char*)"snpfrac", required_argument, 0, ARG_SNPFRAC},
443 {(char*)"suppress", required_argument, 0, ARG_SUPPRESS_FIELDS},
444 {(char*)"mapq", required_argument, 0, ARG_DEFAULT_MAPQ},
445 {(char*)"col-cseq", no_argument, 0, ARG_COLOR_SEQ},
446 {(char*)"col-cqual", no_argument, 0, ARG_COLOR_QUAL},
447 {(char*)"col-keepends", no_argument, 0, ARG_COLOR_KEEP_ENDS},
448 {(char*)"cost", no_argument, 0, ARG_COST},
449 {(char*)"showseed", no_argument, 0, ARG_SHOWSEED},
450 {(char*)0, 0, 0, 0} // terminator
454 * Print a summary usage message to the provided output stream.
456 static void printUsage(ostream& out) {
457 out << "Usage: " << endl
458 << " bowtie [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]" << endl
460 << " <m1> Comma-separated list of files containing upstream mates (or the" << endl
461 << " sequences themselves, if -c is set) paired with mates in <m2>" << endl
462 << " <m2> Comma-separated list of files containing downstream mates (or the" << endl
463 << " sequences themselves if -c is set) paired with mates in <m1>" << endl
464 << " <r> Comma-separated list of files containing Crossbow-style reads. Can be" << endl
465 << " a mixture of paired and unpaired. Specify \"-\" for stdin." << endl
466 << " <s> Comma-separated list of files containing unpaired reads, or the" << endl
467 << " sequences themselves, if -c is set. Specify \"-\" for stdin." << endl
468 << " <hit> File to write hits to (default: stdout)" << endl
470 << " -q query input files are FASTQ .fq/.fastq (default)" << endl
471 << " -f query input files are (multi-)FASTA .fa/.mfa" << endl
472 << " -r query input files are raw one-sequence-per-line" << endl
473 << " -c query sequences given on cmd line (as <mates>, <singles>)" << endl
474 << " -C reads and index are in colorspace" << endl
475 << " -Q/--quals <file> QV file(s) corresponding to CSFASTA inputs; use with -f -C" << endl
476 << " --Q1/--Q2 <file> same as -Q, but for mate files 1 and 2 respectively" << endl
477 << " -s/--skip <int> skip the first <int> reads/pairs in the input" << endl
478 << " -u/--qupto <int> stop after first <int> reads/pairs (excl. skipped reads)" << endl
479 << " -5/--trim5 <int> trim <int> bases from 5' (left) end of reads" << endl
480 << " -3/--trim3 <int> trim <int> bases from 3' (right) end of reads" << endl
481 << " --phred33-quals input quals are Phred+33 (default)" << endl
482 << " --phred64-quals input quals are Phred+64 (same as --solexa1.3-quals)" << endl
483 << " --solexa-quals input quals are from GA Pipeline ver. < 1.3" << endl
484 << " --solexa1.3-quals input quals are from GA Pipeline ver. >= 1.3" << endl
485 << " --integer-quals qualities are given as space-separated integers (not ASCII)" << endl
486 << "Alignment:" << endl
487 << " -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities" << endl
489 << " -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)" << endl
490 << " -e/--maqerr <int> max sum of mismatch quals across alignment for -n (def: 70)" << endl
491 << " -l/--seedlen <int> seed length for -n (default: 28)" << endl
492 << " --nomaqround disable Maq-like quality rounding for -n (nearest 10 <= 30)" << endl
493 << " -I/--minins <int> minimum insert size for paired-end alignment (default: 0)" << endl
494 << " -X/--maxins <int> maximum insert size for paired-end alignment (default: 250)" << endl
495 << " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)" << endl
496 << " --nofw/--norc do not align to forward/reverse-complement reference strand" << endl
497 << " --maxbts <int> max # backtracks for -n 2/3 (default: 125, 800 for --best)" << endl
498 << " --pairtries <int> max # attempts to find mate for anchor hit (default: 100)" << endl
499 << " -y/--tryhard try hard to find valid alignments, at the expense of speed" << endl
500 << " --chunkmbs <int> max megabytes of RAM for best-first search frames (def: 64)" << endl
501 << "Reporting:" << endl
502 << " -k <int> report up to <int> good alignments per read (default: 1)" << endl
503 << " -a/--all report all alignments per read (much slower than low -k)" << endl
504 << " -m <int> suppress all alignments if > <int> exist (def: no limit)" << endl
505 << " -M <int> like -m, but reports 1 random hit (MAPQ=0); requires --best" << endl
506 << " --best hits guaranteed best stratum; ties broken by quality" << endl
507 << " --strata hits in sub-optimal strata aren't reported (requires --best)" << endl
509 << " -t/--time print wall-clock time taken by search phases" << endl
510 << " -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)" << endl
511 << " --quiet print nothing but the alignments" << endl
512 << " --refout write alignments to files refXXXXX.map, 1 map per reference" << endl
513 << " --refidx refer to ref. seqs by 0-based index rather than name" << endl
514 << " --al <fname> write aligned reads/pairs to file(s) <fname>" << endl
515 << " --un <fname> write unaligned reads/pairs to file(s) <fname>" << endl
516 << " --max <fname> write reads/pairs over -m limit to file(s) <fname>" << endl
517 << " --suppress <cols> suppresses given columns (comma-delim'ed) in default output" << endl
518 << " --fullref write entire ref name (default: only up to 1st space)" << endl
519 << "Colorspace:" << endl
520 << " --snpphred <int> Phred penalty for SNP when decoding colorspace (def: 30)" << endl
522 << " --snpfrac <dec> approx. fraction of SNP bases (e.g. 0.001); sets --snpphred" << endl
523 << " --col-cseq print aligned colorspace seqs as colors, not decoded bases" << endl
524 << " --col-cqual print original colorspace quals, not decoded quals" << endl
525 << " --col-keepends keep nucleotides at extreme ends of decoded alignment" << endl
527 << " -S/--sam write hits in SAM format" << endl
528 << " --mapq <int> default mapping quality (MAPQ) to print for SAM alignments" << endl
529 << " --sam-nohead supppress header lines (starting with @) for SAM output" << endl
530 << " --sam-nosq supppress @SQ header lines for SAM output" << endl
531 << " --sam-RG <text> add <text> (usually \"lab=value\") to @RG line of SAM header" << endl
532 << "Performance:" << endl
533 << " -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
534 #ifdef BOWTIE_PTHREADS
535 << " -p/--threads <int> number of alignment threads to launch (default: 1)" << endl
538 << " --mm use memory-mapped I/O for index; many 'bowtie's can share" << endl
540 #ifdef BOWTIE_SHARED_MEM
541 << " --shmem use shared mem for index; many 'bowtie's can share" << endl
544 << " --seed <int> seed for random number generator" << endl
545 << " --verbose verbose output (for debugging)" << endl
546 << " --version print version information and quit" << endl
547 << " -h/--help print this usage message" << endl
552 * Parse an int out of optarg and enforce that it be at least 'lower';
553 * if it is less than 'lower', than output the given error message and
554 * exit with an error and a usage message.
556 static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
559 l = strtol(arg, &endPtr, 10);
560 if (endPtr != NULL) {
561 if (l < lower || l > upper) {
562 cerr << errmsg << endl;
568 cerr << errmsg << endl;
575 * Parse from optarg by default.
577 static int parseInt(int lower, const char *errmsg) {
578 return parseInt(lower, INT_MAX, errmsg, optarg);
582 * Upper is INT_MAX by default.
584 static int parseInt(int lower, const char *errmsg, const char *arg) {
585 return parseInt(lower, INT_MAX, errmsg, arg);
589 * Upper is INT_MAX, parse from optarg by default.
591 static int parseInt(int lower, int upper, const char *errmsg) {
592 return parseInt(lower, upper, errmsg, optarg);
596 * Parse a T string 'str'.
599 T parse(const char *s) {
607 * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
610 pair<T, T> parsePair(const char *str, char delim) {
613 tokenize(s, delim, ss);
615 ret.first = parse<T>(ss[0].c_str());
616 ret.second = parse<T>(ss[1].c_str());
621 * Read command-line arguments
623 static void parseOptions(int argc, const char **argv) {
624 int option_index = 0;
626 if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
628 next_option = getopt_long(
629 argc, const_cast<char**>(argv),
630 short_options, long_options, &option_index);
631 switch (next_option) {
632 case '1': tokenize(optarg, ",", mates1); break;
633 case '2': tokenize(optarg, ",", mates2); break;
634 case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
635 case 'f': format = FASTA; break;
638 pair<size_t, size_t> p = parsePair<size_t>(optarg, ',');
639 fastaContLen = p.first;
640 fastaContFreq = p.second;
643 case 'q': format = FASTQ; break;
644 case 'r': format = RAW; break;
645 case 'c': format = CMDLINE; break;
646 case 'C': color = true; break;
647 case ARG_CHAININ: format = INPUT_CHAIN; break;
649 minInsert = (uint32_t)parseInt(0, "-I arg must be positive");
652 maxInsert = (uint32_t)parseInt(1, "-X arg must be at least 1");
655 skipReads = (uint32_t)parseInt(0, "-s arg must be positive");
657 case ARG_FF: mate1fw = true; mate2fw = true; mateFwSet = true; break;
658 case ARG_RF: mate1fw = false; mate2fw = true; mateFwSet = true; break;
659 case ARG_FR: mate1fw = true; mate2fw = false; mateFwSet = true; break;
660 case ARG_RANDOM_READS: format = RANDOM; break;
661 case ARG_RANDOM_READS_NOSYNC:
663 randReadsNoSync = true;
665 case ARG_RANGE: rangeMode = true; break;
666 case ARG_CONCISE: outType = OUTPUT_CONCISE; break;
667 case ARG_CHAINOUT: outType = OUTPUT_CHAIN; break;
668 case 'S': outType = OUTPUT_SAM; break;
669 case ARG_REFOUT: refOut = true; break;
670 case ARG_NOOUT: outType = OUTPUT_NONE; break;
671 case ARG_REFMAP: refMapFile = optarg; break;
672 case ARG_ANNOTMAP: annotMapFile = optarg; break;
673 case ARG_USE_SPINLOCK: useSpinlock = false; break;
674 case ARG_SHMEM: useShmem = true; break;
675 case ARG_COLOR_SEQ: colorSeq = true; break;
676 case ARG_COLOR_QUAL: colorQual = true; break;
677 case ARG_SHOWSEED: showSeed = true; break;
678 case ARG_SUPPRESS_FIELDS: {
680 tokenize(optarg, ",", supp);
681 for(size_t i = 0; i < supp.size(); i++) {
682 int ii = parseInt(1, "--suppress arg must be at least 1", supp[i].c_str());
683 suppressOuts.set(ii-1);
692 cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
693 << "BOWTIE_MM defined. Memory-mapped I/O is not supported under Windows. If you" << endl
694 << "would like to use memory-mapped I/O on a platform that supports it, please" << endl
695 << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
699 case ARG_MMSWEEP: mmSweep = true; break;
700 case ARG_HADOOPOUT: hadoopOut = true; break;
701 case ARG_AL: dumpAlBase = optarg; break;
702 case ARG_UN: dumpUnalBase = optarg; break;
703 case ARG_MAXDUMP: dumpMaxBase = optarg; break;
704 case ARG_SOLEXA_QUALS: solexaQuals = true; break;
705 case ARG_integerQuals: integerQuals = true; break;
706 case ARG_PHRED64: phred64Quals = true; break;
707 case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
708 case ARG_NOMAQROUND: noMaqRound = true; break;
709 case ARG_COLOR_KEEP_ENDS: colorExEnds = false; break;
710 case ARG_SNPPHRED: snpPhred = parseInt(0, "--snpphred must be at least 0"); break;
712 double p = parse<double>(optarg);
714 cerr << "Error: --snpfrac parameter must be > 0.0" << endl;
717 p = (log10(p) * -10);
718 snpPhred = (int)(p + 0.5);
720 cout << "snpPhred: " << snpPhred << endl;
724 cerr << "Error: -z/--phased mode is no longer supported" << endl;
727 case ARG_REFIDX: noRefNames = true; break;
728 case ARG_STATEFUL: stateful = true; break;
729 case ARG_FUZZY: fuzzy = true; break;
730 case ARG_REPORTSE: reportSe = true; break;
731 case ARG_FULLREF: fullRef = true; break;
732 case ARG_PREFETCH_WIDTH:
733 prefetchWidth = parseInt(1, "--prewidth must be at least 1");
736 offBase = parseInt(-999999, "-B/--offbase cannot be a large negative number");
739 seed = parseInt(0, "--seed arg must be at least 0");
742 qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1");
745 khits = (uint32_t)parseInt(1, "-k arg must be at least 1");
748 tokenize(optarg, ",", qualities);
752 tokenize(optarg, ",", qualities1);
756 tokenize(optarg, ",", qualities2);
762 mhits = (uint32_t)parseInt(1, "-m arg must be at least 1");
765 mixedThresh = (uint32_t)parseInt(0, "-x arg must be at least 0");
767 case ARG_MIXED_ATTEMPTS:
768 mixedAttemptLim = (uint32_t)parseInt(1, "--mixatt arg must be at least 1");
771 cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1");
774 cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1");
775 cacheSize *= (1024 * 1024); // convert from MB to B
777 case ARG_NO_RECONCILE:
778 dontReconcileMates = true;
781 #ifndef BOWTIE_PTHREADS
782 cerr << "-p/--threads is disabled because bowtie was not compiled with pthreads support" << endl;
785 nthreads = parseInt(1, "-p/--threads arg must be at least 1");
788 #ifndef BOWTIE_PTHREADS
789 cerr << "--filepar is disabled because bowtie was not compiled with pthreads support" << endl;
796 mismatches = parseInt(0, 3, "-v arg must be at least 0 and at most 3");
798 case '3': trim3 = parseInt(0, "-3/--trim3 arg must be at least 0"); break;
799 case '5': trim5 = parseInt(0, "-5/--trim5 arg must be at least 0"); break;
800 case 'o': offRate = parseInt(1, "-o/--offrate arg must be at least 1"); break;
801 case ARG_ISARATE: isaRate = parseInt(0, "--isarate arg must be at least 0"); break;
802 case 'e': qualThresh = parseInt(1, "-e/--err arg must be at least 1"); break;
803 case 'n': seedMms = parseInt(0, 3, "-n/--seedmms arg must be at least 0 and at most 3"); maqLike = 1; break;
804 case 'l': seedLen = parseInt(5, "-l/--seedlen arg must be at least 5"); break;
805 case 'h': printUsage(cout); throw 0; break;
806 case ARG_USAGE: printUsage(cout); throw 0; break;
807 case 'a': allHits = true; break;
808 case 'y': tryHard = true; break;
809 case ARG_RECAL: recal = true; break;
810 case ARG_CHUNKMBS: chunkPoolMegabytes = parseInt(1, "--chunkmbs arg must be at least 1"); break;
811 case ARG_CHUNKSZ: chunkSz = parseInt(1, "--chunksz arg must be at least 1"); break;
812 case ARG_CHUNKVERBOSE: chunkVerbose = true; break;
813 case ARG_BETTER: stateful = true; better = true; break;
814 case ARG_BEST: stateful = true; useV1 = false; break;
815 case ARG_STRATA: strata = true; break;
816 case ARG_VERBOSE: verbose = true; break;
817 case ARG_STARTVERBOSE: startVerbose = true; break;
818 case ARG_QUIET: quiet = true; break;
819 case ARG_SANITY: sanityCheck = true; break;
820 case 't': timing = true; break;
821 case ARG_NO_FW: nofw = true; break;
822 case ARG_NO_RC: norc = true; break;
823 case ARG_STATS: stats = true; break;
824 case ARG_PEV2: useV1 = false; break;
825 case ARG_SAM_NOHEAD: samNoHead = true; break;
826 case ARG_SAM_NOSQ: samNoSQ = true; break;
828 if(!rgs.empty()) rgs += '\t';
832 case ARG_COST: printCost = true; break;
833 case ARG_DEFAULT_MAPQ:
834 defaultMapq = parseInt(0, "--mapq must be positive");
837 maxBts = parseInt(0, "--maxbts must be positive");
838 maxBtsBetter = maxBts;
841 case ARG_DUMP_PATS: patDumpfile = optarg; break;
842 case ARG_STRAND_FIX: strandFix = true; break;
843 case ARG_RANDOMIZE_QUALS: randomizeQuals = true; break;
844 case ARG_PARTITION: partitionSz = parse<int>(optarg); break;
846 if(optarg == NULL || strlen(optarg) == 0) {
847 cerr << "--orig arg must be followed by a string" << endl;
853 case -1: break; /* Done with options. */
855 if (long_options[option_index].flag != 0)
861 } while(next_option != -1);
862 bool paired = mates1.size() > 0 || mates2.size() > 0 || mates12.size() > 0;
864 // Tell the Ebwt loader to ignore the suffix-array portion of
865 // the index. We don't need it because the user isn't asking
866 // for bowtie to report reference positions (just matrix
870 if(!maqLike && mismatches == 3) {
871 // Much faster than normal 3-mismatch mode
874 if(mates1.size() != mates2.size()) {
875 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
876 << "mate files/sequences were specified with -2. The same number of mate files/" << endl
877 << "sequences must be specified with -1 and -2." << endl;
880 if(qualities.size() && format != FASTA) {
881 cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
882 << "enabled. -Q works only in combination with -f and -C." << endl;
885 if(qualities.size() && !color) {
886 cerr << "Error: one or more quality files were specified with -Q but -C was not" << endl
887 << "enabled. -Q works only in combination with -f and -C." << endl;
890 if(qualities1.size() && format != FASTA) {
891 cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
892 << "enabled. --Q1 works only in combination with -f and -C." << endl;
895 if(qualities1.size() && !color) {
896 cerr << "Error: one or more quality files were specified with --Q1 but -C was not" << endl
897 << "enabled. --Q1 works only in combination with -f and -C." << endl;
900 if(qualities2.size() && format != FASTA) {
901 cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
902 << "enabled. --Q2 works only in combination with -f and -C." << endl;
905 if(qualities2.size() && !color) {
906 cerr << "Error: one or more quality files were specified with --Q2 but -C was not" << endl
907 << "enabled. --Q2 works only in combination with -f and -C." << endl;
910 if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
911 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
912 << "quality files were specified with --Q1. The same number of mate and quality" << endl
913 << "files must sequences must be specified with -1 and --Q1." << endl;
916 if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
917 cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
918 << "quality files were specified with --Q2. The same number of mate and quality" << endl
919 << "files must sequences must be specified with -2 and --Q2." << endl;
922 // Check for duplicate mate input files
923 if(format != CMDLINE) {
924 for(size_t i = 0; i < mates1.size(); i++) {
925 for(size_t j = 0; j < mates2.size(); j++) {
926 if(mates1[i] == mates2[j] && !quiet) {
927 cerr << "Warning: Same mate file \"" << mates1[i] << "\" appears as argument to both -1 and -2" << endl;
933 // Increase backtracking limit to huge number
934 maxBts = maxBtsBetter = INT_MAX;
935 // Increase number of paired-end scan attempts to huge number
936 mixedAttemptLim = UINT_MAX;
938 if(!stateful && sampleMax) {
940 cerr << "Warning: -M was specified w/o --best; automatically enabling --best" << endl;
944 if(strata && !stateful) {
945 cerr << "--strata must be combined with --best" << endl;
948 if(strata && !allHits && khits == 1 && mhits == 0xffffffff) {
949 cerr << "--strata has no effect unless combined with -k, -m or -a" << endl;
952 if(fuzzy && (!stateful && !paired)) {
953 cerr << "--fuzzy must be combined with --best or paired-end alignment" << endl;
956 // If both -s and -u are used, we need to adjust qUpto accordingly
957 // since it uses patid to know if we've reached the -u limit (and
958 // patids are all shifted up by skipReads characters)
959 if(qUpto + skipReads > qUpto) {
962 if(useShmem && useMm && !quiet) {
963 cerr << "Warning: --shmem overrides --mm..." << endl;
966 if(snpPhred <= 10 && color && !quiet) {
967 cerr << "Warning: the colorspace SNP penalty (--snpphred) is very low: " << snpPhred << endl;
969 if(format == INPUT_CHAIN) {
972 cerr << "Error: --chainin must be combined with --best; aborting..." << endl;
976 cerr << "Error: --chainin cannot be combined with paired-end alignment; aborting..." << endl;
982 if(outType == OUTPUT_CHAIN) {
985 cerr << "Error: --chainout is not compatible with --refout; aborting..." << endl;
989 cerr << "Error: --chainout must be combined with --best; aborting..." << endl;
993 cerr << "Error: --chainout cannot be combined with paired-end alignment; aborting..." << endl;
998 if(outType == OUTPUT_SAM && refOut) {
999 cerr << "Error: --refout cannot be combined with -S/--sam" << endl;
1004 // Set colorspace default (--ff)
1008 // Set nucleotide space default (--fr)
1013 if(outType != OUTPUT_FULL && suppressOuts.count() > 0 && !quiet) {
1014 cerr << "Warning: Ignoring --suppress because output type is not default." << endl;
1015 cerr << " --suppress is only available for the default output type." << endl;
1016 suppressOuts.clear();
1020 static const char *argv0 = NULL;
1022 #define FINISH_READ(p) \
1023 /* Don't do finishRead if the read isn't legit or if the read was skipped by the doneMask */ \
1025 sink->finishRead(*p, true, !skipped); \
1029 static inline void finishReadWithHitmask(PatternSourcePerThread* p,
1030 HitSinkPerThread* sink,
1031 SyncBitset& hitMask,
1035 /* Don't do finishRead if the read isn't legit */
1037 /* r = whether to consider reporting the read as unaligned */
1038 bool reportUnAl = r;
1040 /* If the done-mask already shows the read as done, */
1041 /* then we already reported the unaligned read and */
1042 /* should refrain from re-reporting*/
1043 reportUnAl = !skipped;
1045 /* If there hasn't been a hit reported, then report */
1046 /* read as unaligned */
1047 reportUnAl = !hitMask.test(p->patid());
1050 if(sink->finishRead(*p, true, reportUnAl) > 0) {
1051 /* We reported a hit for the read, so we set the */
1052 /* appropriate bit in the hitMask to prevent it from */
1053 /* being reported as unaligned. */
1054 if(!reportUnAl && sink->dumpsReads()) {
1055 hitMask.setOver(p->patid());
1062 /// Macro for getting the next read, possibly aborting depending on
1063 /// whether the result is empty or the patid exceeds the limit, and
1064 /// marshaling the read into convenient variables.
1065 #define GET_READ(p) \
1066 p->nextReadPair(); \
1067 if(p->empty() || p->patid() >= qUpto) { \
1068 p->bufa().clearAll(); \
1071 assert(!empty(p->bufa().patFw)); \
1072 String<Dna5>& patFw = p->bufa().patFw; \
1073 patFw.data_begin += 0; /* suppress "unused" compiler warning */ \
1074 String<Dna5>& patRc = p->bufa().patRc; \
1075 patRc.data_begin += 0; /* suppress "unused" compiler warning */ \
1076 String<char>& qual = p->bufa().qual; \
1077 qual.data_begin += 0; /* suppress "unused" compiler warning */ \
1078 String<char>& qualRev = p->bufa().qualRev; \
1079 qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1080 String<Dna5>& patFwRev = p->bufa().patFwRev; \
1081 patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1082 String<Dna5>& patRcRev = p->bufa().patRcRev; \
1083 patRcRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1084 String<char>& name = p->bufa().name; \
1085 name.data_begin += 0; /* suppress "unused" compiler warning */ \
1086 uint32_t patid = p->patid(); \
1087 params.setPatId(patid);
1089 /// Macro for getting the forward oriented version of next read,
1090 /// possibly aborting depending on whether the result is empty or the
1091 /// patid exceeds the limit, and marshaling the read into convenient
1093 #define GET_READ_FW(p) \
1094 p->nextReadPair(); \
1095 if(p->empty() || p->patid() >= qUpto) { \
1096 p->bufa().clearAll(); \
1099 params.setPatId(p->patid()); \
1100 assert(!empty(p->bufa().patFw)); \
1101 String<Dna5>& patFw = p->bufa().patFw; \
1102 patFw.data_begin += 0; /* suppress "unused" compiler warning */ \
1103 String<char>& qual = p->bufa().qual; \
1104 qual.data_begin += 0; /* suppress "unused" compiler warning */ \
1105 String<Dna5>& patFwRev = p->bufa().patFwRev; \
1106 patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1107 String<char>& qualRev = p->bufa().qualRev; \
1108 qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \
1109 String<char>& name = p->bufa().name; \
1110 name.data_begin += 0; /* suppress "unused" compiler warning */ \
1111 uint32_t patid = p->patid();
1113 #ifdef BOWTIE_PTHREADS
1114 #define WORKER_EXIT() \
1115 patsrcFact->destroy(patsrc); \
1116 delete patsrcFact; \
1117 sinkFact->destroy(sink); \
1120 pthread_exit(NULL); \
1124 #define WORKER_EXIT() \
1125 patsrcFact->destroy(patsrc); \
1126 delete patsrcFact; \
1127 sinkFact->destroy(sink); \
1132 #ifdef CHUD_PROFILING
1133 #define CHUD_START() chudStartRemotePerfMonitor("Bowtie");
1134 #define CHUD_STOP() chudStopRemotePerfMonitor();
1136 #define CHUD_START()
1140 /// Create a PatternSourcePerThread for the current thread according
1141 /// to the global params and return a pointer to it
1142 static PatternSourcePerThreadFactory*
1143 createPatsrcFactory(PairedPatternSource& _patsrc, int tid) {
1144 PatternSourcePerThreadFactory *patsrcFact;
1145 if(randReadsNoSync) {
1146 patsrcFact = new RandomPatternSourcePerThreadFactory(numRandomReads, lenRandomReads, nthreads, tid);
1148 patsrcFact = new WrappedPatternSourcePerThreadFactory(_patsrc);
1150 assert(patsrcFact != NULL);
1155 * Allocate a HitSinkPerThreadFactory on the heap according to the
1156 * global params and return a pointer to it.
1158 static HitSinkPerThreadFactory*
1159 createSinkFactory(HitSink& _sink) {
1160 HitSinkPerThreadFactory *sink = NULL;
1161 if(format == INPUT_CHAIN) {
1163 sink = new ChainingHitSinkPerThreadFactory(_sink, allHits ? 0xffffffff : khits, mhits, strata);
1164 } else if(!strata) {
1167 // First N good; "good" inherently ignores strata
1168 sink = new NGoodHitSinkPerThreadFactory(_sink, khits, mhits);
1170 // All hits, spanning strata
1171 sink = new AllHitSinkPerThreadFactory(_sink, mhits);
1178 // Buffer best hits, assuming they're arriving in best-
1180 sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, khits, mhits);
1183 // Buffer best hits, assuming they're arriving in best-
1185 sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, 0xffffffff/2, mhits);
1188 assert(sink != NULL);
1193 * Search through a single (forward) Ebwt index for exact end-to-end
1194 * hits. Assumes that index is already loaded into memory.
1196 static PairedPatternSource* exactSearch_patsrc;
1197 static HitSink* exactSearch_sink;
1198 static Ebwt<String<Dna> >* exactSearch_ebwt;
1199 static vector<String<Dna5> >* exactSearch_os;
1200 static BitPairReference* exactSearch_refs;
1201 static void *exactSearchWorker(void *vp) {
1202 int tid = *((int*)vp);
1203 PairedPatternSource& _patsrc = *exactSearch_patsrc;
1204 HitSink& _sink = *exactSearch_sink;
1205 Ebwt<String<Dna> >& ebwt = *exactSearch_ebwt;
1206 vector<String<Dna5> >& os = *exactSearch_os;
1207 const BitPairReference* refs = exactSearch_refs;
1209 // Per-thread initialization
1210 PatternSourcePerThreadFactory *patsrcFact = createPatsrcFactory(_patsrc, tid);
1211 PatternSourcePerThread *patsrc = patsrcFact->create();
1212 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1213 HitSinkPerThread* sink = sinkFact->create();
1214 EbwtSearchParams<String<Dna> > params(
1216 os, // reference sequences
1217 true, // read is forward
1218 true); // index is forward
1219 GreedyDFSRangeSource bt(
1221 refs, // reference sequence (for colorspace)
1222 0xffffffff, // qualThresh
1223 0xffffffff, // max backtracks (no max)
1224 0, // reportPartials (don't)
1225 true, // reportExacts
1226 rangeMode, // reportRanges
1231 false); // considerQuals
1232 bool skipped = false;
1234 FINISH_READ(patsrc);
1236 #include "search_exact.c"
1238 FINISH_READ(patsrc);
1243 * A statefulness-aware worker driver. Uses UnpairedExactAlignerV1.
1245 static void *exactSearchWorkerStateful(void *vp) {
1246 int tid = *((int*)vp);
1247 PairedPatternSource& _patsrc = *exactSearch_patsrc;
1248 HitSink& _sink = *exactSearch_sink;
1249 Ebwt<String<Dna> >& ebwt = *exactSearch_ebwt;
1250 vector<String<Dna5> >& os = *exactSearch_os;
1251 BitPairReference* refs = exactSearch_refs;
1253 // Global initialization
1254 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1255 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1257 ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1258 UnpairedExactAlignerV1Factory alSEfact(
1278 PairedExactAlignerV1Factory alPEfact(
1292 mhits, // for symCeiling
1309 MixedMultiAligner multi(
1317 // MultiAligner must be destroyed before patsrcFact
1323 #ifdef BOWTIE_PTHREADS
1324 if(tid > 0) pthread_exit(NULL);
1329 #define SET_A_FW(bt, p, params) \
1330 bt.setQuery(&p->bufa().patFw, &p->bufa().qual, &p->bufa().name); \
1332 #define SET_A_RC(bt, p, params) \
1333 bt.setQuery(&p->bufa().patRc, &p->bufa().qualRev, &p->bufa().name); \
1334 params.setFw(false);
1335 #define SET_B_FW(bt, p, params) \
1336 bt.setQuery(&p->bufb().patFw, &p->bufb().qual, &p->bufb().name); \
1338 #define SET_B_RC(bt, p, params) \
1339 bt.setQuery(&p->bufb().patRc, &p->bufb().qualRev, &p->bufb().name); \
1340 params.setFw(false);
1342 #ifdef BOWTIE_PTHREADS
1343 #define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED)
1347 * Search through a single (forward) Ebwt index for exact end-to-end
1348 * hits. Assumes that index is already loaded into memory.
1350 static void exactSearch(PairedPatternSource& _patsrc,
1352 Ebwt<String<Dna> >& ebwt,
1353 vector<String<Dna5> >& os)
1355 exactSearch_patsrc = &_patsrc;
1356 exactSearch_sink = &_sink;
1357 exactSearch_ebwt = &ebwt;
1358 exactSearch_os = &os;
1360 assert(!ebwt.isInMemory());
1362 // Load the rest of (vast majority of) the backward Ebwt into
1364 Timer _t(cerr, "Time loading forward index: ", timing);
1365 ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1368 BitPairReference *refs = NULL;
1369 bool pair = mates1.size() > 0 || mates12.size() > 0;
1370 if(color || (pair && mixedThresh < 0xffffffff)) {
1371 Timer _t(cerr, "Time loading reference: ", timing);
1372 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1373 if(!refs->loaded()) throw 1;
1375 exactSearch_refs = refs;
1377 #ifdef BOWTIE_PTHREADS
1378 AutoArray<pthread_t> threads(nthreads-1);
1379 AutoArray<int> tids(nthreads-1);
1383 Timer _t(cerr, "Time for 0-mismatch search: ", timing);
1384 #ifdef BOWTIE_PTHREADS
1385 for(int i = 0; i < nthreads-1; i++) {
1388 createThread(&threads[i],
1389 exactSearchWorkerStateful,
1392 createThread(&threads[i],
1399 if(stateful) exactSearchWorkerStateful((void*)&tmp);
1400 else exactSearchWorker((void*)&tmp);
1401 #ifdef BOWTIE_PTHREADS
1402 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1405 if(refs != NULL) delete refs;
1409 * Search through a pair of Ebwt indexes, one for the forward direction
1410 * and one for the backward direction, for exact end-to-end hits and 1-
1411 * mismatch end-to-end hits. In my experience, this is slightly faster
1412 * than Maq (default) mode with the -n 1 option.
1414 * Forward Ebwt (ebwtFw) is already loaded into memory and backward
1415 * Ebwt (ebwtBw) is not loaded into memory.
1417 static PairedPatternSource* mismatchSearch_patsrc;
1418 static HitSink* mismatchSearch_sink;
1419 static Ebwt<String<Dna> >* mismatchSearch_ebwtFw;
1420 static Ebwt<String<Dna> >* mismatchSearch_ebwtBw;
1421 static vector<String<Dna5> >* mismatchSearch_os;
1422 static SyncBitset* mismatchSearch_doneMask;
1423 static SyncBitset* mismatchSearch_hitMask;
1424 static BitPairReference* mismatchSearch_refs;
1427 * A statefulness-aware worker driver. Uses Unpaired/Paired1mmAlignerV1.
1429 static void *mismatchSearchWorkerFullStateful(void *vp) {
1430 int tid = *((int*)vp);
1431 PairedPatternSource& _patsrc = *mismatchSearch_patsrc;
1432 HitSink& _sink = *mismatchSearch_sink;
1433 Ebwt<String<Dna> >& ebwtFw = *mismatchSearch_ebwtFw;
1434 Ebwt<String<Dna> >& ebwtBw = *mismatchSearch_ebwtBw;
1435 vector<String<Dna5> >& os = *mismatchSearch_os;
1436 BitPairReference* refs = mismatchSearch_refs;
1438 // Global initialization
1439 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1440 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1441 ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1443 Unpaired1mmAlignerV1Factory alSEfact(
1463 Paired1mmAlignerV1Factory alPEfact(
1477 mhits, // for symCeiling
1494 MixedMultiAligner multi(
1502 // MultiAligner must be destroyed before patsrcFact
1508 #ifdef BOWTIE_PTHREADS
1509 if(tid > 0) pthread_exit(NULL);
1514 static void* mismatchSearchWorkerFull(void *vp){
1515 int tid = *((int*)vp);
1516 PairedPatternSource& _patsrc = *mismatchSearch_patsrc;
1517 HitSink& _sink = *mismatchSearch_sink;
1518 Ebwt<String<Dna> >& ebwtFw = *mismatchSearch_ebwtFw;
1519 Ebwt<String<Dna> >& ebwtBw = *mismatchSearch_ebwtBw;
1520 vector<String<Dna5> >& os = *mismatchSearch_os;
1521 const BitPairReference* refs = mismatchSearch_refs;
1523 // Per-thread initialization
1524 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1525 PatternSourcePerThread* patsrc = patsrcFact->create();
1526 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1527 HitSinkPerThread* sink = sinkFact->create();
1528 EbwtSearchParams<String<Dna> > params(
1529 *sink, // HitSinkPerThread
1530 os, // reference sequences
1531 true, // read is forward
1532 false); // index is mirror index
1533 GreedyDFSRangeSource bt(
1535 refs, // reference sequence (for colorspace)
1536 0xffffffff, // qualThresh
1537 0xffffffff, // max backtracks (no max)
1538 0, // reportPartials (don't)
1539 true, // reportExacts
1540 rangeMode, // reportRanges
1545 false); // considerQuals
1546 bool skipped = false;
1548 FINISH_READ(patsrc);
1550 uint32_t plen = length(patFw);
1552 uint32_t s3 = s >> 1; // length of 3' half of seed
1553 uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
1554 #define DONEMASK_SET(p)
1555 #include "search_1mm_phase1.c"
1556 #include "search_1mm_phase2.c"
1559 FINISH_READ(patsrc);
1564 * Search through a single (forward) Ebwt index for exact end-to-end
1565 * hits. Assumes that index is already loaded into memory.
1567 static void mismatchSearchFull(PairedPatternSource& _patsrc,
1569 Ebwt<String<Dna> >& ebwtFw,
1570 Ebwt<String<Dna> >& ebwtBw,
1571 vector<String<Dna5> >& os)
1573 mismatchSearch_patsrc = &_patsrc;
1574 mismatchSearch_sink = &_sink;
1575 mismatchSearch_ebwtFw = &ebwtFw;
1576 mismatchSearch_ebwtBw = &ebwtBw;
1577 mismatchSearch_doneMask = NULL;
1578 mismatchSearch_hitMask = NULL;
1579 mismatchSearch_os = &os;
1581 assert(!ebwtFw.isInMemory());
1582 assert(!ebwtBw.isInMemory());
1584 // Load the other half of the index into memory
1585 Timer _t(cerr, "Time loading forward index: ", timing);
1586 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1589 // Load the other half of the index into memory
1590 Timer _t(cerr, "Time loading mirror index: ", timing);
1591 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1593 // Create range caches, which are shared among all aligners
1594 BitPairReference *refs = NULL;
1595 bool pair = mates1.size() > 0 || mates12.size() > 0;
1596 if(color || (pair && mixedThresh < 0xffffffff)) {
1597 Timer _t(cerr, "Time loading reference: ", timing);
1598 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1599 if(!refs->loaded()) throw 1;
1601 mismatchSearch_refs = refs;
1603 #ifdef BOWTIE_PTHREADS
1604 // Allocate structures for threads
1605 AutoArray<pthread_t> threads(nthreads-1);
1606 AutoArray<int> tids(nthreads-1);
1610 Timer _t(cerr, "Time for 1-mismatch full-index search: ", timing);
1611 #ifdef BOWTIE_PTHREADS
1612 for(int i = 0; i < nthreads-1; i++) {
1615 createThread(&threads[i], mismatchSearchWorkerFullStateful, (void *)&tids[i]);
1617 createThread(&threads[i], mismatchSearchWorkerFull, (void *)&tids[i]);
1622 if(stateful) mismatchSearchWorkerFullStateful((void*)&tmp);
1623 else mismatchSearchWorkerFull((void*)&tmp);
1624 #ifdef BOWTIE_PTHREADS
1625 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1628 if(refs != NULL) delete refs;
1631 #define SWITCH_TO_FW_INDEX() { \
1632 /* Evict the mirror index from memory if necessary */ \
1633 if(ebwtBw.isInMemory()) ebwtBw.evictFromMemory(); \
1634 assert(!ebwtBw.isInMemory()); \
1635 /* Load the forward index into memory if necessary */ \
1636 if(!ebwtFw.isInMemory()) { \
1637 Timer _t(cerr, "Time loading forward index: ", timing); \
1638 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose); \
1640 assert(ebwtFw.isInMemory()); \
1641 _patsrc.reset(); /* rewind pattern source to first pattern */ \
1644 #define SWITCH_TO_BW_INDEX() { \
1645 /* Evict the forward index from memory if necessary */ \
1646 if(ebwtFw.isInMemory()) ebwtFw.evictFromMemory(); \
1647 assert(!ebwtFw.isInMemory()); \
1648 /* Load the forward index into memory if necessary */ \
1649 if(!ebwtBw.isInMemory()) { \
1650 Timer _t(cerr, "Time loading mirror index: ", timing); \
1651 ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); \
1653 assert(ebwtBw.isInMemory()); \
1654 _patsrc.reset(); /* rewind pattern source to first pattern */ \
1657 #define ASSERT_NO_HITS_FW(ebwtfw) \
1658 if(sanityCheck && os.size() > 0) { \
1660 uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
1661 uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \
1662 uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \
1663 uint32_t unrevOff = (seedMms == 0) ? s : 0; \
1664 if(hits.size() > 0) { \
1665 /* Print offending hit obtained by oracle */ \
1675 ebwtfw); /* ebwtFw */ \
1677 assert_eq(0, hits.size()); \
1680 #define ASSERT_NO_HITS_RC(ebwtfw) \
1681 if(sanityCheck && os.size() > 0) { \
1683 uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
1684 uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \
1685 uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \
1686 uint32_t unrevOff = (seedMms == 0) ? s : 0; \
1687 if(hits.size() > 0) { \
1688 /* Print offending hit obtained by oracle */ \
1698 ebwtfw); /* ebwtFw */ \
1700 assert_eq(0, hits.size()); \
1703 static PairedPatternSource* twoOrThreeMismatchSearch_patsrc;
1704 static HitSink* twoOrThreeMismatchSearch_sink;
1705 static Ebwt<String<Dna> >* twoOrThreeMismatchSearch_ebwtFw;
1706 static Ebwt<String<Dna> >* twoOrThreeMismatchSearch_ebwtBw;
1707 static vector<String<Dna5> >* twoOrThreeMismatchSearch_os;
1708 static SyncBitset* twoOrThreeMismatchSearch_doneMask;
1709 static SyncBitset* twoOrThreeMismatchSearch_hitMask;
1710 static bool twoOrThreeMismatchSearch_two;
1711 static BitPairReference* twoOrThreeMismatchSearch_refs;
1713 #define TWOTHREE_WORKER_SETUP() \
1714 int tid = *((int*)vp); \
1715 PairedPatternSource& _patsrc = *twoOrThreeMismatchSearch_patsrc; \
1716 HitSink& _sink = *twoOrThreeMismatchSearch_sink; \
1717 vector<String<Dna5> >& os = *twoOrThreeMismatchSearch_os; \
1718 bool two = twoOrThreeMismatchSearch_two; \
1719 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); \
1720 PatternSourcePerThread* patsrc = patsrcFact->create(); \
1721 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink); \
1722 HitSinkPerThread* sink = sinkFact->create(); \
1723 /* Per-thread initialization */ \
1724 EbwtSearchParams<String<Dna> > params( \
1725 *sink, /* HitSink */ \
1726 os, /* reference sequences */ \
1727 true, /* read is forward */ \
1728 true); /* index is forward */
1731 * A statefulness-aware worker driver. Uses UnpairedExactAlignerV1.
1733 static void *twoOrThreeMismatchSearchWorkerStateful(void *vp) {
1734 int tid = *((int*)vp);
1735 PairedPatternSource& _patsrc = *twoOrThreeMismatchSearch_patsrc;
1736 HitSink& _sink = *twoOrThreeMismatchSearch_sink;
1737 Ebwt<String<Dna> >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw;
1738 Ebwt<String<Dna> >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw;
1739 vector<String<Dna5> >& os = *twoOrThreeMismatchSearch_os;
1740 BitPairReference* refs = twoOrThreeMismatchSearch_refs;
1741 static bool two = twoOrThreeMismatchSearch_two;
1743 // Global initialization
1744 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
1745 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
1747 ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
1748 Unpaired23mmAlignerV1Factory alSEfact(
1769 Paired23mmAlignerV1Factory alPEfact(
1784 mhits, // for symCeiling
1801 MixedMultiAligner multi(
1809 // MultiAligner must be destroyed before patsrcFact
1815 #ifdef BOWTIE_PTHREADS
1816 if(tid > 0) pthread_exit(NULL);
1821 static void* twoOrThreeMismatchSearchWorkerFull(void *vp) {
1822 TWOTHREE_WORKER_SETUP();
1823 Ebwt<String<Dna> >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw;
1824 Ebwt<String<Dna> >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw;
1825 const BitPairReference* refs = twoOrThreeMismatchSearch_refs;
1826 GreedyDFSRangeSource btr1(
1828 refs, // reference sequence (for colorspace)
1829 0xffffffff, // qualThresh
1830 // Do not impose maximums in 2/3-mismatch mode
1831 0xffffffff, // max backtracks (no limit)
1832 0, // reportPartials (don't)
1833 true, // reportExacts
1834 rangeMode, // reportRanges
1839 false); // considerQuals
1840 GreedyDFSRangeSource bt2(
1842 refs, // reference sequence (for colorspace)
1843 0xffffffff, // qualThresh
1844 // Do not impose maximums in 2/3-mismatch mode
1845 0xffffffff, // max backtracks (no limit)
1846 0, // reportPartials (no)
1847 true, // reportExacts
1848 rangeMode, // reportRanges
1853 false); // considerQuals
1854 GreedyDFSRangeSource bt3(
1856 refs, // reference sequence (for colorspace)
1857 0xffffffff, // qualThresh (none)
1858 // Do not impose maximums in 2/3-mismatch mode
1859 0xffffffff, // max backtracks (no limit)
1860 0, // reportPartials (don't)
1861 true, // reportExacts
1862 rangeMode, // reportRanges
1867 false); // considerQuals
1868 GreedyDFSRangeSource bthh3(
1870 refs, // reference sequence (for colorspace)
1871 0xffffffff, // qualThresh
1872 // Do not impose maximums in 2/3-mismatch mode
1873 0xffffffff, // max backtracks (no limit)
1874 0, // reportPartials (don't)
1875 true, // reportExacts
1876 rangeMode, // reportRanges
1881 false, // considerQuals
1882 true); // halfAndHalf
1883 bool skipped = false;
1884 while(true) { // Read read-in loop
1885 FINISH_READ(patsrc);
1887 patid += 0; // kill unused variable warning
1888 uint32_t plen = length(patFw);
1890 uint32_t s3 = s >> 1; // length of 3' half of seed
1891 uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
1892 #define DONEMASK_SET(p)
1893 #include "search_23mm_phase1.c"
1894 #include "search_23mm_phase2.c"
1895 #include "search_23mm_phase3.c"
1898 FINISH_READ(patsrc);
1899 // Threads join at end of Phase 1
1903 template<typename TStr>
1904 static void twoOrThreeMismatchSearchFull(
1905 PairedPatternSource& _patsrc, /// pattern source
1906 HitSink& _sink, /// hit sink
1907 Ebwt<TStr>& ebwtFw, /// index of original text
1908 Ebwt<TStr>& ebwtBw, /// index of mirror text
1909 vector<String<Dna5> >& os, /// text strings, if available (empty otherwise)
1910 bool two = true) /// true -> 2, false -> 3
1912 // Global initialization
1913 assert(!ebwtFw.isInMemory());
1914 assert(!ebwtBw.isInMemory());
1916 // Load the other half of the index into memory
1917 Timer _t(cerr, "Time loading forward index: ", timing);
1918 ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1921 // Load the other half of the index into memory
1922 Timer _t(cerr, "Time loading mirror index: ", timing);
1923 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
1925 // Create range caches, which are shared among all aligners
1926 BitPairReference *refs = NULL;
1927 bool pair = mates1.size() > 0 || mates12.size() > 0;
1928 if(color || (pair && mixedThresh < 0xffffffff)) {
1929 Timer _t(cerr, "Time loading reference: ", timing);
1930 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
1931 if(!refs->loaded()) throw 1;
1933 twoOrThreeMismatchSearch_refs = refs;
1934 twoOrThreeMismatchSearch_patsrc = &_patsrc;
1935 twoOrThreeMismatchSearch_sink = &_sink;
1936 twoOrThreeMismatchSearch_ebwtFw = &ebwtFw;
1937 twoOrThreeMismatchSearch_ebwtBw = &ebwtBw;
1938 twoOrThreeMismatchSearch_os = &os;
1939 twoOrThreeMismatchSearch_doneMask = NULL;
1940 twoOrThreeMismatchSearch_hitMask = NULL;
1941 twoOrThreeMismatchSearch_two = two;
1943 #ifdef BOWTIE_PTHREADS
1944 AutoArray<pthread_t> threads(nthreads-1);
1945 AutoArray<int> tids(nthreads-1);
1949 Timer _t(cerr, "End-to-end 2/3-mismatch full-index search: ", timing);
1950 #ifdef BOWTIE_PTHREADS
1951 for(int i = 0; i < nthreads-1; i++) {
1954 createThread(&threads[i], twoOrThreeMismatchSearchWorkerStateful, (void *)&tids[i]);
1956 createThread(&threads[i], twoOrThreeMismatchSearchWorkerFull, (void *)&tids[i]);
1960 if(stateful) twoOrThreeMismatchSearchWorkerStateful((void*)&tmp);
1961 else twoOrThreeMismatchSearchWorkerFull((void*)&tmp);
1962 #ifdef BOWTIE_PTHREADS
1963 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
1966 if(refs != NULL) delete refs;
1970 static PairedPatternSource* seededQualSearch_patsrc;
1971 static HitSink* seededQualSearch_sink;
1972 static Ebwt<String<Dna> >* seededQualSearch_ebwtFw;
1973 static Ebwt<String<Dna> >* seededQualSearch_ebwtBw;
1974 static vector<String<Dna5> >* seededQualSearch_os;
1975 static SyncBitset* seededQualSearch_doneMask;
1976 static SyncBitset* seededQualSearch_hitMask;
1977 static PartialAlignmentManager* seededQualSearch_pamFw;
1978 static PartialAlignmentManager* seededQualSearch_pamRc;
1979 static int seededQualSearch_qualCutoff;
1980 static BitPairReference* seededQualSearch_refs;
1982 #define SEEDEDQUAL_WORKER_SETUP() \
1983 int tid = *((int*)vp); \
1984 PairedPatternSource& _patsrc = *seededQualSearch_patsrc; \
1985 HitSink& _sink = *seededQualSearch_sink; \
1986 vector<String<Dna5> >& os = *seededQualSearch_os; \
1987 int qualCutoff = seededQualSearch_qualCutoff; \
1988 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid); \
1989 PatternSourcePerThread* patsrc = patsrcFact->create(); \
1990 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink); \
1991 HitSinkPerThread* sink = sinkFact->create(); \
1992 /* Per-thread initialization */ \
1993 EbwtSearchParams<String<Dna> > params( \
1994 *sink, /* HitSink */ \
1995 os, /* reference sequences */ \
1996 true, /* read is forward */ \
1997 true); /* index is forward */
1999 static void* seededQualSearchWorkerFull(void *vp) {
2000 SEEDEDQUAL_WORKER_SETUP();
2001 Ebwt<String<Dna> >& ebwtFw = *seededQualSearch_ebwtFw;
2002 Ebwt<String<Dna> >& ebwtBw = *seededQualSearch_ebwtBw;
2003 PartialAlignmentManager * pamRc = NULL;
2004 PartialAlignmentManager * pamFw = NULL;
2006 pamRc = new PartialAlignmentManager(64);
2007 pamFw = new PartialAlignmentManager(64);
2009 vector<PartialAlignment> pals;
2010 const BitPairReference* refs = seededQualSearch_refs;
2011 // GreedyDFSRangeSource for finding exact hits for the forward-
2013 GreedyDFSRangeSource btf1(
2015 refs, // reference sequence (for colorspace)
2016 qualCutoff, // qualThresh
2017 maxBtsBetter, // max backtracks
2018 0, // reportPartials (don't)
2019 true, // reportExacts
2020 rangeMode, // reportRanges
2025 false); // considerQuals
2026 GreedyDFSRangeSource bt1(
2028 refs, // reference sequence (for colorspace)
2029 qualCutoff, // qualThresh
2030 maxBtsBetter, // max backtracks
2031 0, // reportPartials (don't)
2032 true, // reportExacts
2033 rangeMode, // reportRanges
2037 &os, // reference sequences
2038 true, // considerQuals
2039 false, !noMaqRound);
2040 // GreedyDFSRangeSource to search for hits for cases 1F, 2F, 3F
2041 GreedyDFSRangeSource btf2(
2043 refs, // reference sequence (for colorspace)
2044 qualCutoff, // qualThresh
2045 maxBtsBetter, // max backtracks
2046 0, // reportPartials (no)
2047 true, // reportExacts
2048 rangeMode, // reportRanges
2049 NULL, // partial alignment manager
2052 &os, // reference sequences
2053 true, // considerQuals
2054 false, !noMaqRound);
2055 // GreedyDFSRangeSource to search for partial alignments for case 4R
2056 GreedyDFSRangeSource btr2(
2058 refs, // reference sequence (for colorspace)
2059 qualCutoff, // qualThresh (none)
2060 maxBtsBetter, // max backtracks
2061 seedMms, // report partials (up to seedMms mms)
2062 true, // reportExacts
2063 rangeMode, // reportRanges
2064 pamRc, // partial alignment manager
2067 &os, // reference sequences
2068 true, // considerQuals
2069 false, !noMaqRound);
2070 // GreedyDFSRangeSource to search for seedlings for case 4F
2071 GreedyDFSRangeSource btf3(
2073 refs, // reference sequence (for colorspace)
2074 qualCutoff, // qualThresh (none)
2075 maxBtsBetter, // max backtracks
2076 seedMms, // reportPartials (do)
2077 true, // reportExacts
2078 rangeMode, // reportRanges
2082 &os, // reference sequences
2083 true, // considerQuals
2084 false, !noMaqRound);
2085 // GreedyDFSRangeSource to search for hits for case 4R by extending
2086 // the partial alignments found in Phase 2
2087 GreedyDFSRangeSource btr3(
2089 refs, // reference sequence (for colorspace)
2090 qualCutoff, // qualThresh
2091 maxBtsBetter, // max backtracks
2092 0, // reportPartials (don't)
2093 true, // reportExacts
2094 rangeMode,// reportRanges
2098 &os, // reference sequences
2099 true, // considerQuals
2100 false, !noMaqRound);
2101 // The half-and-half GreedyDFSRangeSource
2102 GreedyDFSRangeSource btr23(
2104 refs, // reference sequence (for colorspace)
2105 qualCutoff, // qualThresh
2106 maxBtsBetter, // max backtracks
2107 0, // reportPartials (don't)
2108 true, // reportExacts
2109 rangeMode,// reportRanges
2114 true, // considerQuals
2115 true, // halfAndHalf
2117 // GreedyDFSRangeSource to search for hits for case 4F by extending
2118 // the partial alignments found in Phase 3
2119 GreedyDFSRangeSource btf4(
2121 refs, // reference sequence (for colorspace)
2122 qualCutoff, // qualThresh
2123 maxBtsBetter, // max backtracks
2124 0, // reportPartials (don't)
2125 true, // reportExacts
2126 rangeMode,// reportRanges
2130 &os, // reference sequences
2131 true, // considerQuals
2132 false, !noMaqRound);
2133 // Half-and-half GreedyDFSRangeSource for forward read
2134 GreedyDFSRangeSource btf24(
2136 refs, // reference sequence (for colorspace)
2137 qualCutoff, // qualThresh
2138 maxBtsBetter, // max backtracks
2139 0, // reportPartials (don't)
2140 true, // reportExacts
2141 rangeMode,// reportRanges
2146 true, // considerQuals
2147 true, // halfAndHalf
2149 String<QueryMutation> muts;
2150 bool skipped = false;
2152 FINISH_READ(patsrc);
2154 size_t plen = length(patFw);
2155 uint32_t s = seedLen;
2156 uint32_t s3 = (s >> 1); /* length of 3' half of seed */
2157 uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */
2158 uint32_t qs = min<uint32_t>(plen, s);
2159 uint32_t qs3 = qs >> 1;
2160 uint32_t qs5 = (qs >> 1) + (qs & 1);
2161 #define DONEMASK_SET(p)
2162 #include "search_seeded_phase1.c"
2163 #include "search_seeded_phase2.c"
2164 #include "search_seeded_phase3.c"
2165 #include "search_seeded_phase4.c"
2168 FINISH_READ(patsrc);
2176 static void* seededQualSearchWorkerFullStateful(void *vp) {
2177 int tid = *((int*)vp);
2178 PairedPatternSource& _patsrc = *seededQualSearch_patsrc;
2179 HitSink& _sink = *seededQualSearch_sink;
2180 Ebwt<String<Dna> >& ebwtFw = *seededQualSearch_ebwtFw;
2181 Ebwt<String<Dna> >& ebwtBw = *seededQualSearch_ebwtBw;
2182 vector<String<Dna5> >& os = *seededQualSearch_os;
2183 int qualCutoff = seededQualSearch_qualCutoff;
2184 BitPairReference* refs = seededQualSearch_refs;
2186 // Global initialization
2187 PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid);
2188 HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink);
2189 ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
2191 AlignerMetrics *metrics = NULL;
2193 metrics = new AlignerMetrics();
2195 UnpairedSeedAlignerFactory alSEfact(
2220 PairedSeedAlignerFactory alPEfact(
2238 mhits, // for symCeiling
2256 MixedMultiAligner multi(
2264 // MultiAligner must be destroyed before patsrcFact
2266 if(metrics != NULL) {
2267 metrics->printSummary();
2274 #ifdef BOWTIE_PTHREADS
2283 * Search for a good alignments for each read using criteria that
2284 * correspond somewhat faithfully to Maq's. Search is aided by a pair
2285 * of Ebwt indexes, one for the original references, and one for the
2286 * transpose of the references. Neither index should be loaded upon
2287 * entry to this function.
2289 * Like Maq, we treat the first 24 base pairs of the read (those
2290 * closest to the 5' end) differently from the remainder of the read.
2291 * We call the first 24 base pairs the "seed."
2293 template<typename TStr>
2294 static void seededQualCutoffSearchFull(
2295 int seedLen, /// length of seed (not a maq option)
2296 int qualCutoff, /// maximum sum of mismatch qualities
2297 /// like maq map's -e option
2299 int seedMms, /// max # mismatches allowed in seed
2300 /// (like maq map's -n option)
2301 /// Can only be 1 or 2, default: 1
2302 PairedPatternSource& _patsrc, /// pattern source
2303 HitSink& _sink, /// hit sink
2304 Ebwt<TStr>& ebwtFw, /// index of original text
2305 Ebwt<TStr>& ebwtBw, /// index of mirror text
2306 vector<String<Dna5> >& os) /// text strings, if available (empty otherwise)
2308 // Global intialization
2309 assert_leq(seedMms, 3);
2311 seededQualSearch_patsrc = &_patsrc;
2312 seededQualSearch_sink = &_sink;
2313 seededQualSearch_ebwtFw = &ebwtFw;
2314 seededQualSearch_ebwtBw = &ebwtBw;
2315 seededQualSearch_os = &os;
2316 seededQualSearch_doneMask = NULL;
2317 seededQualSearch_hitMask = NULL;
2318 seededQualSearch_pamFw = NULL;
2319 seededQualSearch_pamRc = NULL;
2320 seededQualSearch_qualCutoff = qualCutoff;
2322 // Create range caches, which are shared among all aligners
2323 BitPairReference *refs = NULL;
2324 bool pair = mates1.size() > 0 || mates12.size() > 0;
2325 if(color || (pair && mixedThresh < 0xffffffff)) {
2326 Timer _t(cerr, "Time loading reference: ", timing);
2327 refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
2328 if(!refs->loaded()) throw 1;
2330 seededQualSearch_refs = refs;
2332 #ifdef BOWTIE_PTHREADS
2333 AutoArray<pthread_t> threads(nthreads-1);
2334 AutoArray<int> tids(nthreads-1);
2337 SWITCH_TO_FW_INDEX();
2338 assert(!ebwtBw.isInMemory());
2340 // Load the other half of the index into memory
2341 Timer _t(cerr, "Time loading mirror index: ", timing);
2342 ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
2346 // Phase 1: Consider cases 1R and 2R
2347 Timer _t(cerr, "Seeded quality full-index search: ", timing);
2348 #ifdef BOWTIE_PTHREADS
2349 for(int i = 0; i < nthreads-1; i++) {
2351 if(stateful) createThread(&threads[i],
2352 seededQualSearchWorkerFullStateful,
2354 else createThread(&threads[i],
2355 seededQualSearchWorkerFull,
2360 if(stateful) seededQualSearchWorkerFullStateful((void*)&tmp);
2361 else seededQualSearchWorkerFull((void*)&tmp);
2362 #ifdef BOWTIE_PTHREADS
2363 for(int i = 0; i < nthreads-1; i++) joinThread(threads[i]);
2369 ebwtBw.evictFromMemory();
2373 * Return a new dynamically allocated PatternSource for the given
2374 * format, using the given list of strings as the filenames to read
2375 * from or as the sequences themselves (i.e. if -c was used).
2377 static PatternSource*
2378 patsrcFromStrings(int format,
2379 const vector<string>& reads,
2380 const vector<string>* quals)
2384 return new FastaPatternSource (seed, reads, quals, color,
2387 patDumpfile, verbose,
2389 solexaQuals, phred64Quals,
2393 return new FastaContinuousPatternSource (
2394 seed, reads, fastaContLen,
2397 patDumpfile, verbose,
2400 return new RawPatternSource (seed, reads, color,
2403 patDumpfile, verbose,
2407 return new FastqPatternSource (seed, reads, color,
2410 patDumpfile, verbose,
2412 solexaQuals, phred64Quals,
2413 integerQuals, fuzzy,
2416 return new TabbedPatternSource(seed, reads, color,
2419 patDumpfile, verbose,
2423 return new VectorPatternSource(seed, reads, color,
2426 patDumpfile, verbose,
2430 return new ChainPatternSource (seed, reads, useSpinlock, patDumpfile,
2431 verbose, skipReads);
2433 return new RandomPatternSource(seed, 2000000, lenRandomReads,
2434 useSpinlock, patDumpfile,
2437 cerr << "Internal error; bad patsrc format: " << format << endl;
2443 #define PASS_DUMP_FILES dumpAlBase, dumpUnalBase, dumpMaxBase
2445 static string argstr;
2447 template<typename TStr>
2448 static void driver(const char * type,
2449 const string& ebwtFileBase,
2450 const string& query,
2451 const vector<string>& queries,
2452 const vector<string>& qualities,
2453 const string& outfile)
2455 if(verbose || startVerbose) {
2456 cerr << "Entered driver(): "; logTime(cerr, true);
2458 // Vector of the reference sequences; used for sanity-checking
2459 vector<String<Dna5> > os;
2460 // Read reference sequences from the command-line or from a FASTA file
2461 if(!origString.empty()) {
2462 // Determine if it's a file by looking at whether it has a FASTA-like
2464 size_t len = origString.length();
2465 if((len >= 6 && origString.substr(len-6) == ".fasta") ||
2466 (len >= 4 && origString.substr(len-4) == ".mfa") ||
2467 (len >= 4 && origString.substr(len-4) == ".fas") ||
2468 (len >= 4 && origString.substr(len-4) == ".fna") ||
2469 (len >= 3 && origString.substr(len-3) == ".fa"))
2472 vector<string> origFiles;
2473 tokenize(origString, ",", origFiles);
2474 readSequenceFiles<String<Dna5>, Fasta>(origFiles, os);
2477 readSequenceString(origString, os);
2481 adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
2483 vector<PatternSource*> patsrcs_a;
2484 vector<PatternSource*> patsrcs_b;
2485 vector<PatternSource*> patsrcs_ab;
2487 // If there were any first-mates specified, we will operate in
2489 bool paired = mates1.size() > 0 || mates12.size() > 0;
2490 if(paired) stateful = true;
2492 // Create list of pattern sources for paired reads appearing
2493 // interleaved in a single file
2494 if(verbose || startVerbose) {
2495 cerr << "Creating paired-end patsrcs: "; logTime(cerr, true);
2497 for(size_t i = 0; i < mates12.size(); i++) {
2498 const vector<string>* qs = &mates12;
2501 // Feed query files one to each PatternSource
2503 tmp.push_back(mates12[i]);
2504 assert_eq(1, tmp.size());
2506 patsrcs_ab.push_back(patsrcFromStrings(format, *qs, NULL));
2512 // Create list of pattern sources for paired reads
2513 for(size_t i = 0; i < mates1.size(); i++) {
2514 const vector<string>* qs = &mates1;
2515 const vector<string>* quals = &qualities1;
2516 vector<string> tmpSeq;
2517 vector<string> tmpQual;
2519 // Feed query files one to each PatternSource
2521 tmpSeq.push_back(mates1[i]);
2523 tmpQual.push_back(qualities1[i]);
2524 assert_eq(1, tmpSeq.size());
2526 if(quals->empty()) quals = NULL;
2527 patsrcs_a.push_back(patsrcFromStrings(format, *qs, quals));
2533 // Create list of pattern sources for paired reads
2534 for(size_t i = 0; i < mates2.size(); i++) {
2535 const vector<string>* qs = &mates2;
2536 const vector<string>* quals = &qualities2;
2537 vector<string> tmpSeq;
2538 vector<string> tmpQual;
2540 // Feed query files one to each PatternSource
2542 tmpSeq.push_back(mates2[i]);
2544 tmpQual.push_back(qualities2[i]);
2545 assert_eq(1, tmpSeq.size());
2547 if(quals->empty()) quals = NULL;
2548 patsrcs_b.push_back(patsrcFromStrings(format, *qs, quals));
2553 // All mates/mate files must be paired
2554 assert_eq(patsrcs_a.size(), patsrcs_b.size());
2556 // Create list of pattern sources for the unpaired reads
2557 if(verbose || startVerbose) {
2558 cerr << "Creating single-end patsrcs: "; logTime(cerr, true);
2560 for(size_t i = 0; i < queries.size(); i++) {
2561 const vector<string>* qs = &queries;
2562 const vector<string>* quals = &qualities;
2563 PatternSource* patsrc = NULL;
2564 vector<string> tmpSeq;
2565 vector<string> tmpQual;
2567 // Feed query files one to each PatternSource
2569 tmpSeq.push_back(queries[i]);
2571 tmpQual.push_back(qualities[i]);
2572 assert_eq(1, tmpSeq.size());
2574 if(quals->empty()) quals = NULL;
2575 patsrc = patsrcFromStrings(format, *qs, quals);
2576 assert(patsrc != NULL);
2577 patsrcs_a.push_back(patsrc);
2578 patsrcs_b.push_back(NULL);
2584 if(verbose || startVerbose) {
2585 cerr << "Creating PatternSource: "; logTime(cerr, true);
2587 PairedPatternSource *patsrc = NULL;
2588 if(mates12.size() > 0) {
2589 patsrc = new PairedSoloPatternSource(patsrcs_ab, seed);
2591 patsrc = new PairedDualPatternSource(patsrcs_a, patsrcs_b, seed);
2594 // Open hit output file
2595 if(verbose || startVerbose) {
2596 cerr << "Opening hit output file: "; logTime(cerr, true);
2599 if(!outfile.empty()) {
2603 cerr << "Warning: ignoring alignment output file " << outfile << " because --refout was specified" << endl;
2606 fout = new OutFileBuf(outfile.c_str(), false);
2609 if(outType == OUTPUT_CHAIN && !refOut) {
2610 cerr << "Error: Must specify an output file when output mode is --chain" << endl;
2613 fout = new OutFileBuf();
2615 ReferenceMap* rmap = NULL;
2616 if(refMapFile != NULL) {
2617 if(verbose || startVerbose) {
2618 cerr << "About to load in a reference map file with name "
2619 << refMapFile << ": "; logTime(cerr, true);
2621 rmap = new ReferenceMap(refMapFile, !noRefNames);
2623 AnnotationMap* amap = NULL;
2624 if(annotMapFile != NULL) {
2625 if(verbose || startVerbose) {
2626 cerr << "About to load in an annotation map file with name "
2627 << annotMapFile << ": "; logTime(cerr, true);
2629 amap = new AnnotationMap(annotMapFile);
2631 // Initialize Ebwt object and read in header
2632 if(verbose || startVerbose) {
2633 cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
2635 Ebwt<TStr> ebwt(adjustedEbwtFileBase,
2636 color, // index is colorspace
2637 -1, // don't care about entireReverse
2638 true, // index is for the forward direction
2639 /* overriding: */ offRate,
2640 /* overriding: */ isaRate,
2641 useMm, // whether to use memory-mapped files
2642 useShmem, // whether to use shared memory
2643 mmSweep, // sweep memory-mapped files
2644 !noRefNames, // load names?
2645 rmap, // reference map, or NULL if none is needed
2646 verbose, // whether to be talkative
2647 startVerbose, // talkative during initialization
2648 false /*passMemExc*/,
2650 Ebwt<TStr>* ebwtBw = NULL;
2651 // We need the mirror index if mismatches are allowed
2652 if(mismatches > 0 || maqLike) {
2653 if(verbose || startVerbose) {
2654 cerr << "About to initialize rev Ebwt: "; logTime(cerr, true);
2656 ebwtBw = new Ebwt<TStr>(
2657 adjustedEbwtFileBase + ".rev",
2658 color, // index is colorspace
2659 -1, // don't care about entireReverse
2660 false, // index is for the reverse direction
2661 /* overriding: */ offRate,
2662 /* overriding: */ isaRate,
2663 useMm, // whether to use memory-mapped files
2664 useShmem, // whether to use shared memory
2665 mmSweep, // sweep memory-mapped files
2666 !noRefNames, // load names?
2667 rmap, // reference map, or NULL if none is needed
2668 verbose, // whether to be talkative
2669 startVerbose, // talkative during initialization
2670 false /*passMemExc*/,
2674 for(size_t i = 0; i < os.size(); i++) {
2675 size_t olen = seqan::length(os[i]);
2676 int longestStretch = 0;
2678 for(size_t j = 0; j < olen; j++) {
2679 if((int)os[i][j] < 4) {
2681 if(curStretch > longestStretch) longestStretch = curStretch;
2686 if(longestStretch < (color ? 2 : 1)) {
2687 os.erase(os.begin() + i);
2692 if(sanityCheck && !os.empty()) {
2693 // Sanity check number of patterns and pattern lengths in Ebwt
2694 // against original strings
2695 assert_eq(os.size(), ebwt.nPat());
2696 for(size_t i = 0; i < os.size(); i++) {
2697 assert_eq(length(os[i]), ebwt.plen()[i] + (color ? 1 : 0));
2699 ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
2700 ebwt.checkOrigs(os, color, false);
2701 ebwt.evictFromMemory();
2704 Timer _t(cerr, "Time searching: ", timing);
2705 if(verbose || startVerbose) {
2706 cerr << "Creating HitSink: "; logTime(cerr, true);
2708 // Set up hit sink; if sanityCheck && !os.empty() is true,
2709 // then instruct the sink to "retain" hits in a vector in
2710 // memory so that we can easily sanity check them later on
2712 RecalTable *table = NULL;
2714 table = new RecalTable(recalMaxCycle, recalMaxQual, recalQualShift);
2716 vector<string>* refnames = &ebwt.refnames();
2717 if(noRefNames) refnames = NULL;
2721 sink = new VerboseHitSink(
2722 ebwt.nPat(), offBase,
2723 colorSeq, colorQual, printCost,
2724 suppressOuts, rmap, amap,
2725 fullRef, PASS_DUMP_FILES,
2726 format == TAB_MATE, sampleMax,
2727 table, refnames, partitionSz);
2729 sink = new VerboseHitSink(
2731 colorSeq, colorQual, printCost,
2732 suppressOuts, rmap, amap,
2733 fullRef, PASS_DUMP_FILES,
2734 format == TAB_MATE, sampleMax,
2735 table, refnames, partitionSz);
2742 SAMHitSink *sam = new SAMHitSink(
2743 fout, 1, rmap, amap,
2744 fullRef, defaultMapq,
2746 format == TAB_MATE, sampleMax,
2749 vector<string> refnames;
2751 readEbwtRefnames(adjustedEbwtFileBase, refnames);
2754 sam->out(0), ebwt.nPat(),
2755 refnames, color, samNoSQ, rmap,
2756 ebwt.plen(), fullRef,
2758 rgs.empty() ? NULL : rgs.c_str());
2763 case OUTPUT_CONCISE:
2765 sink = new ConciseHitSink(
2766 ebwt.nPat(), offBase,
2768 format == TAB_MATE, sampleMax,
2769 table, refnames, reportOpps);
2771 sink = new ConciseHitSink(
2774 format == TAB_MATE, sampleMax,
2775 table, refnames, reportOpps);
2780 sink = new ChainingHitSink(
2787 sink = new StubHitSink();
2790 cerr << "Invalid output type: " << outType << endl;
2793 if(verbose || startVerbose) {
2794 cerr << "Dispatching to search driver: "; logTime(cerr, true);
2797 seededQualCutoffSearchFull(seedLen,
2802 ebwt, // forward index
2803 *ebwtBw, // mirror index (not optional)
2804 os); // references, if available
2806 else if(mismatches > 0) {
2807 if(mismatches == 1) {
2808 assert(ebwtBw != NULL);
2809 mismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os);
2810 } else if(mismatches == 2 || mismatches == 3) {
2811 twoOrThreeMismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os, mismatches == 2);
2813 cerr << "Error: " << mismatches << " is not a supported number of mismatches" << endl;
2817 // Search without mismatches
2818 // Note that --fast doesn't make a difference here because
2819 // we're only loading half of the index anyway
2820 exactSearch(*patsrc, *sink, ebwt, os);
2822 // Evict any loaded indexes from memory
2823 if(ebwt.isInMemory()) {
2824 ebwt.evictFromMemory();
2826 if(ebwtBw != NULL) {
2830 sink->finish(hadoopOut); // end the hits section of the hit file
2832 for(size_t i = 0; i < patsrcs_a.size(); i++) {
2833 assert(patsrcs_a[i] != NULL);
2834 delete patsrcs_a[i];
2836 for(size_t i = 0; i < patsrcs_b.size(); i++) {
2837 if(patsrcs_b[i] != NULL) {
2838 delete patsrcs_b[i];
2841 for(size_t i = 0; i < patsrcs_ab.size(); i++) {
2842 if(patsrcs_ab[i] != NULL) {
2843 delete patsrcs_ab[i];
2850 if(fout != NULL) delete fout;
2854 // C++ name mangling is disabled for the bowtie() function to make it
2855 // easier to use Bowtie as a library.
2859 * Main bowtie entry function. Parses argc/argv style command-line
2860 * options, sets global configuration variables, and calls the driver()
2863 int bowtie(int argc, const char **argv) {
2865 // Reset all global state, including getopt state
2866 opterr = optind = 1;
2868 for(int i = 0; i < argc; i++) {
2870 if(i < argc-1) argstr += " ";
2872 string ebwtFile; // read serialized Ebwt from this file
2873 string query; // read query string(s) from this file
2874 vector<string> queries;
2875 string outfile; // write query results to this file
2876 if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
2877 parseOptions(argc, argv);
2880 cout << argv0 << " version " << BOWTIE_VERSION << endl;
2881 if(sizeof(void*) == 4) {
2882 cout << "32-bit" << endl;
2883 } else if(sizeof(void*) == 8) {
2884 cout << "64-bit" << endl;
2886 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
2888 cout << "Built on " << BUILD_HOST << endl;
2889 cout << BUILD_TIME << endl;
2890 cout << "Compiler: " << COMPILER_VERSION << endl;
2891 cout << "Options: " << COMPILER_OPTIONS << endl;
2892 cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
2894 << ", " << sizeof(long) << ", " << sizeof(long long)
2895 << ", " << sizeof(void *) << ", " << sizeof(size_t)
2896 << ", " << sizeof(off_t) << "}" << endl;
2899 #ifdef CHUD_PROFILING
2901 chudAcquireRemoteAccess();
2904 Timer _t(cerr, "Overall time: ", timing);
2906 cerr << "Parsing index and read arguments: "; logTime(cerr, true);
2909 // Get index basename
2910 if(optind >= argc) {
2911 cerr << "No index, query, or output file specified!" << endl;
2915 ebwtFile = argv[optind++];
2917 // Get query filename
2918 if(optind >= argc) {
2919 if(mates1.size() > 0 || mates12.size() > 0) {
2922 cerr << "No query or output file specified!" << endl;
2926 } else if (mates1.size() == 0 && mates12.size() == 0) {
2927 query = argv[optind++];
2928 // Tokenize the list of query files
2929 tokenize(query, ",", queries);
2930 if(queries.size() < 1) {
2931 cerr << "Tokenized query file list was empty!" << endl;
2937 // Get output filename
2939 outfile = argv[optind++];
2942 // Extra parametesr?
2944 cerr << "Extra parameter(s) specified: ";
2945 for(int i = optind; i < argc; i++) {
2946 cerr << "\"" << argv[i] << "\"";
2947 if(i < argc-1) cerr << ", ";
2950 if(mates1.size() > 0) {
2951 cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
2952 << "also be specified. Please run bowtie separately for mates and singles." << endl;
2957 // Optionally summarize
2959 cout << "Input ebwt file: \"" << ebwtFile << "\"" << endl;
2960 cout << "Query inputs (DNA, " << file_format_names[format] << "):" << endl;
2961 for(size_t i = 0; i < queries.size(); i++) {
2962 cout << " " << queries[i] << endl;
2964 cout << "Quality inputs:" << endl;
2965 for(size_t i = 0; i < qualities.size(); i++) {
2966 cout << " " << qualities[i] << endl;
2968 cout << "Output file: \"" << outfile << "\"" << endl;
2969 cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
2970 cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
2972 cout << "Assertions: disabled" << endl;
2974 cout << "Assertions: enabled" << endl;
2978 cout << "Press key to continue..." << endl;
2981 driver<String<Dna, Alloc<> > >("DNA", ebwtFile, query, queries, qualities, outfile);
2984 #ifdef CHUD_PROFILING
2985 chudReleaseRemoteAccess();
2988 } catch(exception& e) {
2989 cerr << "Command: ";
2990 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
2995 cerr << "Command: ";
2996 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";