5 #include <seqan/index.h>
6 #include <seqan/sequence.h>
7 #include <seqan/file.h>
9 #include "assert_helpers.h"
10 #include "endian_swap.h"
13 #include "sequence_io.h"
18 #include "reference.h"
21 * \file Driver for the bowtie-build indexing tool.
26 static int sanityCheck;
29 static uint32_t bmaxMultSqrt;
30 static uint32_t bmaxDivN;
35 static int showVersion;
36 static bool doubleEbwt;
38 static int32_t lineRate;
39 static int32_t linesPerSide;
40 static int32_t offRate;
41 static int32_t ftabChars;
48 static int reverseType;
51 static void resetOptions() {
52 verbose = true; // be talkative (default)
53 sanityCheck = 0; // do slow sanity checks
54 format = FASTA; // input sequence format
55 bmax = 0xffffffff; // max blockwise SA bucket size
56 bmaxMultSqrt = 0xffffffff; // same, as multplier of sqrt(n)
57 bmaxDivN = 4; // same, as divisor of n
58 dcv = 1024; // bwise SA difference-cover sample sz
59 noDc = 0; // disable difference-cover sample
60 entireSA = 0; // 1 = disable blockwise SA
61 seed = 0; // srandom seed
62 showVersion = 0; // just print version and quit?
63 doubleEbwt = true; // build forward and reverse Ebwts
65 lineRate = 6; // a "line" is 64 bytes
66 linesPerSide = 1; // 1 64-byte line on a side
67 offRate = 5; // sample 1 out of 32 SA elts
68 ftabChars = 10; // 10 chars in initial lookup table
69 bigEndian = 0; // little endian
70 nsToAs = false; // convert reference Ns to As prior to indexing
71 autoMem = true; // automatically adjust memory usage parameters
73 writeRef = true; // write compact reference to .3.ebwt/.4.ebwt
74 justRef = false; // *just* write compact reference, don't index
75 reverseType = REF_READ_REVERSE_EACH;
79 // Argument constants for getopts
94 * Print a detailed usage message to the provided output stream.
96 static void printUsage(ostream& out) {
97 out << "Usage: bowtie-build [options]* <reference_in> <ebwt_outfile_base>" << endl
98 << " reference_in comma-separated list of files with ref sequences" << endl
99 << " ebwt_outfile_base write Ebwt data to files with this dir/basename" << endl
100 << "Options:" << endl
101 << " -f reference files are Fasta (default)" << endl
102 << " -c reference sequences given on cmd line (as <seq_in>)" << endl
103 << " -C/--color build a colorspace index" << endl
104 << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl
105 << " -p/--packed use packed strings internally; slower, uses less mem" << endl
106 << " -B build both letter- and colorspace indexes" << endl
107 << " --bmax <int> max bucket sz for blockwise suffix-array builder" << endl
108 //<< " --bmaxmultsqrt <int> max bucket sz as multiple of sqrt(ref len)" << endl
109 << " --bmaxdivn <int> max bucket sz as divisor of ref len (default: 4)" << endl
110 << " --dcv <int> diff-cover period for blockwise (default: 1024)" << endl
111 << " --nodc disable diff-cover (algorithm becomes quadratic)" << endl
112 << " -r/--noref don't build .3/.4.ebwt (packed reference) portion" << endl
113 << " -3/--justref just build .3/.4.ebwt (packed reference) portion" << endl
114 << " -o/--offrate <int> SA is sampled every 2^offRate BWT chars (default: 5)" << endl
115 << " -t/--ftabchars <int> # of chars consumed in initial lookup (default: 10)" << endl
116 << " --ntoa convert Ns in reference to As" << endl
117 //<< " --big --little endianness (default: little, this host: "
118 //<< (currentlyBigEndian()? "big":"little") << ")" << endl
119 << " --seed <int> seed for random number generator" << endl
120 //<< " --new-reverse concatenate then reverse stretches, not vice versa" << endl
121 << " -q/--quiet verbose output (for debugging)" << endl
122 << " -h/--help print detailed description of tool and its options" << endl
123 << " --usage print this usage message" << endl
124 << " --version print version information and quit" << endl
128 static const char *short_options = "qraph?nscfl:i:o:t:h:3C";
130 static struct option long_options[] = {
131 {(char*)"quiet", no_argument, 0, 'q'},
132 {(char*)"sanity", no_argument, 0, 's'},
133 {(char*)"packed", no_argument, 0, 'p'},
134 {(char*)"little", no_argument, &bigEndian, 0},
135 {(char*)"big", no_argument, &bigEndian, 1},
136 {(char*)"bmax", required_argument, 0, ARG_BMAX},
137 {(char*)"bmaxmultsqrt", required_argument, 0, ARG_BMAX_MULT},
138 {(char*)"bmaxdivn", required_argument, 0, ARG_BMAX_DIV},
139 {(char*)"dcv", required_argument, 0, ARG_DCV},
140 {(char*)"nodc", no_argument, &noDc, 1},
141 {(char*)"seed", required_argument, 0, ARG_SEED},
142 {(char*)"entiresa", no_argument, &entireSA, 1},
143 {(char*)"version", no_argument, &showVersion, 1},
144 {(char*)"noauto", no_argument, 0, 'a'},
145 {(char*)"noblocks", required_argument, 0, 'n'},
146 {(char*)"linerate", required_argument, 0, 'l'},
147 {(char*)"linesperside", required_argument, 0, 'i'},
148 {(char*)"offrate", required_argument, 0, 'o'},
149 {(char*)"ftabchars", required_argument, 0, 't'},
150 {(char*)"help", no_argument, 0, 'h'},
151 {(char*)"ntoa", no_argument, 0, ARG_NTOA},
152 {(char*)"justref", no_argument, 0, '3'},
153 {(char*)"noref", no_argument, 0, 'r'},
154 {(char*)"color", no_argument, 0, 'C'},
155 {(char*)"usage", no_argument, 0, ARG_USAGE},
156 {(char*)"new-reverse", no_argument, 0, ARG_NEW_REVERSE},
157 {(char*)0, 0, 0, 0} // terminator
161 * Parse an int out of optarg and enforce that it be at least 'lower';
162 * if it is less than 'lower', then output the given error message and
163 * exit with an error and a usage message.
166 static int parseNumber(T lower, const char *errmsg) {
168 T t = (T)strtoll(optarg, &endPtr, 10);
169 if (endPtr != NULL) {
171 cerr << errmsg << endl;
177 cerr << errmsg << endl;
184 * Read command-line arguments
186 static void parseOptions(int argc, const char **argv) {
187 int option_index = 0;
190 next_option = getopt_long(
191 argc, const_cast<char**>(argv),
192 short_options, long_options, &option_index);
193 switch (next_option) {
194 case 'f': format = FASTA; break;
195 case 'c': format = CMDLINE; break;
196 case 'p': packed = true; break;
197 case 'C': color = true; break;
199 lineRate = parseNumber<int>(3, "-l/--lineRate arg must be at least 3");
202 linesPerSide = parseNumber<int>(1, "-i/--linesPerSide arg must be at least 1");
205 offRate = parseNumber<int>(0, "-o/--offRate arg must be at least 0");
211 ftabChars = parseNumber<int>(1, "-t/--ftabChars arg must be at least 1");
214 // all f-s is used to mean "not set", so put 'e' on end
223 bmax = parseNumber<uint32_t>(1, "--bmax arg must be at least 1");
224 bmaxMultSqrt = 0xffffffff; // don't use multSqrt
225 bmaxDivN = 0xffffffff; // don't use multSqrt
228 bmaxMultSqrt = parseNumber<uint32_t>(1, "--bmaxmultsqrt arg must be at least 1");
229 bmax = 0xffffffff; // don't use bmax
230 bmaxDivN = 0xffffffff; // don't use multSqrt
233 bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
234 bmax = 0xffffffff; // don't use bmax
235 bmaxMultSqrt = 0xffffffff; // don't use multSqrt
238 dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
241 seed = parseNumber<int>(0, "--seed arg must be at least 0");
243 case ARG_NTOA: nsToAs = true; break;
244 case ARG_NEW_REVERSE: reverseType = REF_READ_REVERSE; break;
245 case 'a': autoMem = false; break;
246 case 'q': verbose = false; break;
247 case 's': sanityCheck = true; break;
248 case 'r': writeRef = false; break;
250 case -1: /* Done with options. */
253 if (long_options[option_index].flag != 0)
259 } while(next_option != -1);
261 cerr << "Warning: specified bmax is very small (" << bmax << "). This can lead to" << endl
262 << "extremely slow performance and memory exhaustion. Perhaps you meant to specify" << endl
263 << "a small --bmaxdivn?" << endl;
268 * Drive the Ebwt construction process and optionally sanity-check the
271 template<typename TStr>
272 static void driver(const string& infile,
273 vector<string>& infiles,
274 const string& outfile,
275 bool reverse = false)
278 bool bisulfite = false;
279 RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
280 assert_gt(infiles.size(), 0);
281 if(format == CMDLINE) {
282 // Adapt sequence strings to stringstreams open for input
283 stringstream *ss = new stringstream();
284 for(size_t i = 0; i < infiles.size(); i++) {
285 (*ss) << ">" << i << endl << infiles[i] << endl;
287 FileBuf *fb = new FileBuf(ss);
290 assert(fb->get() == '>');
291 ASSERT_ONLY(fb->reset());
295 // Adapt sequence files to ifstreams
296 for(size_t i = 0; i < infiles.size(); i++) {
297 FILE *f = fopen(infiles[i].c_str(), "r");
299 cerr << "Error: could not open "<< infiles[i] << endl;
302 FileBuf *fb = new FileBuf(f);
305 assert(fb->get() == '>');
306 ASSERT_ONLY(fb->reset());
311 // Vector for the ordered list of "records" comprising the input
312 // sequences. A record represents a stretch of unambiguous
313 // characters in one of the input sequences.
314 vector<RefRecord> szs;
315 vector<uint32_t> plens;
316 std::pair<size_t, size_t> sztot;
318 if(verbose) cout << "Reading reference sizes" << endl;
319 Timer _t(cout, " Time reading reference sizes: ", verbose);
320 if(!reverse && (writeRef || justRef)) {
321 // For forward reference, dump it to .3.ebwt and .4.ebwt
323 string file3 = outfile + ".3.ebwt";
324 string file4 = outfile + ".4.ebwt";
325 // Open output stream for the '.3.ebwt' file which will
326 // hold the size records.
327 ofstream fout3(file3.c_str(), ios::binary);
329 cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
330 << "Please make sure the directory exists and that permissions allow writing by" << endl
331 << "Bowtie." << endl;
334 BitpairOutFileBuf bpout(file4.c_str());
335 // Read in the sizes of all the unambiguous stretches of
336 // the genome into a vector of RefRecords. The input
337 // streams are reset once it's done.
338 writeU32(fout3, 1, bigEndian); // endianness sentinel
340 refparams.color = false;
341 // Make sure the .3.ebwt and .4.ebwt files contain
342 // nucleotides; not colors
344 std::pair<size_t, size_t> sztot2 =
345 fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
346 refparams.color = true;
347 writeU32(fout3, szs.size(), bigEndian); // write # records
348 for(size_t i = 0; i < szs.size(); i++) {
349 szs[i].write(fout3, bigEndian);
353 // Now read in the colorspace size records; these are
354 // the ones that were indexed
356 sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
357 assert_geq(numSeqs, numSeqs2);
358 //assert_eq(sztot2.second, sztot.second + numSeqs);
361 sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
362 writeU32(fout3, szs.size(), bigEndian); // write # records
363 for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
365 if(sztot.first == 0) {
366 cerr << "Error: No unambiguous stretches of characters in the input. Aborting..." << endl;
369 assert_gt(sztot.first, 0);
370 assert_gt(sztot.second, 0);
375 BitPairReference bpr(
376 outfile, // ebwt basename
377 color, // expect color?
378 true, // sanity check?
379 &infiles,// files to check against
380 NULL, // sequences to check against
381 format == CMDLINE, // whether infiles contains strings
382 true, // load sequence?
383 false, // use memory-mapped files
384 false, // use shared memory
385 false, // sweep through memory-mapped memory
386 false, // be talkative
387 false); // be talkative
391 // Read in the sizes of all the unambiguous stretches of the
392 // genome into a vector of RefRecords
394 sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
396 if(refparams.color) {
397 refparams.color = false;
398 vector<RefRecord> szs2;
399 vector<uint32_t> plens2;
401 std::pair<size_t, size_t> sztot2 =
402 fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
403 assert_leq(numSeqs, numSeqs2);
404 // One less color than base
405 //assert_geq(sztot2.second, sztot.second + numSeqs);
406 refparams.color = true;
412 assert_gt(sztot.first, 0);
413 assert_gt(sztot.second, 0);
414 assert_gt(szs.size(), 0);
415 // Construct Ebwt from input strings and parameters
416 Ebwt<TStr> ebwt(refparams.color ? 1 : 0,
419 offRate, // suffix-array sampling rate
420 -1, // ISA sampling rate
421 ftabChars, // number of chars in initial arrow-pair calc
422 outfile, // basename for .?.ebwt files
424 !entireSA, // useBlockwise
425 bmax, // block size for blockwise SA builder
426 bmaxMultSqrt, // block size as multiplier of sqrt(len)
427 bmaxDivN, // block size as divisor of len
428 noDc? 0 : dcv,// difference-cover period
429 is, // list of input streams
430 szs, // list of reference sizes
431 plens, // list of not-all-gap reference sequence lengths
432 sztot.first, // total size of all unambiguous ref chars
433 refparams, // reference read-in parameters
434 seed, // pseudo-random number generator seed
435 -1, // override offRate
436 -1, // override isaRate
437 verbose, // be talkative
438 autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
439 sanityCheck); // verify results and internal consistency
440 // Note that the Ebwt is *not* resident in memory at this time. To
441 // load it into memory, call ebwt.loadIntoMemory()
443 // Print Ebwt's vital stats
444 ebwt.eh().print(cout);
447 // Try restoring the original string (if there were
448 // multiple texts, what we'll get back is the joined,
449 // padded string, not a list)
451 refparams.color ? 1 : 0,
455 TStr s2; ebwt.restore(s2);
456 ebwt.evictFromMemory();
458 TStr joinedss = Ebwt<TStr>::join(
459 is, // list of input streams
460 szs, // list of reference sizes
461 sztot.first, // total size of all unambiguous ref chars
462 refparams, // reference read-in parameters
463 seed); // pseudo-random number generator seed
464 if(refparams.reverse == REF_READ_REVERSE) {
465 reverseInPlace(joinedss);
467 assert_eq(length(joinedss), length(s2));
468 assert_eq(joinedss, s2);
471 if(length(s2) < 1000) {
472 cout << "Passed restore check: " << s2 << endl;
474 cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
480 static const char *argv0 = NULL;
484 * main function. Parses command-line arguments.
486 int bowtie_build(int argc, const char **argv) {
488 // Reset all global state, including getopt state
493 vector<string> infiles;
496 parseOptions(argc, argv);
499 cout << argv0 << " version " << BOWTIE_VERSION << endl;
500 if(sizeof(void*) == 4) {
501 cout << "32-bit" << endl;
502 } else if(sizeof(void*) == 8) {
503 cout << "64-bit" << endl;
505 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
507 cout << "Built on " << BUILD_HOST << endl;
508 cout << BUILD_TIME << endl;
509 cout << "Compiler: " << COMPILER_VERSION << endl;
510 cout << "Options: " << COMPILER_OPTIONS << endl;
511 cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
513 << ", " << sizeof(long) << ", " << sizeof(long long)
514 << ", " << sizeof(void *) << ", " << sizeof(size_t)
515 << ", " << sizeof(off_t) << "}" << endl;
519 // Get input filename
521 cerr << "No input sequence or sequence file specified!" << endl;
525 infile = argv[optind++];
527 // Get output filename
529 cerr << "No output file specified!" << endl;
533 outfile = argv[optind++];
535 tokenize(infile, ",", infiles);
536 if(infiles.size() < 1) {
537 cerr << "Tokenized input file list was empty!" << endl;
542 // Optionally summarize
544 cout << "Settings:" << endl
545 << " Output files: \"" << outfile << ".*.ebwt\"" << endl
546 << " Line rate: " << lineRate << " (line is " << (1<<lineRate) << " bytes)" << endl
547 << " Lines per side: " << linesPerSide << " (side is " << ((1<<lineRate)*linesPerSide) << " bytes)" << endl
548 << " Offset rate: " << offRate << " (one in " << (1<<offRate) << ")" << endl
549 << " FTable chars: " << ftabChars << endl
550 << " Strings: " << (packed? "packed" : "unpacked") << endl
552 if(bmax == 0xffffffff) {
553 cout << " Max bucket size: default" << endl;
555 cout << " Max bucket size: " << bmax << endl;
557 if(bmaxMultSqrt == 0xffffffff) {
558 cout << " Max bucket size, sqrt multiplier: default" << endl;
560 cout << " Max bucket size, sqrt multiplier: " << bmaxMultSqrt << endl;
562 if(bmaxDivN == 0xffffffff) {
563 cout << " Max bucket size, len divisor: default" << endl;
565 cout << " Max bucket size, len divisor: " << bmaxDivN << endl;
567 cout << " Difference-cover sample period: " << dcv << endl;
568 cout << " Endianness: " << (bigEndian? "big":"little") << endl
569 << " Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl
570 << " Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
572 cout << " Assertions: disabled" << endl;
574 cout << " Assertions: enabled" << endl;
576 cout << " Random seed: " << seed << endl;
577 cout << " Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl;
578 cout << "Input files DNA, " << file_format_names[format] << ":" << endl;
579 for(size_t i = 0; i < infiles.size(); i++) {
580 cout << " " << infiles[i] << endl;
583 // Seed random number generator
586 Timer timer(cout, "Total time for call to driver() for forward index: ", verbose);
589 driver<String<Dna, Alloc<> > >(infile, infiles, outfile);
590 } catch(bad_alloc& e) {
592 cerr << "Switching to a packed string representation." << endl;
600 driver<String<Dna, Packed<Alloc<> > > >(infile, infiles, outfile);
605 Timer timer(cout, "Total time for backward call to driver() for mirror index: ", verbose);
608 driver<String<Dna, Alloc<> > >(infile, infiles, outfile + ".rev", true);
609 } catch(bad_alloc& e) {
611 cerr << "Switching to a packed string representation." << endl;
619 driver<String<Dna, Packed<Alloc<> > > >(infile, infiles, outfile + ".rev", true);
623 } catch(std::exception& e) {
625 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
631 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";