6 #include <seqan/find.h>
8 #include "assert_helpers.h"
9 #include "endian_swap.h"
11 #include "reference.h"
14 using namespace seqan;
16 static bool showVersion = false; // just print version and quit?
17 int verbose = 0; // be talkative
18 static int names_only = 0; // just print the sequence names in the index
19 static int summarize_only = 0; // just print summary of index and quit
20 static int across = 60; // number of characters across in FASTA output
21 static bool extra = false; // print extra summary info
22 static bool exclAllGaps = false; // print extra summary info
23 static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
25 static const char *short_options = "vhnsea:";
34 static struct option long_options[] = {
35 {(char*)"verbose", no_argument, 0, 'v'},
36 {(char*)"version", no_argument, 0, ARG_VERSION},
37 {(char*)"usage", no_argument, 0, ARG_USAGE},
38 {(char*)"extra", no_argument, 0, ARG_EXTRA},
39 {(char*)"excl-ambig",no_argument, 0, ARG_EXCL_AMBIG},
40 {(char*)"names", no_argument, 0, 'n'},
41 {(char*)"summary", no_argument, 0, 's'},
42 {(char*)"help", no_argument, 0, 'h'},
43 {(char*)"across", required_argument, 0, 'a'},
44 {(char*)"ebwt-ref", no_argument, 0, 'e'},
45 {(char*)0, 0, 0, 0} // terminator
49 * Print a summary usage message to the provided output stream.
51 static void printUsage(ostream& out) {
53 << "Usage: bowtie-inspect [options]* <ebwt_base>" << endl
54 << " <ebwt_base> ebwt filename minus trailing .1.ebwt/.2.ebwt" << endl
56 << " By default, prints FASTA records of the indexed nucleotide sequences to" << endl
57 << " standard out. With -n, just prints names. With -s, just prints a summary of" << endl
58 << " the index parameters and sequences. With -e, preserves colors if applicable." << endl
61 << " -a/--across <int> Number of characters across in FASTA output (default: 60)" << endl
62 << " -n/--names Print reference sequence names only" << endl
63 << " -s/--summary Print summary incl. ref names, lengths, index properties" << endl
64 << " -e/--ebwt-ref Reconstruct reference from ebwt (slow, preserves colors)" << endl
65 << " -v/--verbose Verbose output (for debugging)" << endl
66 << " -h/--help print detailed description of tool and its options" << endl
67 << " --help print this usage message" << endl
72 * Parse an int out of optarg and enforce that it be at least 'lower';
73 * if it is less than 'lower', than output the given error message and
74 * exit with an error and a usage message.
76 static int parseInt(int lower, const char *errmsg) {
79 l = strtol(optarg, &endPtr, 10);
82 cerr << errmsg << endl;
88 cerr << errmsg << endl;
95 * Read command-line arguments
97 static void parseOptions(int argc, char **argv) {
101 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
102 switch (next_option) {
108 case 'v': verbose = true; break;
109 case ARG_VERSION: showVersion = true; break;
110 case ARG_EXCL_AMBIG: exclAllGaps = true; break;
111 case ARG_EXTRA: extra = true; break;
112 case 'e': refFromEbwt = true; break;
113 case 'n': names_only = true; break;
114 case 's': summarize_only = true; break;
115 case 'a': across = parseInt(-1, "-a/--across arg must be at least 1"); break;
116 case -1: break; /* Done with options. */
118 if (long_options[option_index].flag != 0)
124 } while(next_option != -1);
127 void print_fasta_record(ostream& fout,
128 const string& defline,
132 fout << defline << endl;
136 while (i + across < seq.length())
138 fout << seq.substr(i, across) << endl;
141 if (i < seq.length())
142 fout << seq.substr(i) << endl;
149 * Given output stream, name and length, print a string of Ns with the
150 * appropriate number of columns.
152 void print_alln_ref_sequence(
157 fout << ">" << name << "\n";
159 for(size_t i = 0; i < len; i += across) {
160 while(j < len && j < i+across) {
169 * Given output stream, BitPairReference, reference index, name and
170 * length, print the whole nucleotide reference with the appropriate
173 void print_ref_sequence(
175 BitPairReference& ref,
180 bool newlines = across > 0;
181 int myacross = across > 0 ? across : 60;
182 size_t incr = myacross * 1000;
183 uint32_t *buf = new uint32_t[(incr + 128)/4];
184 fout << ">" << name << "\n";
185 for(size_t i = 0; i < len; i += incr) {
186 size_t amt = min(incr, len-i);
187 assert_leq(amt, incr);
188 int off = ref.getStretch(buf, refi, i, amt);
189 uint8_t *cb = ((uint8_t*)buf) + off;
190 for(size_t j = 0; j < amt; j++) {
191 if(newlines && j > 0 && (j % myacross) == 0) fout << "\n";
192 assert_range(0, 4, (int)cb[j]);
193 fout << "ACGTN"[(int)cb[j]];
201 * Create a BitPairReference encapsulating the reference portion of the
202 * index at the given basename. Iterate through the reference
203 * sequences, sending each one to print_ref_sequence to print.
205 void print_ref_sequences(
208 const vector<string>& refnames,
209 const uint32_t* plen,
210 const string& adjustedEbwtFileBase)
212 BitPairReference ref(
213 adjustedEbwtFileBase, // input basename
214 color, // true -> expect colorspace reference
215 false, // sanity-check reference
218 false, // infiles are sequences
219 true, // load sequence
221 false, // use shared memory
222 false, // sweep mm-mapped ref
223 verbose, // be talkative
224 verbose); // be talkative at startup
225 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
226 for(size_t i = 0; i < ref.numRefs(); i++) {
227 if(ref.isAllGaps(i) && !exclAllGaps) {
228 print_alln_ref_sequence(
242 assert_eq(refnames.size(), ref.numNonGapRefs());
243 for(size_t i = 0; i < ref.numNonGapRefs(); i++) {
249 plen[i] + (color ? 1 : 0));
255 * Given an index, reconstruct the reference by LF mapping through the
258 template<typename TStr>
259 void print_index_sequences(
262 const BitPairReference& refs)
264 vector<string>* refnames = &(ebwt.refnames());
267 ebwt.restore(cat_ref);
269 uint32_t curr_ref = 0xffffffff;
270 string curr_ref_seq = "";
271 uint32_t curr_ref_len = 0xffffffff;
272 uint32_t last_text_off = 0;
273 size_t orig_len = seqan::length(cat_ref);
274 uint32_t tlen = 0xffffffff;
276 for(size_t i = 0; i < orig_len; i++) {
277 uint32_t tidx = 0xffffffff;
278 uint32_t textoff = 0xffffffff;
281 ebwt.joinedToTextOff(1 /* qlen */, i, tidx, textoff, tlen);
283 if (tidx != 0xffffffff && textoff < tlen)
285 if (curr_ref != tidx)
287 if (curr_ref != 0xffffffff)
289 // Add trailing gaps, if any exist
290 if(curr_ref_seq.length() < curr_ref_len) {
291 curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
293 print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
302 uint32_t textoff_adj = textoff;
303 if(first && textoff > 0) textoff_adj++;
304 if (textoff_adj - last_text_off > 1)
305 curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
307 curr_ref_seq.push_back(getValue(cat_ref,i));
308 last_text_off = textoff;
312 if (curr_ref < refnames->size())
314 // Add trailing gaps, if any exist
315 if(curr_ref_seq.length() < curr_ref_len) {
316 curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
318 print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
323 static char *argv0 = NULL;
325 void print_index_sequence_names(const string& fname, ostream& fout)
327 vector<string> p_refnames;
328 readEbwtRefnames(fname, p_refnames);
329 for(size_t i = 0; i < p_refnames.size(); i++) {
330 cout << p_refnames[i] << endl;
334 typedef Ebwt<String<Dna, Packed<Alloc<> > > > TPackedEbwt;
337 * Print a short summary of what's in the index and its flags.
339 void print_index_summary(
342 const BitPairReference& refs)
344 int32_t flags = readFlags(fname);
345 int32_t flagsr = readFlags(fname + ".rev");
346 bool color = readEbwtColor(fname);
347 bool entireReverse = readEntireReverse(fname + ".rev");
350 color, // index is colorspace
351 -1, // don't require entire reverse
352 true, // index is for the forward direction
353 -1, // offrate (-1 = index default)
355 false, // use memory-mapped IO
356 false, // use shared memory
357 false, // sweep memory-mapped memory
359 //false, // load SA sample?
360 NULL, // no reference map
361 verbose, // be talkative?
362 verbose, // be talkative at startup?
363 false, // pass up memory exceptions?
364 false); // sanity check?
365 vector<string> p_refnames;
366 readEbwtRefnames(fname, p_refnames);
368 cout << "Flags" << '\t' << (-flags) << endl;
369 cout << "Reverse flags" << '\t' << (-flagsr) << endl;
371 cout << "Colorspace" << '\t' << (color ? "1" : "0") << endl;
373 cout << "Concat then reverse" << '\t' << (entireReverse ? "1" : "0") << endl;
374 cout << "Reverse then concat" << '\t' << (entireReverse ? "0" : "1") << endl;
375 cout << "nPat" << '\t' << ebwt.nPat() << endl;
376 cout << "refnames.size()" << '\t' << p_refnames.size() << endl;
377 cout << "refs.numRefs()" << '\t' << refs.numRefs() << endl;
378 cout << "refs.numNonGapRefs()" << '\t' << refs.numNonGapRefs() << endl;
380 cout << "SA-Sample" << "\t1 in " << (1 << ebwt.eh().offRate()) << endl;
381 cout << "FTab-Chars" << '\t' << ebwt.eh().ftabChars() << endl;
382 for(size_t i = 0; i < ebwt.nPat(); i++) {
383 cout << "Sequence-" << (i+1)
384 << '\t' << p_refnames[refs.expandIdx(i)]
385 << '\t' << (ebwt.plen()[i] + (color ? 1 : 0))
389 cout << "RefRecords:\n";
390 for(size_t i = 0; i < refs.refRecords().size(); i++) {
391 RefRecord r = refs.refRecords()[i];
392 cout << r.first << "\t(" << r.off << ", " << r.len << ")" << endl;
398 const string& ebwtFileBase,
402 string adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
404 print_index_sequence_names(adjustedEbwtFileBase, cout);
407 bool color = readEbwtColor(adjustedEbwtFileBase);
408 BitPairReference refs(
409 adjustedEbwtFileBase,
415 false, // don't load sequence (yet)
422 print_index_summary(adjustedEbwtFileBase, cout, refs);
424 // Initialize Ebwt object
426 adjustedEbwtFileBase,
427 color, // index is colorspace
428 -1, // don't care about entire-reverse
429 true, // index is for the forward direction
430 -1, // offrate (-1 = index default)
432 false, // use memory-mapped IO
433 false, // use shared memory
434 false, // sweep memory-mapped memory
436 //true, // load SA sample?
437 NULL, // no reference map
438 verbose, // be talkative?
439 verbose, // be talkative at startup?
440 false, // pass up memory exceptions?
441 false); // sanity check?
442 // Load whole index into memory
444 ebwt.loadIntoMemory(-1, -1, true, false);
445 print_index_sequences(cout, ebwt, refs);
447 vector<string> refnames;
448 readEbwtRefnames(adjustedEbwtFileBase, refnames);
451 readEbwtColor(ebwtFileBase),
454 adjustedEbwtFileBase);
456 // Evict any loaded indexes from memory
457 if(ebwt.isInMemory()) {
458 ebwt.evictFromMemory();
464 * main function. Parses command-line arguments.
466 int main(int argc, char **argv) {
468 string ebwtFile; // read serialized Ebwt from this file
469 string query; // read query string(s) from this file
470 vector<string> queries;
471 string outfile; // write query results to this file
473 parseOptions(argc, argv);
475 cout << argv0 << " version " << BOWTIE_VERSION << endl;
476 if(sizeof(void*) == 4) {
477 cout << "32-bit" << endl;
478 } else if(sizeof(void*) == 8) {
479 cout << "64-bit" << endl;
481 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
483 cout << "Built on " << BUILD_HOST << endl;
484 cout << BUILD_TIME << endl;
485 cout << "Compiler: " << COMPILER_VERSION << endl;
486 cout << "Options: " << COMPILER_OPTIONS << endl;
487 cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
489 << ", " << sizeof(long) << ", " << sizeof(long long)
490 << ", " << sizeof(void *) << ", " << sizeof(size_t)
491 << ", " << sizeof(off_t) << "}" << endl;
495 // Get input filename
497 cerr << "No index name given!" << endl;
501 ebwtFile = argv[optind++];
503 // Optionally summarize
505 cout << "Input ebwt file: \"" << ebwtFile << "\"" << endl;
506 cout << "Output file: \"" << outfile << "\"" << endl;
507 cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
509 cout << "Assertions: disabled" << endl;
511 cout << "Assertions: enabled" << endl;
514 driver(ebwtFile, query);
516 } catch(std::exception& e) {
518 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
524 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";