Imported Upstream version 0.12.7
[bowtie.git] / bowtie_inspect.cpp
1 #include <string>
2 #include <vector>
3 #include <iostream>
4 #include <getopt.h>
5 #include <stdexcept>
6 #include <seqan/find.h>
7
8 #include "assert_helpers.h"
9 #include "endian_swap.h"
10 #include "ebwt.h"
11 #include "reference.h"
12
13 using namespace std;
14 using namespace seqan;
15
16 static bool showVersion = false; // just print version and quit?
17 int verbose             = 0;  // be talkative
18 static int names_only   = 0;  // just print the sequence names in the index
19 static int summarize_only = 0; // just print summary of index and quit
20 static int across       = 60; // number of characters across in FASTA output
21 static bool extra       = false; // print extra summary info
22 static bool exclAllGaps = false; // print extra summary info
23 static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
24
25 static const char *short_options = "vhnsea:";
26
27 enum {
28         ARG_VERSION = 256,
29         ARG_USAGE,
30         ARG_EXTRA,
31         ARG_EXCL_AMBIG
32 };
33
34 static struct option long_options[] = {
35         {(char*)"verbose",  no_argument,        0, 'v'},
36         {(char*)"version",  no_argument,        0, ARG_VERSION},
37         {(char*)"usage",    no_argument,        0, ARG_USAGE},
38         {(char*)"extra",    no_argument,        0, ARG_EXTRA},
39         {(char*)"excl-ambig",no_argument,       0, ARG_EXCL_AMBIG},
40         {(char*)"names",    no_argument,        0, 'n'},
41         {(char*)"summary",  no_argument,        0, 's'},
42         {(char*)"help",     no_argument,        0, 'h'},
43         {(char*)"across",   required_argument,  0, 'a'},
44         {(char*)"ebwt-ref", no_argument,        0, 'e'},
45         {(char*)0, 0, 0, 0} // terminator
46 };
47
48 /**
49  * Print a summary usage message to the provided output stream.
50  */
51 static void printUsage(ostream& out) {
52         out
53         << "Usage: bowtie-inspect [options]* <ebwt_base>" << endl
54         << "  <ebwt_base>        ebwt filename minus trailing .1.ebwt/.2.ebwt" << endl
55         << endl
56         << "  By default, prints FASTA records of the indexed nucleotide sequences to" << endl
57         << "  standard out.  With -n, just prints names.  With -s, just prints a summary of" << endl
58         << "  the index parameters and sequences.  With -e, preserves colors if applicable." << endl
59         << endl
60         << "Options:" << endl
61         << "  -a/--across <int>  Number of characters across in FASTA output (default: 60)" << endl
62         << "  -n/--names         Print reference sequence names only" << endl
63         << "  -s/--summary       Print summary incl. ref names, lengths, index properties" << endl
64         << "  -e/--ebwt-ref      Reconstruct reference from ebwt (slow, preserves colors)" << endl
65         << "  -v/--verbose       Verbose output (for debugging)" << endl
66         << "  -h/--help          print detailed description of tool and its options" << endl
67         << "  --help             print this usage message" << endl
68         ;
69 }
70
71 /**
72  * Parse an int out of optarg and enforce that it be at least 'lower';
73  * if it is less than 'lower', than output the given error message and
74  * exit with an error and a usage message.
75  */
76 static int parseInt(int lower, const char *errmsg) {
77         long l;
78         char *endPtr= NULL;
79         l = strtol(optarg, &endPtr, 10);
80         if (endPtr != NULL) {
81                 if (l < lower) {
82                         cerr << errmsg << endl;
83                         printUsage(cerr);
84                         throw 1;
85                 }
86                 return (int32_t)l;
87         }
88         cerr << errmsg << endl;
89         printUsage(cerr);
90         throw 1;
91         return -1;
92 }
93
94 /**
95  * Read command-line arguments
96  */
97 static void parseOptions(int argc, char **argv) {
98         int option_index = 0;
99         int next_option;
100         do {
101                 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
102                 switch (next_option) {
103                         case ARG_USAGE:
104                         case 'h':
105                                 printUsage(cout);
106                                 throw 0;
107                                 break;
108                         case 'v': verbose = true; break;
109                         case ARG_VERSION: showVersion = true; break;
110                         case ARG_EXCL_AMBIG: exclAllGaps = true; break;
111                         case ARG_EXTRA: extra = true; break;
112                         case 'e': refFromEbwt = true; break;
113                         case 'n': names_only = true; break;
114                         case 's': summarize_only = true; break;
115                         case 'a': across = parseInt(-1, "-a/--across arg must be at least 1"); break;
116                         case -1: break; /* Done with options. */
117                         case 0:
118                                 if (long_options[option_index].flag != 0)
119                                         break;
120                         default:
121                                 printUsage(cerr);
122                                 throw 1;
123                 }
124         } while(next_option != -1);
125 }
126
127 void print_fasta_record(ostream& fout,
128                                                 const string& defline,
129                                                 const string& seq)
130 {
131         fout << ">";
132         fout << defline << endl;
133
134         if(across > 0) {
135                 size_t i = 0;
136                 while (i + across < seq.length())
137                 {
138                         fout << seq.substr(i, across) << endl;
139                         i += across;
140                 }
141                 if (i < seq.length())
142                         fout << seq.substr(i) << endl;
143         } else {
144                 fout << seq << endl;
145         }
146 }
147
148 /**
149  * Given output stream, name and length, print a string of Ns with the
150  * appropriate number of columns.
151  */
152 void print_alln_ref_sequence(
153         ostream& fout,
154         const string& name,
155         size_t len)
156 {
157         fout << ">" << name << "\n";
158         size_t j = 0;
159         for(size_t i = 0; i < len; i += across) {
160                 while(j < len && j < i+across) {
161                         fout << 'N';
162                         j++;
163                 }
164                 fout << "\n";
165         }
166 }
167
168 /**
169  * Given output stream, BitPairReference, reference index, name and
170  * length, print the whole nucleotide reference with the appropriate
171  * number of columns.
172  */
173 void print_ref_sequence(
174         ostream& fout,
175         BitPairReference& ref,
176         const string& name,
177         size_t refi,
178         size_t len)
179 {
180         bool newlines = across > 0;
181         int myacross = across > 0 ? across : 60;
182         size_t incr = myacross * 1000;
183         uint32_t *buf = new uint32_t[(incr + 128)/4];
184         fout << ">" << name << "\n";
185         for(size_t i = 0; i < len; i += incr) {
186                 size_t amt = min(incr, len-i);
187                 assert_leq(amt, incr);
188                 int off = ref.getStretch(buf, refi, i, amt);
189                 uint8_t *cb = ((uint8_t*)buf) + off;
190                 for(size_t j = 0; j < amt; j++) {
191                         if(newlines && j > 0 && (j % myacross) == 0) fout << "\n";
192                         assert_range(0, 4, (int)cb[j]);
193                         fout << "ACGTN"[(int)cb[j]];
194                 }
195                 fout << "\n";
196         }
197         delete buf;
198 }
199
200 /**
201  * Create a BitPairReference encapsulating the reference portion of the
202  * index at the given basename.  Iterate through the reference
203  * sequences, sending each one to print_ref_sequence to print.
204  */
205 void print_ref_sequences(
206         ostream& fout,
207         bool color,
208         const vector<string>& refnames,
209         const uint32_t* plen,
210         const string& adjustedEbwtFileBase)
211 {
212         BitPairReference ref(
213                 adjustedEbwtFileBase, // input basename
214                 color,                // true -> expect colorspace reference
215                 false,                // sanity-check reference
216                 NULL,                 // infiles
217                 NULL,                 // originals
218                 false,                // infiles are sequences
219                 true,                 // load sequence
220                 false,                // memory-map
221                 false,                // use shared memory
222                 false,                // sweep mm-mapped ref
223                 verbose,              // be talkative
224                 verbose);             // be talkative at startup
225 #ifdef ACCOUNT_FOR_ALL_GAP_REFS
226         for(size_t i = 0; i < ref.numRefs(); i++) {
227                 if(ref.isAllGaps(i) && !exclAllGaps) {
228                         print_alln_ref_sequence(
229                                 fout,
230                                 refnames[i],
231                                 ref.len(i));
232                 } else {
233                         print_ref_sequence(
234                                 fout,
235                                 ref,
236                                 refnames[i],
237                                 ref.shrinkIdx(i),
238                                 ref.len(i));
239                 }
240         }
241 #else
242         assert_eq(refnames.size(), ref.numNonGapRefs());
243         for(size_t i = 0; i < ref.numNonGapRefs(); i++) {
244                 print_ref_sequence(
245                         fout,
246                         ref,
247                         refnames[i],
248                         i,
249                         plen[i] + (color ? 1 : 0));
250         }
251 #endif
252 }
253
254 /**
255  * Given an index, reconstruct the reference by LF mapping through the
256  * entire thing.
257  */
258 template<typename TStr>
259 void print_index_sequences(
260         ostream& fout,
261         Ebwt<TStr>& ebwt,
262         const BitPairReference& refs)
263 {
264         vector<string>* refnames = &(ebwt.refnames());
265
266         TStr cat_ref;
267         ebwt.restore(cat_ref);
268
269         uint32_t curr_ref = 0xffffffff;
270         string curr_ref_seq = "";
271         uint32_t curr_ref_len = 0xffffffff;
272         uint32_t last_text_off = 0;
273         size_t orig_len = seqan::length(cat_ref);
274         uint32_t tlen = 0xffffffff;
275         bool first = true;
276         for(size_t i = 0; i < orig_len; i++) {
277                 uint32_t tidx = 0xffffffff;
278                 uint32_t textoff = 0xffffffff;
279                 tlen = 0xffffffff;
280
281                 ebwt.joinedToTextOff(1 /* qlen */, i, tidx, textoff, tlen);
282
283                 if (tidx != 0xffffffff && textoff < tlen)
284                 {
285                         if (curr_ref != tidx)
286                         {
287                                 if (curr_ref != 0xffffffff)
288                                 {
289                                         // Add trailing gaps, if any exist
290                                         if(curr_ref_seq.length() < curr_ref_len) {
291                                                 curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
292                                         }
293                                         print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
294                                 }
295                                 curr_ref = tidx;
296                                 curr_ref_seq = "";
297                                 curr_ref_len = tlen;
298                                 last_text_off = 0;
299                                 first = true;
300                         }
301
302                         uint32_t textoff_adj = textoff;
303                         if(first && textoff > 0) textoff_adj++;
304                         if (textoff_adj - last_text_off > 1)
305                                 curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
306
307                         curr_ref_seq.push_back(getValue(cat_ref,i));
308                         last_text_off = textoff;
309                         first = false;
310                 }
311         }
312         if (curr_ref < refnames->size())
313         {
314                 // Add trailing gaps, if any exist
315                 if(curr_ref_seq.length() < curr_ref_len) {
316                         curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
317                 }
318                 print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
319         }
320
321 }
322
323 static char *argv0 = NULL;
324
325 void print_index_sequence_names(const string& fname, ostream& fout)
326 {
327         vector<string> p_refnames;
328         readEbwtRefnames(fname, p_refnames);
329         for(size_t i = 0; i < p_refnames.size(); i++) {
330                 cout << p_refnames[i] << endl;
331         }
332 }
333
334 typedef Ebwt<String<Dna, Packed<Alloc<> > > > TPackedEbwt;
335
336 /**
337  * Print a short summary of what's in the index and its flags.
338  */
339 void print_index_summary(
340         const string& fname,
341         ostream& fout,
342         const BitPairReference& refs)
343 {
344         int32_t flags = readFlags(fname);
345         int32_t flagsr = readFlags(fname + ".rev");
346         bool color = readEbwtColor(fname);
347         bool entireReverse = readEntireReverse(fname + ".rev");
348         TPackedEbwt ebwt(
349                 fname,
350                 color,                // index is colorspace
351                 -1,                   // don't require entire reverse
352                 true,                 // index is for the forward direction
353                 -1,                   // offrate (-1 = index default)
354                 -1,
355                 false,                // use memory-mapped IO
356                 false,                // use shared memory
357                 false,                // sweep memory-mapped memory
358                 true,                 // load names?
359                 //false,                // load SA sample?
360                 NULL,                 // no reference map
361                 verbose,              // be talkative?
362                 verbose,              // be talkative at startup?
363                 false,                // pass up memory exceptions?
364                 false);               // sanity check?
365         vector<string> p_refnames;
366         readEbwtRefnames(fname, p_refnames);
367         if(extra) {
368                 cout << "Flags" << '\t' << (-flags) << endl;
369                 cout << "Reverse flags" << '\t' << (-flagsr) << endl;
370         }
371         cout << "Colorspace" << '\t' << (color ? "1" : "0") << endl;
372         if(extra) {
373                 cout << "Concat then reverse" << '\t' << (entireReverse ? "1" : "0") << endl;
374                 cout << "Reverse then concat" << '\t' << (entireReverse ? "0" : "1") << endl;
375                 cout << "nPat" << '\t' << ebwt.nPat() << endl;
376                 cout << "refnames.size()" << '\t' << p_refnames.size() << endl;
377                 cout << "refs.numRefs()" << '\t' << refs.numRefs() << endl;
378                 cout << "refs.numNonGapRefs()" << '\t' << refs.numNonGapRefs() << endl;
379         }
380         cout << "SA-Sample" << "\t1 in " << (1 << ebwt.eh().offRate()) << endl;
381         cout << "FTab-Chars" << '\t' << ebwt.eh().ftabChars() << endl;
382         for(size_t i = 0; i < ebwt.nPat(); i++) {
383                 cout << "Sequence-" << (i+1)
384                      << '\t' << p_refnames[refs.expandIdx(i)]
385                      << '\t' << (ebwt.plen()[i] + (color ? 1 : 0))
386                      << endl;
387         }
388         if(extra) {
389                 cout << "RefRecords:\n";
390                 for(size_t i = 0; i < refs.refRecords().size(); i++) {
391                         RefRecord r = refs.refRecords()[i];
392                         cout << r.first << "\t(" << r.off << ", " << r.len << ")" << endl;
393                 }
394         }
395 }
396
397 static void driver(
398         const string& ebwtFileBase,
399         const string& query)
400 {
401         // Adjust
402         string adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
403         if (names_only) {
404                 print_index_sequence_names(adjustedEbwtFileBase, cout);
405                 return;
406         }
407         bool color = readEbwtColor(adjustedEbwtFileBase);
408         BitPairReference refs(
409                 adjustedEbwtFileBase,
410                 color,
411                 false,
412                 NULL,
413                 NULL,
414                 false,
415                 false, // don't load sequence (yet)
416                 false,
417                 false,
418                 false, // mmSweep
419                 verbose,
420                 verbose);
421         if(summarize_only) {
422                 print_index_summary(adjustedEbwtFileBase, cout, refs);
423         } else {
424                 // Initialize Ebwt object
425                 TPackedEbwt ebwt(
426                         adjustedEbwtFileBase,
427                         color,                // index is colorspace
428                         -1,                   // don't care about entire-reverse
429                         true,                 // index is for the forward direction
430                         -1,                   // offrate (-1 = index default)
431                         -1,
432                         false,                // use memory-mapped IO
433                         false,                // use shared memory
434                         false,                // sweep memory-mapped memory
435                         true,                 // load names?
436                         //true,                 // load SA sample?
437                         NULL,                 // no reference map
438                         verbose,              // be talkative?
439                         verbose,              // be talkative at startup?
440                         false,                // pass up memory exceptions?
441                         false);               // sanity check?
442                 // Load whole index into memory
443                 if(refFromEbwt) {
444                         ebwt.loadIntoMemory(-1, -1, true, false);
445                         print_index_sequences(cout, ebwt, refs);
446                 } else {
447                         vector<string> refnames;
448                         readEbwtRefnames(adjustedEbwtFileBase, refnames);
449                         print_ref_sequences(
450                                 cout,
451                                 readEbwtColor(ebwtFileBase),
452                                 refnames,
453                                 ebwt.plen(),
454                                 adjustedEbwtFileBase);
455                 }
456                 // Evict any loaded indexes from memory
457                 if(ebwt.isInMemory()) {
458                         ebwt.evictFromMemory();
459                 }
460         }
461 }
462
463 /**
464  * main function.  Parses command-line arguments.
465  */
466 int main(int argc, char **argv) {
467         try {
468                 string ebwtFile;  // read serialized Ebwt from this file
469                 string query;   // read query string(s) from this file
470                 vector<string> queries;
471                 string outfile; // write query results to this file
472                 argv0 = argv[0];
473                 parseOptions(argc, argv);
474                 if(showVersion) {
475                         cout << argv0 << " version " << BOWTIE_VERSION << endl;
476                         if(sizeof(void*) == 4) {
477                                 cout << "32-bit" << endl;
478                         } else if(sizeof(void*) == 8) {
479                                 cout << "64-bit" << endl;
480                         } else {
481                                 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
482                         }
483                         cout << "Built on " << BUILD_HOST << endl;
484                         cout << BUILD_TIME << endl;
485                         cout << "Compiler: " << COMPILER_VERSION << endl;
486                         cout << "Options: " << COMPILER_OPTIONS << endl;
487                         cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
488                                  << sizeof(int)
489                                  << ", " << sizeof(long) << ", " << sizeof(long long)
490                                  << ", " << sizeof(void *) << ", " << sizeof(size_t)
491                                  << ", " << sizeof(off_t) << "}" << endl;
492                         return 0;
493                 }
494
495                 // Get input filename
496                 if(optind >= argc) {
497                         cerr << "No index name given!" << endl;
498                         printUsage(cerr);
499                         return 1;
500                 }
501                 ebwtFile = argv[optind++];
502
503                 // Optionally summarize
504                 if(verbose) {
505                         cout << "Input ebwt file: \"" << ebwtFile << "\"" << endl;
506                         cout << "Output file: \"" << outfile << "\"" << endl;
507                         cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
508 #ifdef NDEBUG
509                         cout << "Assertions: disabled" << endl;
510 #else
511                         cout << "Assertions: enabled" << endl;
512 #endif
513                 }
514                 driver(ebwtFile, query);
515                 return 0;
516         } catch(std::exception& e) {
517                 cerr << "Command: ";
518                 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
519                 cerr << endl;
520                 return 1;
521         } catch(int e) {
522                 if(e != 0) {
523                         cerr << "Command: ";
524                         for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
525                         cerr << endl;
526                 }
527                 return e;
528         }
529 }