Commit patch to not break on spaces.
[bowtie.git] / pat.h
1 #ifndef PAT_H_
2 #define PAT_H_
3
4 #include <cassert>
5 #include <cmath>
6 #include <stdexcept>
7 #include <vector>
8 #include <string>
9 #include <cstring>
10 #include <ctype.h>
11 #include <fstream>
12 #include <seqan/sequence.h>
13 #include "alphabet.h"
14 #include "assert_helpers.h"
15 #include "tokenize.h"
16 #include "random_source.h"
17 #include "spinlock.h"
18 #include "threading.h"
19 #include "filebuf.h"
20 #include "qual.h"
21 #include "hit_set.h"
22 #include "search_globals.h"
23
24 /**
25  * Classes and routines for reading reads from various input sources.
26  */
27
28 using namespace std;
29 using namespace seqan;
30
31 /// Constructs string base-10 representation of integer 'value'
32 extern char* itoa10(int value, char* result);
33
34 /**
35  * Calculate a per-read random seed based on a combination of
36  * the read data (incl. sequence, name, quals) and the global
37  * seed in '_randSeed'.
38  */
39 static inline uint32_t genRandSeed(const String<Dna5>& qry,
40                                    const String<char>& qual,
41                                    const String<char>& name,
42                                    uint32_t seed)
43 {
44         // Calculate a per-read random seed based on a combination of
45         // the read data (incl. sequence, name, quals) and the global
46         // seed
47         uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
48         size_t qlen = seqan::length(qry);
49         // Throw all the characters of the read into the random seed
50         for(size_t i = 0; i < qlen; i++) {
51                 int p = (int)qry[i];
52                 assert_leq(p, 4);
53                 size_t off = ((i & 15) << 1);
54                 rseed ^= (p << off);
55         }
56         // Throw all the quality values for the read into the random
57         // seed
58         for(size_t i = 0; i < qlen; i++) {
59                 int p = (int)qual[i];
60                 assert_leq(p, 255);
61                 size_t off = ((i & 3) << 3);
62                 rseed ^= (p << off);
63         }
64         // Throw all the characters in the read name into the random
65         // seed
66         size_t namelen = seqan::length(name);
67         for(size_t i = 0; i < namelen; i++) {
68                 int p = (int)name[i];
69                 assert_leq(p, 255);
70                 size_t off = ((i & 3) << 3);
71                 rseed ^= (p << off);
72         }
73         return rseed;
74 }
75
76 /**
77  * A buffer for keeping all relevant information about a single read.
78  * Each search thread has one.
79  */
80 struct ReadBuf {
81         ReadBuf() { reset(); }
82
83         ~ReadBuf() {
84                 clearAll(); reset();
85                 // Prevent seqan from trying to free buffers
86                 _setBegin(patFw, NULL);
87                 _setBegin(patRc, NULL);
88                 _setBegin(qual, NULL);
89                 _setBegin(patFwRev, NULL);
90                 _setBegin(patRcRev, NULL);
91                 _setBegin(qualRev, NULL);
92                 _setBegin(name, NULL);
93                 for(int j = 0; j < 3; j++) {
94                         _setBegin(altPatFw[j], NULL);
95                         _setBegin(altPatFwRev[j], NULL);
96                         _setBegin(altPatRc[j], NULL);
97                         _setBegin(altPatRcRev[j], NULL);
98                         _setBegin(altQual[j], NULL);
99                         _setBegin(altQualRev[j], NULL);
100                 }
101         }
102
103 #define RESET_BUF(str, buf, typ) _setBegin(str, (typ*)buf); _setLength(str, 0); _setCapacity(str, BUF_SIZE);
104 #define RESET_BUF_LEN(str, buf, len, typ) _setBegin(str, (typ*)buf); _setLength(str, len); _setCapacity(str, BUF_SIZE);
105
106         /// Point all Strings to the beginning of their respective buffers
107         /// and set all lengths to 0
108         void reset() {
109                 patid = 0;
110                 readOrigBufLen = 0;
111                 qualOrigBufLen = 0;
112                 alts = 0;
113                 trimmed5 = trimmed3 = 0;
114                 fuzzy = false;
115                 color = false;
116                 primer = '?';
117                 trimc = '?';
118                 seed = 0;
119                 RESET_BUF(patFw, patBufFw, Dna5);
120                 RESET_BUF(patRc, patBufRc, Dna5);
121                 RESET_BUF(qual, qualBuf, char);
122                 RESET_BUF(patFwRev, patBufFwRev, Dna5);
123                 RESET_BUF(patRcRev, patBufRcRev, Dna5);
124                 RESET_BUF(qualRev, qualBufRev, char);
125                 RESET_BUF(name, nameBuf, char);
126                 for(int j = 0; j < 3; j++) {
127                         RESET_BUF(altPatFw[j], altPatBufFw[j], Dna5);
128                         RESET_BUF(altPatFwRev[j], altPatBufFwRev[j], Dna5);
129                         RESET_BUF(altPatRc[j], altPatBufRc[j], Dna5);
130                         RESET_BUF(altPatRcRev[j], altPatBufRcRev[j], Dna5);
131                         RESET_BUF(altQual[j], altQualBuf[j], char);
132                         RESET_BUF(altQualRev[j], altQualBufRev[j], char);
133                 }
134         }
135
136         void clearAll() {
137                 seqan::clear(patFw);
138                 seqan::clear(patRc);
139                 seqan::clear(qual);
140                 seqan::clear(patFwRev);
141                 seqan::clear(patRcRev);
142                 seqan::clear(qualRev);
143                 seqan::clear(name);
144                 for(int j = 0; j < 3; j++) {
145                         seqan::clear(altPatFw[j]);
146                         seqan::clear(altPatFwRev[j]);
147                         seqan::clear(altPatRc[j]);
148                         seqan::clear(altPatRcRev[j]);
149                         seqan::clear(altQual[j]);
150                         seqan::clear(altQualRev[j]);
151                 }
152                 trimmed5 = trimmed3 = 0;
153                 readOrigBufLen = 0;
154                 qualOrigBufLen = 0;
155                 color = fuzzy = false;
156                 primer = '?';
157                 trimc = '?';
158                 seed = 0;
159         }
160
161         /// Return true iff the read (pair) is empty
162         bool empty() const {
163                 return seqan::empty(patFw);
164         }
165
166         /// Return length of the read in the buffer
167         uint32_t length() const {
168                 return seqan::length(patFw);
169         }
170
171         /**
172          * Construct reverse complement of the pattern and the fuzzy
173          * alternative patters.  If read is in colorspace, just reverse
174          * them.
175          */
176         void constructRevComps() {
177                 uint32_t len = length();
178                 assert_gt(len, 0);
179                 RESET_BUF_LEN(patRc, patBufRc, len, Dna5);
180                 for(int j = 0; j < alts; j++) {
181                         RESET_BUF_LEN(altPatRc[j], altPatBufRc[j], len, Dna5);
182                 }
183                 if(color) {
184                         for(uint32_t i = 0; i < len; i++) {
185                                 // Reverse the sequence
186                                 patBufRc[i]  = patBufFw[len-i-1];
187                                 for(int j = 0; j < alts; j++) {
188                                         // Reverse the fuzzy alternatives
189                                         altPatBufRc[j][i] = altPatBufFw[j][len-i-1];
190                                 }
191                         }
192                 } else {
193                         for(uint32_t i = 0; i < len; i++) {
194                                 // Reverse-complement the sequence
195                                 patBufRc[i]  = (patBufFw[len-i-1] == 4) ? 4 : (patBufFw[len-i-1] ^ 3);
196                                 for(int j = 0; j < alts; j++) {
197                                         // Reverse-complement the fuzzy alternatives
198                                         altPatBufRc[j][i] = (altPatBufFw[j][len-i-1] == 4) ? 4 : (altPatBufFw[j][len-i-1] ^ 3);
199                                 }
200                         }
201                 }
202         }
203
204         /**
205          * Given patFw, patRc, and qual, construct the *Rev versions in
206          * place.  Assumes constructRevComps() was called previously.
207          */
208         void constructReverses() {
209                 uint32_t len = length();
210                 assert_gt(len, 0);
211                 RESET_BUF_LEN(patFwRev, patBufFwRev, len, Dna5);
212                 RESET_BUF_LEN(patRcRev, patBufRcRev, len, Dna5);
213                 RESET_BUF_LEN(qualRev, qualBufRev, len, char);
214                 for(int j = 0; j < alts; j++) {
215                         RESET_BUF_LEN(altPatFwRev[j], altPatBufFwRev[j], len, Dna5);
216                         RESET_BUF_LEN(altPatRcRev[j], altPatBufRcRev[j], len, Dna5);
217                         RESET_BUF_LEN(altQualRev[j], altQualBufRev[j], len, char);
218                 }
219                 for(uint32_t i = 0; i < len; i++) {
220                         patFwRev[i]  = patFw[len-i-1];
221                         patRcRev[i]  = patRc[len-i-1];
222                         qualRev[i]   = qual[len-i-1];
223                         for(int j = 0; j < alts; j++) {
224                                 altPatFwRev[j][i] = altPatFw[j][len-i-1];
225                                 altPatRcRev[j][i] = altPatRc[j][len-i-1];
226                                 altQualRev[j][i]  = altQual[j][len-i-1];
227                         }
228                 }
229         }
230
231         /**
232          * Append a "/1" or "/2" string onto the end of the name buf if
233          * it's not already there.
234          */
235         void fixMateName(int i) {
236                 assert(i == 1 || i == 2);
237                 size_t namelen = seqan::length(name);
238                 bool append = false;
239                 if(namelen < 2) {
240                         // Name is too short to possibly have /1 or /2 on the end
241                         append = true;
242                 } else {
243                         if(i == 1) {
244                                 // append = true iff mate name does not already end in /1
245                                 append =
246                                         nameBuf[namelen-2] != '/' ||
247                                         nameBuf[namelen-1] != '1';
248                         } else {
249                                 // append = true iff mate name does not already end in /2
250                                 append =
251                                         nameBuf[namelen-2] != '/' ||
252                                         nameBuf[namelen-1] != '2';
253                         }
254                 }
255                 if(append) {
256                         assert_leq(namelen, BUF_SIZE-2);
257                         _setLength(name, namelen + 2);
258                         nameBuf[namelen] = '/';
259                         nameBuf[namelen+1] = "012"[i];
260                 }
261         }
262
263         /**
264          * Dump basic information about this read to the given ostream.
265          */
266         void dump(std::ostream& os) const {
267                 os << name << ' ';
268                 if(color) {
269                         for(size_t i = 0; i < seqan::length(patFw); i++) {
270                                 os << "0123."[(int)patFw[i]];
271                         }
272                 } else {
273                         os << patFw;
274                 }
275                 os << ' ';
276                 // Print out the fuzzy alternative sequences
277                 for(int j = 0; j < 3; j++) {
278                         bool started = false;
279                         if(seqan::length(altQual[j]) > 0) {
280                                 for(size_t i = 0; i < length(); i++) {
281                                         if(altQual[j][i] != '!') {
282                                                 started = true;
283                                         }
284                                         if(started) {
285                                                 if(altQual[j][i] == '!') {
286                                                         os << '-';
287                                                 } else {
288                                                         if(color) {
289                                                                 os << "0123."[(int)altPatFw[j][i]];
290                                                         } else {
291                                                                 os << altPatFw[j][i];
292                                                         }
293                                                 }
294                                         }
295                                 }
296                         }
297                         cout << " ";
298                 }
299                 os << qual << " ";
300                 // Print out the fuzzy alternative quality strings
301                 for(int j = 0; j < 3; j++) {
302                         bool started = false;
303                         if(seqan::length(altQual[j]) > 0) {
304                                 for(size_t i = 0; i < length(); i++) {
305                                         if(altQual[j][i] != '!') {
306                                                 started = true;
307                                         }
308                                         if(started) {
309                                                 os << altQual[j][i];
310                                         }
311                                 }
312                         }
313                         if(j == 2) {
314                                 os << endl;
315                         } else {
316                                 os << " ";
317                         }
318                 }
319         }
320
321         /**
322          * Write read details to a HitSet object.
323          */
324         void toHitSet(HitSet& hs) {
325                 assert(!empty());
326                 hs.name = name;
327                 hs.seq = patFw;
328                 hs.qual = qual;
329                 hs.color = color;
330         }
331
332         static const int BUF_SIZE = 1024;
333
334         String<Dna5>  patFw;               // forward-strand sequence
335         uint8_t       patBufFw[BUF_SIZE];  // forward-strand sequence buffer
336         String<Dna5>  patRc;               // reverse-complement sequence
337         uint8_t       patBufRc[BUF_SIZE];  // reverse-complement sequence buffer
338         String<char>  qual;                // quality values
339         char          qualBuf[BUF_SIZE];   // quality value buffer
340
341         String<Dna5>  altPatFw[3];              // forward-strand sequence
342         uint8_t       altPatBufFw[3][BUF_SIZE]; // forward-strand sequence buffer
343         String<Dna5>  altPatRc[3];              // reverse-complement sequence
344         uint8_t       altPatBufRc[3][BUF_SIZE]; // reverse-complement sequence buffer
345         String<char>  altQual[3];               // quality values for alternate basecalls
346         char          altQualBuf[3][BUF_SIZE];  // quality value buffer for alternate basecalls
347
348         String<Dna5>  patFwRev;               // forward-strand sequence reversed
349         uint8_t       patBufFwRev[BUF_SIZE];  // forward-strand sequence buffer reversed
350         String<Dna5>  patRcRev;               // reverse-complement sequence reversed
351         uint8_t       patBufRcRev[BUF_SIZE];  // reverse-complement sequence buffer reversed
352         String<char>  qualRev;                // quality values reversed
353         char          qualBufRev[BUF_SIZE];   // quality value buffer reversed
354
355         String<Dna5>  altPatFwRev[3];              // forward-strand sequence reversed
356         uint8_t       altPatBufFwRev[3][BUF_SIZE]; // forward-strand sequence buffer reversed
357         String<Dna5>  altPatRcRev[3];              // reverse-complement sequence reversed
358         uint8_t       altPatBufRcRev[3][BUF_SIZE]; // reverse-complement sequence buffer reversed
359         String<char>  altQualRev[3];              // quality values for alternate basecalls reversed
360         char          altQualBufRev[3][BUF_SIZE]; // quality value buffer for alternate basecalls reversed
361
362         // For remembering the exact input text used to define a read
363         char          readOrigBuf[FileBuf::LASTN_BUF_SZ];
364         size_t        readOrigBufLen;
365
366         // For when qualities are in a separate file
367         char          qualOrigBuf[FileBuf::LASTN_BUF_SZ];
368         size_t        qualOrigBufLen;
369
370         String<char>  name;                // read name
371         char          nameBuf[BUF_SIZE];   // read name buffer
372         uint32_t      patid;               // unique 0-based id based on order in read file(s)
373         int           mate;                // 0 = single-end, 1 = mate1, 2 = mate2
374         uint32_t      seed;                // random seed
375         int           alts;                // number of alternatives
376         bool          fuzzy;               // whether to employ fuzziness
377         bool          color;               // whether read is in color space
378         char          primer;              // primer base, for csfasta files
379         char          trimc;               // trimmed color, for csfasta files
380         int           trimmed5;            // amount actually trimmed off 5' end
381         int           trimmed3;            // amount actually trimmed off 3' end
382         HitSet        hitset;              // holds previously-found hits; for chaining
383 };
384
385 /**
386  * Encapsulates a synchronized source of patterns; usually a file.
387  * Handles dumping patterns to a logfile (useful for debugging).  Also
388  * optionally reverses reads and quality strings before returning them,
389  * though that is usually more efficiently done by the concrete
390  * subclass.  Concrete subclasses should delimit critical sections with
391  * calls to lock() and unlock().
392  */
393 class PatternSource {
394 public:
395         PatternSource(uint32_t seed,
396                       bool randomizeQuals = false,
397                       bool useSpinlock = true,
398                       const char *dumpfile = NULL,
399                       bool verbose = false) :
400                 seed_(seed),
401                 readCnt_(0),
402                 dumpfile_(dumpfile),
403                 numWrappers_(0),
404                 doLocking_(true),
405                 useSpinlock_(useSpinlock),
406                 randomizeQuals_(randomizeQuals),
407                 lock_(),
408                 verbose_(verbose)
409         {
410                 // Open dumpfile, if specified
411                 if(dumpfile_ != NULL) {
412                         out_.open(dumpfile_, ios_base::out);
413                         if(!out_.good()) {
414                                 cerr << "Could not open pattern dump file \"" << dumpfile_ << "\" for writing" << endl;
415                                 throw 1;
416                         }
417                 }
418                 MUTEX_INIT(lock_);
419         }
420
421         virtual ~PatternSource() { }
422
423         /**
424          * Call this whenever this PatternSource is wrapped by a new
425          * WrappedPatternSourcePerThread.  This helps us keep track of
426          * whether locks will be contended.
427          */
428         void addWrapper() {
429                 numWrappers_++;
430         }
431
432         /**
433          * The main member function for dispensing patterns.
434          */
435         virtual void nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
436                 // nextPatternImpl does the reading from the ultimate source;
437                 // it is implemented in concrete subclasses
438                 nextReadPairImpl(ra, rb, patid);
439                 if(!ra.empty()) {
440                         // Possibly randomize the qualities so that they're more
441                         // scattered throughout the range of possible values
442                         if(randomizeQuals_) {
443                                 randomizeQuals(ra);
444                                 if(!rb.empty()) {
445                                         randomizeQuals(rb);
446                                 }
447                         }
448                         // TODO: Perhaps bundle all of the following up into a
449                         // finalize() member in the ReadBuf class?
450
451                         // Construct the reversed versions of the fw and rc seqs
452                         // and quals
453                         ra.constructRevComps();
454                         ra.constructReverses();
455                         if(!rb.empty()) {
456                                 rb.constructRevComps();
457                                 rb.constructReverses();
458                         }
459                         // Fill in the random-seed field using a combination of
460                         // information from the user-specified seed and the read
461                         // sequence, qualities, and name
462                         ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
463                         if(!rb.empty()) {
464                                 rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
465                         }
466                         // Output it, if desired
467                         if(dumpfile_ != NULL) {
468                                 dumpBuf(ra);
469                                 if(!rb.empty()) {
470                                         dumpBuf(rb);
471                                 }
472                         }
473                         if(verbose_) {
474                                 cout << "Parsed mate 1: "; ra.dump(cout);
475                                 cout << "Parsed mate 2: "; rb.dump(cout);
476                         }
477                 }
478         }
479
480         /**
481          * The main member function for dispensing patterns.
482          */
483         virtual void nextRead(ReadBuf& r, uint32_t& patid) {
484                 // nextPatternImpl does the reading from the ultimate source;
485                 // it is implemented in concrete subclasses
486                 nextReadImpl(r, patid);
487                 if(!r.empty()) {
488                         // Possibly randomize the qualities so that they're more
489                         // scattered throughout the range of possible values
490                         if(randomizeQuals_) {
491                                 randomizeQuals(r);
492                         }
493                         // Construct the reversed versions of the fw and rc seqs
494                         // and quals
495                         r.constructRevComps();
496                         r.constructReverses();
497                         // Fill in the random-seed field using a combination of
498                         // information from the user-specified seed and the read
499                         // sequence, qualities, and name
500                         r.seed = genRandSeed(r.patFw, r.qual, r.name, seed_);
501                         // Output it, if desired
502                         if(dumpfile_ != NULL) {
503                                 dumpBuf(r);
504                         }
505                         if(verbose_) {
506                                 cout << "Parsed read: "; r.dump(cout);
507                         }
508                 }
509         }
510
511         /**
512          * Implementation to be provided by concrete subclasses.  An
513          * implementation for this member is only relevant for formats that
514          * can read in a pair of reads in a single transaction with a
515          * single input source.  If paired-end input is given as a pair of
516          * parallel files, this member should throw an error and exit.
517          */
518         virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
519
520         /**
521          * Implementation to be provided by concrete subclasses.  An
522          * implementation for this member is only relevant for formats
523          * where individual input sources look like single-end-read
524          * sources, e.g., formats where paired-end reads are specified in
525          * parallel read files.
526          */
527         virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) = 0;
528
529         /// Reset state to start over again with the first read
530         virtual void reset() { readCnt_ = 0; }
531
532         /**
533          * Concrete subclasses call lock() to enter a critical region.
534          * What constitutes a critical region depends on the subclass.
535          */
536         void lock() {
537                 if(!doLocking_) return; // no contention
538 #ifdef USE_SPINLOCK
539                 if(useSpinlock_) {
540                         // User can ask to use the normal pthreads lock even if
541                         // spinlocks are compiled in.
542                         spinlock_.Enter();
543                 } else {
544 #endif
545                         MUTEX_LOCK(lock_);
546 #ifdef USE_SPINLOCK
547                 }
548 #endif
549         }
550
551         /**
552          * Concrete subclasses call unlock() to exit a critical region
553          * What constitutes a critical region depends on the subclass.
554          */
555         void unlock() {
556                 if(!doLocking_) return; // no contention
557 #ifdef USE_SPINLOCK
558                 if(useSpinlock_) {
559                         // User can ask to use the normal pthreads lock even if
560                         // spinlocks are compiled in.
561                         spinlock_.Leave();
562                 } else {
563 #endif
564                         MUTEX_UNLOCK(lock_);
565 #ifdef USE_SPINLOCK
566                 }
567 #endif
568         }
569
570         /**
571          * Return the number of reads attempted.
572          */
573         uint64_t readCnt() const { return readCnt_ - 1; }
574
575 protected:
576
577         /**
578          * Mix up the quality values for ReadBuf r.  There's probably a
579          * more (pseudo-)randomly rigorous way to do this; the output looks
580          * pretty cyclic.
581          */
582         void randomizeQuals(ReadBuf& r) {
583                 const size_t rlen = r.length();
584                 for(size_t i = 0; i < rlen; i++) {
585                         if(i < rlen-1) {
586                                 r.qual[i] *= (r.qual[i+1] + 7);
587                         }
588                         if(i > 0) {
589                                 r.qual[i] *= (r.qual[i-1] + 11);
590                         }
591                         // A user says that g++ complained here about "comparison
592                         // is always false due to limited range of data type", but
593                         // I can't see why.  I added the (int) cast to try to avoid
594                         // the warning.
595                         if((int)r.qual[i] < 0) r.qual[i] = -(r.qual[i]+1);
596                         r.qual[i] %= 41;
597                         assert_leq(r.qual[i], 40);
598                         r.qual[i] += 33;
599                 }
600         }
601
602         /**
603          * Dump the contents of the ReadBuf to the dump file.
604          */
605         void dumpBuf(const ReadBuf& r) {
606                 assert(dumpfile_ != NULL);
607                 dump(out_, r.patFw,
608                      empty(r.qual) ? String<char>("(empty)") : r.qual,
609                      empty(r.name)   ? String<char>("(empty)") : r.name);
610                 dump(out_, r.patRc,
611                      empty(r.qualRev) ? String<char>("(empty)") : r.qualRev,
612                      empty(r.name)   ? String<char>("(empty)") : r.name);
613         }
614
615         /**
616          * Default format for dumping a read to an output stream.  Concrete
617          * subclasses might want to do something fancier.
618          */
619         virtual void dump(ostream& out,
620                           const String<Dna5>& seq,
621                           const String<char>& qual,
622                           const String<char>& name)
623         {
624                 out << name << ": " << seq << " " << qual << endl;
625         }
626
627         uint32_t seed_;
628
629         /// The number of reads read by this PatternSource
630         uint64_t readCnt_;
631
632         const char *dumpfile_; /// dump patterns to this file before returning them
633         ofstream out_;         /// output stream for dumpfile
634         int numWrappers_;      /// # threads that own a wrapper for this PatternSource
635         bool doLocking_;       /// override whether to lock (true = don't override)
636         /// User can ask to use the normal pthreads-style lock even if
637         /// spinlocks is enabled and compiled in.  This is sometimes better
638         /// if we expect bad I/O latency on some reads.
639         bool useSpinlock_;
640         bool randomizeQuals_;  /// true -> mess up qualities in a random way
641 #ifdef USE_SPINLOCK
642         SpinLock spinlock_;
643 #endif
644         MUTEX_T lock_; /// mutex for locking critical regions
645         bool verbose_;
646 };
647
648 /**
649  * Abstract parent class for synhconized sources of paired-end reads
650  * (and possibly also single-end reads).
651  */
652 class PairedPatternSource {
653 public:
654         PairedPatternSource(uint32_t seed) {
655                 MUTEX_INIT(lock_);
656                 seed_ = seed;
657         }
658         virtual ~PairedPatternSource() { }
659
660         virtual void addWrapper() = 0;
661         virtual void reset() = 0;
662         virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
663         virtual pair<uint64_t,uint64_t> readCnt() const = 0;
664
665         /**
666          * Lock this PairedPatternSource, usually because one of its shared
667          * fields is being updated.
668          */
669         void lock() {
670 #ifdef USE_SPINLOCK
671                 spinlock_.Enter();
672 #else
673                 MUTEX_LOCK(lock_);
674 #endif
675         }
676
677         /**
678          * Unlock this PairedPatternSource.
679          */
680         void unlock() {
681 #ifdef USE_SPINLOCK
682                 spinlock_.Leave();
683 #else
684                 MUTEX_UNLOCK(lock_);
685 #endif
686         }
687
688 protected:
689
690 #ifdef USE_SPINLOCK
691         SpinLock spinlock_;
692 #endif
693         MUTEX_T lock_; /// mutex for locking critical regions
694         uint32_t seed_;
695 };
696
697 /**
698  * Encapsulates a synchronized source of both paired-end reads and
699  * unpaired reads, where the paired-end must come from parallel files.
700  */
701 class PairedSoloPatternSource : public PairedPatternSource {
702
703 public:
704
705         PairedSoloPatternSource(const vector<PatternSource*>& src,
706                                 uint32_t seed) :
707                 PairedPatternSource(seed), cur_(0), src_(src)
708         {
709             for(size_t i = 0; i < src_.size(); i++) {
710                 assert(src_[i] != NULL);
711             }
712         }
713
714         virtual ~PairedSoloPatternSource() { }
715
716         /**
717          * Call this whenever this PairedPatternSource is wrapped by a new
718          * WrappedPatternSourcePerThread.  This helps us keep track of
719          * whether locks within PatternSources will be contended.
720          */
721         virtual void addWrapper() {
722                 for(size_t i = 0; i < src_.size(); i++) {
723                         src_[i]->addWrapper();
724                 }
725         }
726
727         /**
728          * Reset this object and all the PatternSources under it so that
729          * the next call to nextReadPair gets the very first read pair.
730          */
731         virtual void reset() {
732                 for(size_t i = 0; i < src_.size(); i++) {
733                         src_[i]->reset();
734                 }
735                 cur_ = 0;
736         }
737
738         /**
739          * The main member function for dispensing pairs of reads or
740          * singleton reads.  Returns true iff ra and rb contain a new
741          * pair; returns false if ra contains a new unpaired read.
742          */
743         virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
744                 uint32_t cur = cur_;
745                 while(cur < src_.size()) {
746                         // Patterns from srca_[cur_] are unpaired
747                         src_[cur]->nextReadPair(ra, rb, patid);
748                         if(seqan::empty(ra.patFw)) {
749                                 // If patFw is empty, that's our signal that the
750                                 // input dried up
751                                 lock();
752                                 if(cur + 1 > cur_) cur_++;
753                                 cur = cur_;
754                                 unlock();
755                                 continue; // on to next pair of PatternSources
756                         }
757                         ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
758                         if(!rb.empty()) {
759                                 rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
760                                 ra.fixMateName(1);
761                                 rb.fixMateName(2);
762                         }
763                         ra.patid = patid;
764                         ra.mate  = 1;
765                         rb.mate  = 2;
766                         return true; // paired
767                 }
768                 return false;
769         }
770
771         /**
772          * Return the number of reads attempted.
773          */
774         virtual pair<uint64_t,uint64_t> readCnt() const {
775                 uint64_t ret = 0llu;
776                 vector<PatternSource*>::const_iterator it;
777                 for(it = src_.begin(); it != src_.end(); it++) {
778                         ret += (*it)->readCnt();
779                 }
780                 return make_pair(ret, 0llu);
781         }
782
783 protected:
784
785         volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
786         vector<PatternSource*> src_; /// PatternSources for paired-end reads
787 };
788
789 /**
790  * Encapsulates a synchronized source of both paired-end reads and
791  * unpaired reads, where the paired-end must come from parallel files.
792  */
793 class PairedDualPatternSource : public PairedPatternSource {
794
795 public:
796
797         PairedDualPatternSource(const vector<PatternSource*>& srca,
798                                 const vector<PatternSource*>& srcb,
799                                 uint32_t seed) :
800                 PairedPatternSource(seed), cur_(0), srca_(srca), srcb_(srcb)
801         {
802                 // srca_ and srcb_ must be parallel
803                 assert_eq(srca_.size(), srcb_.size());
804                 for(size_t i = 0; i < srca_.size(); i++) {
805                         // Can't have NULL first-mate sources.  Second-mate sources
806                         // can be NULL, in the case when the corresponding first-
807                         // mate source is unpaired.
808                         assert(srca_[i] != NULL);
809                         for(size_t j = 0; j < srcb_.size(); j++) {
810                                 assert_neq(srca_[i], srcb_[j]);
811                         }
812                 }
813         }
814
815         virtual ~PairedDualPatternSource() { }
816
817         /**
818          * Call this whenever this PairedPatternSource is wrapped by a new
819          * WrappedPatternSourcePerThread.  This helps us keep track of
820          * whether locks within PatternSources will be contended.
821          */
822         virtual void addWrapper() {
823                 for(size_t i = 0; i < srca_.size(); i++) {
824                         srca_[i]->addWrapper();
825                         if(srcb_[i] != NULL) {
826                                 srcb_[i]->addWrapper();
827                         }
828                 }
829         }
830
831         /**
832          * Reset this object and all the PatternSources under it so that
833          * the next call to nextReadPair gets the very first read pair.
834          */
835         virtual void reset() {
836                 for(size_t i = 0; i < srca_.size(); i++) {
837                         srca_[i]->reset();
838                         if(srcb_[i] != NULL) {
839                                 srcb_[i]->reset();
840                         }
841                 }
842                 cur_ = 0;
843         }
844
845         /**
846          * The main member function for dispensing pairs of reads or
847          * singleton reads.  Returns true iff ra and rb contain a new
848          * pair; returns false if ra contains a new unpaired read.
849          */
850         virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
851                 uint32_t cur = cur_;
852                 while(cur < srca_.size()) {
853                         if(srcb_[cur] == NULL) {
854                                 // Patterns from srca_[cur_] are unpaired
855                                 srca_[cur]->nextRead(ra, patid);
856                                 if(seqan::empty(ra.patFw)) {
857                                         // If patFw is empty, that's our signal that the
858                                         // input dried up
859                                         lock();
860                                         if(cur + 1 > cur_) cur_++;
861                                         cur = cur_;
862                                         unlock();
863                                         continue; // on to next pair of PatternSources
864                                 }
865                                 ra.patid = patid;
866                                 ra.mate  = 0;
867                                 return false; // unpaired
868                         } else {
869                                 // Patterns from srca_[cur_] and srcb_[cur_] are paired
870                                 uint32_t patid_a = 0;
871                                 uint32_t patid_b = 0;
872                                 // Lock to ensure that this thread gets parallel reads
873                                 // in the two mate files
874                                 lock();
875                                 srca_[cur]->nextRead(ra, patid_a);
876                                 srcb_[cur]->nextRead(rb, patid_b);
877                                 bool cont = false;
878                                 // Did the pair obtained fail to match up?
879                                 while(patid_a != patid_b) {
880                                         // Is either input exhausted?  If so, bail.
881                                         if(seqan::empty(ra.patFw) || seqan::empty(rb.patFw)) {
882                                                 seqan::clear(ra.patFw);
883                                                 if(cur + 1 > cur_) cur_++;
884                                                 cur = cur_;
885                                                 cont = true;
886                                                 break;
887                                         }
888                                         if(patid_a < patid_b) {
889                                                 srca_[cur]->nextRead(ra, patid_a);
890                                                 ra.fixMateName(1);
891                                         } else {
892                                                 srcb_[cur]->nextRead(rb, patid_b);
893                                                 rb.fixMateName(2);
894                                         }
895                                 }
896                                 unlock();
897                                 if(cont) continue; // on to next pair of PatternSources
898                                 ra.fixMateName(1);
899                                 rb.fixMateName(2);
900                                 if(seqan::empty(ra.patFw)) {
901                                         // If patFw is empty, that's our signal that the
902                                         // input dried up
903                                         lock();
904                                         if(cur + 1 > cur_) cur_++;
905                                         cur = cur_;
906                                         unlock();
907                                         continue; // on to next pair of PatternSources
908                                 }
909                                 assert_eq(patid_a, patid_b);
910                                 patid = patid_a;
911                                 ra.patid = patid;
912                                 rb.patid = patid;
913                                 ra.mate  = 1;
914                                 rb.mate  = 2;
915                                 return true; // paired
916                         }
917                 }
918                 return false;
919         }
920
921         /**
922          * Return the number of reads attempted.
923          */
924         virtual pair<uint64_t,uint64_t> readCnt() const {
925                 uint64_t rets = 0llu, retp = 0llu;
926                 for(size_t i = 0; i < srca_.size(); i++) {
927                         if(srcb_[i] == NULL) {
928                                 rets += srca_[i]->readCnt();
929                         } else {
930                                 assert_eq(srca_[i]->readCnt(), srcb_[i]->readCnt());
931                                 retp += srca_[i]->readCnt();
932                         }
933                 }
934                 return make_pair(rets, retp);
935         }
936
937 protected:
938
939         volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
940         vector<PatternSource*> srca_; /// PatternSources for 1st mates and/or unpaired reads
941         vector<PatternSource*> srcb_; /// PatternSources for 2nd mates
942 };
943
944 /**
945  * Encapsulates a single thread's interaction with the PatternSource.
946  * Most notably, this class holds the buffers into which the
947  * PatterSource will write sequences.  This class is *not* threadsafe
948  * - it doesn't need to be since there's one per thread.  PatternSource
949  * is thread-safe.
950  */
951 class PatternSourcePerThread {
952 public:
953         PatternSourcePerThread() :
954                 buf1_(), buf2_(), patid_(0xffffffff) { }
955
956         virtual ~PatternSourcePerThread() { }
957
958         /**
959          * Read the next read pair.
960          */
961         virtual void nextReadPair() { }
962
963         ReadBuf& bufa()        { return buf1_;         }
964         ReadBuf& bufb()        { return buf2_;         }
965
966         uint32_t      patid() const { return patid_;        }
967         virtual void  reset()       { patid_ = 0xffffffff;  }
968         bool          empty() const { return buf1_.empty(); }
969         uint32_t length(int mate) const {
970                 return (mate == 1)? buf1_.length() : buf2_.length();
971         }
972
973         /**
974          * Return true iff the buffers jointly contain a paired-end read.
975          */
976         bool paired() {
977                 bool ret = !buf2_.empty();
978                 assert(!ret || !empty());
979                 return ret;
980         }
981
982 protected:
983         ReadBuf  buf1_;    // read buffer for mate a
984         ReadBuf  buf2_;    // read buffer for mate b
985         uint32_t patid_;   // index of read just read
986 };
987
988 /**
989  * Abstract parent factory for PatternSourcePerThreads.
990  */
991 class PatternSourcePerThreadFactory {
992 public:
993         virtual ~PatternSourcePerThreadFactory() { }
994         virtual PatternSourcePerThread* create() const = 0;
995         virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const = 0;
996
997         /// Free memory associated with a pattern source
998         virtual void destroy(PatternSourcePerThread* patsrc) const {
999                 assert(patsrc != NULL);
1000                 // Free the PatternSourcePerThread
1001                 delete patsrc;
1002         }
1003
1004         /// Free memory associated with a pattern source list
1005         virtual void destroy(std::vector<PatternSourcePerThread*>* patsrcs) const {
1006                 assert(patsrcs != NULL);
1007                 // Free all of the PatternSourcePerThreads
1008                 for(size_t i = 0; i < patsrcs->size(); i++) {
1009                         if((*patsrcs)[i] != NULL) {
1010                                 delete (*patsrcs)[i];
1011                                 (*patsrcs)[i] = NULL;
1012                         }
1013                 }
1014                 // Free the vector
1015                 delete patsrcs;
1016         }
1017 };
1018
1019 /**
1020  * A per-thread wrapper for a PairedPatternSource.
1021  */
1022 class WrappedPatternSourcePerThread : public PatternSourcePerThread {
1023 public:
1024         WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
1025                 patsrc_(__patsrc)
1026         {
1027                 patsrc_.addWrapper();
1028         }
1029
1030         /**
1031          * Get the next paired or unpaired read from the wrapped
1032          * PairedPatternSource.
1033          */
1034         virtual void nextReadPair() {
1035                 PatternSourcePerThread::nextReadPair();
1036                 ASSERT_ONLY(uint32_t lastPatid = patid_);
1037                 buf1_.clearAll();
1038                 buf2_.clearAll();
1039                 patsrc_.nextReadPair(buf1_, buf2_, patid_);
1040                 assert(buf1_.empty() || patid_ != lastPatid);
1041         }
1042
1043 private:
1044
1045         /// Container for obtaining paired reads from PatternSources
1046         PairedPatternSource& patsrc_;
1047 };
1048
1049 /**
1050  * Abstract parent factory for PatternSourcePerThreads.
1051  */
1052 class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
1053 public:
1054         WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
1055                 patsrc_(patsrc) { }
1056
1057         /**
1058          * Create a new heap-allocated WrappedPatternSourcePerThreads.
1059          */
1060         virtual PatternSourcePerThread* create() const {
1061                 return new WrappedPatternSourcePerThread(patsrc_);
1062         }
1063
1064         /**
1065          * Create a new heap-allocated vector of heap-allocated
1066          * WrappedPatternSourcePerThreads.
1067          */
1068         virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const {
1069                 std::vector<PatternSourcePerThread*>* v = new std::vector<PatternSourcePerThread*>;
1070                 for(size_t i = 0; i < n; i++) {
1071                         v->push_back(new WrappedPatternSourcePerThread(patsrc_));
1072                         assert(v->back() != NULL);
1073                 }
1074                 return v;
1075         }
1076
1077 private:
1078         /// Container for obtaining paired reads from PatternSources
1079         PairedPatternSource& patsrc_;
1080 };
1081
1082 /**
1083  * Encapsualtes a source of patterns where each raw pattern is trimmed
1084  * by some user-defined amount on the 3' and 5' ends.  Doesn't
1085  * implement the actual trimming - that's up to the concrete
1086  * descendants.
1087  */
1088 class TrimmingPatternSource : public PatternSource {
1089 public:
1090         TrimmingPatternSource(uint32_t seed,
1091                               bool randomizeQuals = false,
1092                               bool useSpinlock = true,
1093                               const char *dumpfile = NULL,
1094                               bool verbose = false,
1095                               int trim3 = 0,
1096                               int trim5 = 0) :
1097                 PatternSource(seed, randomizeQuals, useSpinlock, dumpfile, verbose),
1098                 trim3_(trim3), trim5_(trim5) { }
1099 protected:
1100         int trim3_;
1101         int trim5_;
1102 };
1103
1104 /**
1105  * A synchronized pattern source that simply returns random reads
1106  * without reading from the disk or storing lists of reads in memory.
1107  * Reads are generated with a RandomSource.
1108  */
1109 class RandomPatternSource : public PatternSource {
1110 public:
1111         RandomPatternSource(uint32_t seed,
1112                             uint32_t numReads = 2000000,
1113                             int length = 35,
1114                             bool useSpinlock = true,
1115                             const char *dumpfile = NULL,
1116                             bool verbose = false) :
1117                 PatternSource(seed, false, useSpinlock, dumpfile, verbose),
1118                 numReads_(numReads),
1119                 length_(length),
1120                 seed_(seed)
1121         {
1122                 if(length_ > 1024) {
1123                         cerr << "Read length for RandomPatternSource may not exceed 1024; got " << length_ << endl;
1124                         throw 1;
1125                 }
1126                 rand_.init(seed_);
1127         }
1128
1129         /** Get the next random read and set patid */
1130         virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1131                 // Begin critical section
1132                 lock();
1133                 if(readCnt_ >= numReads_) {
1134                         r.clearAll();
1135                         unlock();
1136                         return;
1137                 }
1138                 uint32_t ra = rand_.nextU32();
1139                 patid = readCnt_;
1140                 readCnt_++;
1141                 unlock();
1142                 fillRandomRead(r, ra, length_, patid);
1143         }
1144
1145         /** Get the next random read and set patid */
1146         virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1147                 // Begin critical section
1148                 lock();
1149                 if(readCnt_ >= numReads_) {
1150                         ra.clearAll();
1151                         rb.clearAll();
1152                         unlock();
1153                         return;
1154                 }
1155                 uint32_t rna = rand_.nextU32();
1156                 uint32_t rnb = rand_.nextU32();
1157                 patid = readCnt_;
1158                 readCnt_++;
1159                 unlock();
1160                 fillRandomRead(ra, rna, length_, patid);
1161                 fillRandomRead(rb, rnb, length_, patid);
1162         }
1163
1164         /** */
1165         static void fillRandomRead(ReadBuf& r,
1166                                    uint32_t ra,
1167                                    int length,
1168                                    uint32_t patid)
1169         {
1170                 // End critical section
1171                 for(int i = 0; i < length; i++) {
1172                         ra = RandomSource::nextU32(ra) >> 8;
1173                         r.patBufFw[i]           = (ra & 3);
1174                         char c                  = 'I' - ((ra >> 2) & 31);
1175                         r.qualBuf[i]            = c;
1176                 }
1177                 _setBegin (r.patFw, (Dna5*)r.patBufFw);
1178                 _setLength(r.patFw, length);
1179                 _setBegin (r.qual, r.qualBuf);
1180                 _setLength(r.qual, length);
1181                 itoa10(patid, r.nameBuf);
1182                 _setBegin(r.name, r.nameBuf);
1183                 _setLength(r.name, strlen(r.nameBuf));
1184         }
1185
1186         /** Reset the pattern source to the beginning */
1187         virtual void reset() {
1188                 PatternSource::reset();
1189                 // reset pseudo-random generator; next string of calls to
1190                 // nextU32() will return same pseudo-randoms as the last
1191                 rand_.init(seed_);
1192         }
1193 private:
1194         uint32_t     numReads_; /// number of reads to dish out
1195         int          length_;   /// length of reads
1196         uint32_t     seed_;     /// seed for pseudo-randoms
1197         RandomSource rand_;     /// pseudo-random generator
1198 };
1199
1200 /**
1201  * A version of PatternSourcePerThread that dishes out random patterns
1202  * without any synchronization.
1203  */
1204 class RandomPatternSourcePerThread : public PatternSourcePerThread {
1205 public:
1206         RandomPatternSourcePerThread(uint32_t numreads,
1207                                      int length,
1208                                      int numthreads,
1209                                      int thread) :
1210                 PatternSourcePerThread(),
1211                 numreads_(numreads),
1212                 length_(length),
1213                 numthreads_(numthreads),
1214                 thread_(thread)
1215         {
1216                 patid_ = thread_;
1217                 if(length_ > 1024) {
1218                         cerr << "Read length for RandomPatternSourcePerThread may not exceed 1024; got " << length_ << endl;
1219                         throw 1;
1220                 }
1221                 rand_.init(thread_);
1222         }
1223
1224         virtual void nextReadPair() {
1225                 PatternSourcePerThread::nextReadPair();
1226                 if(patid_ >= numreads_) {
1227                         buf1_.clearAll();
1228                         buf2_.clearAll();
1229                         return;
1230                 }
1231                 RandomPatternSource::fillRandomRead(
1232                         buf1_, rand_.nextU32(), length_, patid_);
1233                 RandomPatternSource::fillRandomRead(
1234                         buf2_, rand_.nextU32(), length_, patid_);
1235                 patid_ += numthreads_;
1236         }
1237
1238         virtual void reset() {
1239                 PatternSourcePerThread::reset();
1240                 patid_ = thread_;
1241                 rand_.init(thread_);
1242         }
1243
1244 private:
1245         uint32_t     numreads_;
1246         int          length_;
1247         int          numthreads_;
1248         int          thread_;
1249         RandomSource rand_;
1250 };
1251
1252 /**
1253  * Abstract parent factory for PatternSourcePerThreads.
1254  */
1255 class RandomPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
1256 public:
1257         RandomPatternSourcePerThreadFactory(
1258                 uint32_t numreads,
1259                 int length,
1260                 int numthreads,
1261                 int thread) :
1262                 numreads_(numreads),
1263                 length_(length),
1264                 numthreads_(numthreads),
1265                 thread_(thread) { }
1266
1267         /**
1268          * Create a new heap-allocated WrappedPatternSourcePerThreads.
1269          */
1270         virtual PatternSourcePerThread* create() const {
1271                 return new RandomPatternSourcePerThread(
1272                         numreads_, length_, numthreads_, thread_);
1273         }
1274
1275         /**
1276          * Create a new heap-allocated vector of heap-allocated
1277          * WrappedPatternSourcePerThreads.
1278          */
1279         virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const {
1280                 std::vector<PatternSourcePerThread*>* v = new std::vector<PatternSourcePerThread*>;
1281                 for(size_t i = 0; i < n; i++) {
1282                         v->push_back(new RandomPatternSourcePerThread(
1283                                 numreads_, length_, numthreads_, thread_));
1284                         assert(v->back() != NULL);
1285                 }
1286                 return v;
1287         }
1288
1289 private:
1290         uint32_t numreads_;
1291         int length_;
1292         int numthreads_;
1293         int thread_;
1294 };
1295
1296 /// Skip to the end of the current string of newline chars and return
1297 /// the first character after the newline chars, or -1 for EOF
1298 static inline int getOverNewline(FileBuf& in) {
1299         int c;
1300         while(isspace(c = in.get()));
1301         return c;
1302 }
1303
1304 /// Skip to the end of the current string of newline chars such that
1305 /// the next call to get() returns the first character after the
1306 /// whitespace
1307 static inline int peekOverNewline(FileBuf& in) {
1308         while(true) {
1309                 int c = in.peek();
1310                 if(c != '\r' && c != '\n') {
1311                         return c;
1312                 }
1313                 in.get();
1314         }
1315 }
1316
1317 /// Skip to the end of the current line; return the first character
1318 /// of the next line or -1 for EOF
1319 static inline int getToEndOfLine(FileBuf& in) {
1320         while(true) {
1321                 int c = in.get(); if(c < 0) return -1;
1322                 if(c == '\n' || c == '\r') {
1323                         while(c == '\n' || c == '\r') {
1324                                 c = in.get(); if(c < 0) return -1;
1325                         }
1326                         // c now holds first character of next line
1327                         return c;
1328                 }
1329         }
1330 }
1331
1332 /// Skip to the end of the current line such that the next call to
1333 /// get() returns the first character on the next line
1334 static inline int peekToEndOfLine(FileBuf& in) {
1335         while(true) {
1336                 int c = in.get(); if(c < 0) return c;
1337                 if(c == '\n' || c == '\r') {
1338                         c = in.peek();
1339                         while(c == '\n' || c == '\r') {
1340                                 in.get(); if(c < 0) return c; // consume \r or \n
1341                                 c = in.peek();
1342                         }
1343                         // next get() gets first character of next line
1344                         return c;
1345                 }
1346         }
1347 }
1348
1349 extern void wrongQualityFormat(const String<char>& read_name);
1350 extern void tooFewQualities(const String<char>& read_name);
1351 extern void tooManyQualities(const String<char>& read_name);
1352 extern void tooManySeqChars(const String<char>& read_name);
1353
1354 /**
1355  * Encapsulates a source of patterns which is an in-memory vector.
1356  */
1357 class VectorPatternSource : public TrimmingPatternSource {
1358 public:
1359         VectorPatternSource(uint32_t seed,
1360                             const vector<string>& v,
1361                             bool color,
1362                             bool randomizeQuals = false,
1363                             bool useSpinlock = true,
1364                             const char *dumpfile = NULL,
1365                             bool verbose = false,
1366                             int trim3 = 0,
1367                             int trim5 = 0,
1368                                 uint32_t skip = 0) :
1369                 TrimmingPatternSource(seed, randomizeQuals, useSpinlock,
1370                                       dumpfile, verbose, trim3, trim5),
1371                 color_(color), cur_(skip), skip_(skip), paired_(false), v_(),
1372                 quals_()
1373         {
1374                 for(size_t i = 0; i < v.size(); i++) {
1375                         vector<string> ss;
1376                         tokenize(v[i], ":", ss, 2);
1377                         assert_gt(ss.size(), 0);
1378                         assert_leq(ss.size(), 2);
1379                         // Initialize s
1380                         string s = ss[0];
1381                         int mytrim5 = this->trim5_;
1382                         if(color_ && s.length() > 1) {
1383                                 // This may be a primer character.  If so, keep it in the
1384                                 // 'primer' field of the read buf and parse the rest of the
1385                                 // read without it.
1386                                 int c = toupper(s[0]);
1387                                 if(asc2dnacat[c] > 0) {
1388                                         // First char is a DNA char
1389                                         int c2 = toupper(s[1]);
1390                                         // Second char is a color char
1391                                         if(asc2colcat[c2] > 0) {
1392                                                 mytrim5 += 2; // trim primer and first color
1393                                         }
1394                                 }
1395                         }
1396                         if(color_) {
1397                                 // Convert '0'-'3' to 'A'-'T'
1398                                 for(size_t i = 0; i < s.length(); i++) {
1399                                         if(s[i] >= '0' && s[i] <= '4') {
1400                                                 s[i] = "ACGTN"[(int)s[i] - '0'];
1401                                         }
1402                                         if(s[i] == '.') s[i] = 'N';
1403                                 }
1404                         }
1405                         if(s.length() <= (size_t)(trim3_ + mytrim5)) {
1406                                 // Entire read is trimmed away
1407                                 continue;
1408                         } else {
1409                                 // Trim on 5' (high-quality) end
1410                                 if(mytrim5 > 0) {
1411                                         s.erase(0, mytrim5);
1412                                 }
1413                                 // Trim on 3' (low-quality) end
1414                                 if(trim3_ > 0) {
1415                                         s.erase(s.length()-trim3_);
1416                                 }
1417                         }
1418                         //  Initialize vq
1419                         string vq;
1420                         if(ss.size() == 2) {
1421                                 vq = ss[1];
1422                         }
1423                         // Trim qualities
1424                         if(vq.length() > (size_t)(trim3_ + mytrim5)) {
1425                                 // Trim on 5' (high-quality) end
1426                                 if(mytrim5 > 0) {
1427                                         vq.erase(0, mytrim5);
1428                                 }
1429                                 // Trim on 3' (low-quality) end
1430                                 if(trim3_ > 0) {
1431                                         vq.erase(vq.length()-trim3_);
1432                                 }
1433                         }
1434                         // Pad quals with Is if necessary; this shouldn't happen
1435                         while(vq.length() < length(s)) {
1436                                 vq.push_back('I');
1437                         }
1438                         // Truncate quals to match length of read if necessary;
1439                         // this shouldn't happen
1440                         if(vq.length() > length(s)) {
1441                                 vq.erase(length(s));
1442                         }
1443                         assert_eq(vq.length(), length(s));
1444                         v_.push_back(s);
1445                         quals_.push_back(vq);
1446                         trimmed3_.push_back(trim3_);
1447                         trimmed5_.push_back(mytrim5);
1448                         ostringstream os;
1449                         os << (names_.size());
1450                         names_.push_back(os.str());
1451                 }
1452                 assert_eq(v_.size(), quals_.size());
1453         }
1454         virtual ~VectorPatternSource() { }
1455         virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1456                 // Let Strings begin at the beginning of the respective bufs
1457                 r.reset();
1458                 lock();
1459                 if(cur_ >= v_.size()) {
1460                         unlock();
1461                         // Clear all the Strings, as a signal to the caller that
1462                         // we're out of reads
1463                         r.clearAll();
1464                         assert(r.empty());
1465                         return;
1466                 }
1467                 // Copy v_*, quals_* strings into the respective Strings
1468                 r.color = color_;
1469                 r.patFw  = v_[cur_];
1470                 r.qual = quals_[cur_];
1471                 r.trimmed3 = trimmed3_[cur_];
1472                 r.trimmed5 = trimmed5_[cur_];
1473                 ostringstream os;
1474                 os << cur_;
1475                 r.name = os.str();
1476                 cur_++;
1477                 readCnt_++;
1478                 patid = readCnt_;
1479                 unlock();
1480         }
1481         /**
1482          * This is unused, but implementation is given for completeness.
1483          */
1484         virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1485                 // Let Strings begin at the beginning of the respective bufs
1486                 ra.reset();
1487                 rb.reset();
1488                 if(!paired_) {
1489                         paired_ = true;
1490                         cur_ <<= 1;
1491                 }
1492                 lock();
1493                 if(cur_ >= v_.size()-1) {
1494                         unlock();
1495                         // Clear all the Strings, as a signal to the caller that
1496                         // we're out of reads
1497                         ra.clearAll();
1498                         rb.clearAll();
1499                         assert(ra.empty());
1500                         assert(rb.empty());
1501                         return;
1502                 }
1503                 // Copy v_*, quals_* strings into the respective Strings
1504                 ra.patFw  = v_[cur_];
1505                 ra.qual = quals_[cur_];
1506                 ra.trimmed3 = trimmed3_[cur_];
1507                 ra.trimmed5 = trimmed5_[cur_];
1508                 cur_++;
1509                 rb.patFw  = v_[cur_];
1510                 rb.qual = quals_[cur_];
1511                 rb.trimmed3 = trimmed3_[cur_];
1512                 rb.trimmed5 = trimmed5_[cur_];
1513                 ostringstream os;
1514                 os << readCnt_;
1515                 ra.name = os.str();
1516                 rb.name = os.str();
1517                 ra.color = rb.color = color_;
1518                 cur_++;
1519                 readCnt_++;
1520                 patid = readCnt_;
1521                 unlock();
1522         }
1523         virtual void reset() {
1524                 TrimmingPatternSource::reset();
1525                 cur_ = skip_;
1526                 paired_ = false;
1527         }
1528 private:
1529         bool color_;
1530         size_t cur_;
1531         uint32_t skip_;
1532         bool paired_;
1533         vector<String<Dna5> > v_;     /// forward sequences
1534         vector<String<char> > quals_; /// quality values parallel to v_
1535         vector<String<char> > names_; /// names
1536         vector<int> trimmed3_; // names
1537         vector<int> trimmed5_; // names
1538 };
1539
1540 /**
1541  *
1542  */
1543 class BufferedFilePatternSource : public TrimmingPatternSource {
1544 public:
1545         BufferedFilePatternSource(uint32_t seed,
1546                                   const vector<string>& infiles,
1547                                   const vector<string>* qinfiles,
1548                                   bool randomizeQuals = false,
1549                                   bool useSpinlock = true,
1550                                   const char *dumpfile = NULL,
1551                                   bool verbose = false,
1552                                   int trim3 = 0,
1553                                   int trim5 = 0,
1554                                   uint32_t skip = 0) :
1555                 TrimmingPatternSource(seed, randomizeQuals, useSpinlock,
1556                                       dumpfile, verbose, trim3, trim5),
1557                 infiles_(infiles),
1558                 filecur_(0),
1559                 fb_(),
1560                 qfb_(),
1561                 skip_(skip),
1562                 first_(true)
1563         {
1564                 qinfiles_.clear();
1565                 if(qinfiles != NULL) qinfiles_ = *qinfiles;
1566                 assert_gt(infiles.size(), 0);
1567                 errs_.resize(infiles_.size(), false);
1568                 if(qinfiles_.size() > 0 &&
1569                    qinfiles_.size() != infiles_.size())
1570                 {
1571                         cerr << "Error: Different numbers of input FASTA/quality files ("
1572                              << infiles_.size() << "/" << qinfiles_.size() << ")" << endl;
1573                         throw 1;
1574                 }
1575                 assert(!fb_.isOpen());
1576                 assert(!qfb_.isOpen());
1577                 open(); // open first file in the list
1578                 filecur_++;
1579         }
1580
1581         virtual ~BufferedFilePatternSource() {
1582                 if(fb_.isOpen()) fb_.close();
1583                 if(qfb_.isOpen()) {
1584                         assert_gt(qinfiles_.size(), 0);
1585                         qfb_.close();
1586                 }
1587         }
1588
1589         /**
1590          * Fill ReadBuf with the sequence, quality and name for the next
1591          * read in the list of read files.  This function gets called by
1592          * all the search threads, so we must handle synchronization.
1593          */
1594         virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1595                 // We are entering a critical region, because we're
1596                 // manipulating our file handle and filecur_ state
1597                 lock();
1598                 bool notDone = true;
1599                 do {
1600                         read(r, patid);
1601                         // Try again if r is empty (indicating an error) and input
1602                         // is not yet exhausted, OR if we have more reads to skip
1603                         // over
1604                         notDone = seqan::empty(r.patFw) && !fb_.eof();
1605                 } while(notDone || (!fb_.eof() && patid < skip_));
1606                 if(patid < skip_) {
1607                         unlock();
1608                         r.clearAll();
1609                         assert(seqan::empty(r.patFw));
1610                         return;
1611                 }
1612                 if(first_ && seqan::empty(r.patFw) /* && !quiet_ */) {
1613                         // No reads could be extracted from the first _infile
1614                         cerr << "Warning: Could not find any reads in \"" << infiles_[0] << "\"" << endl;
1615                 }
1616                 first_ = false;
1617                 while(seqan::empty(r.patFw) && filecur_ < infiles_.size()) {
1618                         // Open next file
1619                         open();
1620                         resetForNextFile(); // reset state to handle a fresh file
1621                         do {
1622                                 read(r, patid);
1623                         } while((seqan::empty(r.patFw) && !fb_.eof()));
1624                         assert_geq(patid, skip_);
1625                         if(seqan::empty(r.patFw) /*&& !quiet_ */) {
1626                                 // No reads could be extracted from this _infile
1627                                 cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl;
1628                         }
1629                         filecur_++;
1630                 }
1631                 // Leaving critical region
1632                 unlock();
1633                 // If r.patFw is empty, then the caller knows that we are
1634                 // finished with the reads
1635         }
1636         /**
1637          *
1638          */
1639         virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1640                 // We are entering a critical region, because we're
1641                 // manipulating our file handle and filecur_ state
1642                 lock();
1643                 bool notDone = true;
1644                 do {
1645                         readPair(ra, rb, patid);
1646                         // Try again if ra is empty (indicating an error) and input
1647                         // is not yet exhausted, OR if we have more reads to skip
1648                         // over
1649                         notDone = seqan::empty(ra.patFw) && !fb_.eof();
1650                 } while(notDone || (!fb_.eof() && patid < skip_));
1651                 if(patid < skip_) {
1652                         unlock();
1653                         ra.clearAll();
1654                         rb.clearAll();
1655                         assert(seqan::empty(ra.patFw));
1656                         return;
1657                 }
1658                 if(first_ && seqan::empty(ra.patFw) /*&& !quiet_*/) {
1659                         // No reads could be extracted from the first _infile
1660                         cerr << "Warning: Could not find any read pairs in \"" << infiles_[0] << "\"" << endl;
1661                 }
1662                 first_ = false;
1663                 while(seqan::empty(ra.patFw) && filecur_ < infiles_.size()) {
1664                         // Open next file
1665                         open();
1666                         resetForNextFile(); // reset state to handle a fresh file
1667                         do {
1668                                 readPair(ra, rb, patid);
1669                         } while((seqan::empty(ra.patFw) && !fb_.eof()));
1670                         assert_geq(patid, skip_);
1671                         if(seqan::empty(ra.patFw) /*&& !quiet_*/) {
1672                                 // No reads could be extracted from this _infile
1673                                 cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl;
1674                         }
1675                         filecur_++;
1676                 }
1677                 // Leaving critical region
1678                 unlock();
1679                 // If ra.patFw is empty, then the caller knows that we are
1680                 // finished with the reads
1681         }
1682         /**
1683          * Reset state so that we read start reading again from the
1684          * beginning of the first file.  Should only be called by the
1685          * master thread.
1686          */
1687         virtual void reset() {
1688                 TrimmingPatternSource::reset();
1689                 filecur_ = 0,
1690                 open();
1691                 filecur_++;
1692         }
1693 protected:
1694         /// Read another pattern from the input file; this is overridden
1695         /// to deal with specific file formats
1696         virtual void read(ReadBuf& r, uint32_t& patid) = 0;
1697         /// Read another pattern pair from the input file; this is
1698         /// overridden to deal with specific file formats
1699         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
1700         /// Reset state to handle a fresh file
1701         virtual void resetForNextFile() { }
1702         void open() {
1703                 if(fb_.isOpen()) fb_.close();
1704                 if(qfb_.isOpen()) qfb_.close();
1705                 while(filecur_ < infiles_.size()) {
1706                         // Open read
1707                         FILE *in;
1708                         if(infiles_[filecur_] == "-") {
1709                                 in = stdin;
1710                         } else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
1711                                 if(!errs_[filecur_]) {
1712                                         cerr << "Warning: Could not open read file \"" << infiles_[filecur_] << "\" for reading; skipping..." << endl;
1713                                         errs_[filecur_] = true;
1714                                 }
1715                                 filecur_++;
1716                                 continue;
1717                         }
1718                         fb_.newFile(in);
1719                         // Open quality
1720                         if(!qinfiles_.empty()) {
1721                                 FILE *in;
1722                                 if(qinfiles_[filecur_] == "-") {
1723                                         in = stdin;
1724                                 } else if((in = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) {
1725                                         if(!errs_[filecur_]) {
1726                                                 cerr << "Warning: Could not open quality file \"" << qinfiles_[filecur_] << "\" for reading; skipping..." << endl;
1727                                                 errs_[filecur_] = true;
1728                                         }
1729                                         filecur_++;
1730                                         continue;
1731                                 }
1732                                 qfb_.newFile(in);
1733                         }
1734                         return;
1735                 }
1736                 throw 1;
1737         }
1738         vector<string> infiles_; /// filenames for read files
1739         vector<string> qinfiles_; /// filenames for quality files
1740         vector<bool> errs_; /// whether we've already printed an error for each file
1741         size_t filecur_;   /// index into infiles_ of next file to read
1742         FileBuf fb_;  /// read file currently being read from
1743         FileBuf qfb_; /// quality file currently being read from
1744         uint32_t skip_;     /// number of reads to skip
1745         bool first_;
1746 };
1747
1748 /**
1749  * Parse a single quality string from fb and store qualities in r.
1750  * Assume the next character obtained via fb.get() is the first
1751  * character of the quality string.  When returning, the next
1752  * character returned by fb.peek() or fb.get() should be the first
1753  * character of the following line.
1754  */
1755 int parseQuals(ReadBuf& r,
1756                FileBuf& fb,
1757                int readLen,
1758                int trim3,
1759                int trim5,
1760                bool intQuals,
1761                bool phred64,
1762                bool solexa64);
1763
1764 /**
1765  * Synchronized concrete pattern source for a list of FASTA or CSFASTA
1766  * (if color = true) files.
1767  */
1768 class FastaPatternSource : public BufferedFilePatternSource {
1769 public:
1770         FastaPatternSource(uint32_t seed,
1771                            const vector<string>& infiles,
1772                            const vector<string>* qinfiles,
1773                            bool color,
1774                            bool randomizeQuals,
1775                            bool useSpinlock = true,
1776                            const char *dumpfile = NULL,
1777                            bool verbose = false,
1778                            int trim3 = 0,
1779                            int trim5 = 0,
1780                            bool solexa64 = false,
1781                            bool phred64 = false,
1782                            bool intQuals = false,
1783                            uint32_t skip = 0) :
1784                 BufferedFilePatternSource(seed, infiles, qinfiles, randomizeQuals,
1785                                           useSpinlock, dumpfile, verbose, trim3,
1786                                           trim5, skip),
1787                 first_(true), color_(color), solexa64_(solexa64),
1788                 phred64_(phred64), intQuals_(intQuals)
1789         { }
1790         virtual void reset() {
1791                 first_ = true;
1792                 BufferedFilePatternSource::reset();
1793         }
1794 protected:
1795         /**
1796          * Scan to the next FASTA record (starting with >) and return the first
1797          * character of the record (which will always be >).
1798          */
1799         static int skipToNextFastaRecord(FileBuf& in) {
1800                 int c;
1801                 while((c = in.get()) != '>') {
1802                         if(in.eof()) return -1;
1803                 }
1804                 return c;
1805         }
1806
1807         /// Called when we have to bail without having parsed a read.
1808         void bail(ReadBuf& r) {
1809                 r.clearAll();
1810                 fb_.resetLastN();
1811                 qfb_.resetLastN();
1812         }
1813
1814         /// Read another pattern from a FASTA input file
1815         virtual void read(ReadBuf& r, uint32_t& patid) {
1816                 int c, qc = 0;
1817                 int dstLen = 0;
1818                 int nameLen = 0;
1819                 bool doquals = qinfiles_.size() > 0;
1820                 assert(!doquals || qfb_.isOpen());
1821                 assert(fb_.isOpen());
1822                 r.color = color_;
1823
1824                 // Skip over between-read comments.  Note that SOLiD's comments use #s
1825                 c = fb_.get();
1826                 if(c < 0) { bail(r); return; }
1827                 while(c == '#' || c == ';') {
1828                         c = fb_.peekUptoNewline();
1829                         fb_.resetLastN();
1830                         c = fb_.get();
1831                 }
1832                 assert_eq(1, fb_.lastNCur());
1833                 if(doquals) {
1834                         qc = qfb_.get();
1835                         if(qc < 0) { bail(r); return; }
1836                         while(qc == '#' || qc == ';') {
1837                                 qc = qfb_.peekUptoNewline();
1838                                 qfb_.resetLastN();
1839                                 qc = qfb_.get();
1840                         }
1841                         assert_eq(1, qfb_.lastNCur());
1842                 }
1843
1844                 // Pick off the first carat
1845                 if(first_) {
1846                         if(c != '>') {
1847                                 cerr << "Error: reads file does not look like a FASTA file" << endl;
1848                                 throw 1;
1849                         }
1850                         if(doquals && qc != '>') {
1851                                 cerr << "Error: quality file does not look like a FASTA quality file" << endl;
1852                                 throw 1;
1853                         }
1854                         first_ = false;
1855                 }
1856                 assert_eq('>', c);
1857                 if(doquals) assert_eq('>', qc);
1858                 c = fb_.get(); // get next char after '>'
1859                 if(doquals) qc = qfb_.get();
1860
1861                 // Read to the end of the id line, sticking everything after the '>'
1862                 // into *name
1863                 bool warning = false;
1864                 while(true) {
1865                         if(c < 0 || qc < 0) { bail(r); return; }
1866                         if(c == '\n' || c == '\r') {
1867                                 // Break at end of line, after consuming all \r's, \n's
1868                                 while(c == '\n' || c == '\r') {
1869                                         c = fb_.get();
1870                                         if(doquals) qc = qfb_.get();
1871                                         if(c < 0 || qc < 0) { bail(r); return; }
1872                                 }
1873                                 break;
1874                         }
1875                         if(doquals && c != qc) {
1876                                 cerr << "Warning: one or more mismatched read names between FASTA and quality files" << endl;
1877                                 warning = true;
1878                         }
1879                         r.nameBuf[nameLen++] = c;
1880                         c = fb_.get();
1881                         if(doquals) qc = qfb_.get();
1882                 }
1883                 _setBegin(r.name, r.nameBuf);
1884                 _setLength(r.name, nameLen);
1885                 if(warning) {
1886                         cerr << "         Offending read name: \"" << r.name << "\"" << endl;
1887                 }
1888
1889                 // _in now points just past the first character of a sequence
1890                 // line, and c holds the first character
1891                 int begin = 0;
1892                 int mytrim5 = this->trim5_;
1893                 if(color_) {
1894                         // This is the primer character, keep it in the
1895                         // 'primer' field of the read buf and keep parsing
1896                         c = toupper(c);
1897                         if(asc2dnacat[c] > 0) {
1898                                 // First char is a DNA char
1899                                 int c2 = toupper(fb_.peek());
1900                                 if(asc2colcat[c2] > 0) {
1901                                         // Second char is a color char
1902                                         r.primer = c;
1903                                         r.trimc = c2;
1904                                         mytrim5 += 2;
1905                                 }
1906                         }
1907                         if(c < 0) { bail(r); return; }
1908                 }
1909                 while(c != '>' && c >= 0) {
1910                         if(color_) {
1911                                 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
1912                                 if(c == '.') c = 'N';
1913                         }
1914                         if(asc2dnacat[c] > 0 && begin++ >= mytrim5) {
1915                                 if(dstLen + 1 > 1024) tooManySeqChars(r.name);
1916                                 r.patBufFw[dstLen] = charToDna5[c];
1917                                 if(!doquals) r.qualBuf[dstLen]  = 'I';
1918                                 dstLen++;
1919                         }
1920                         if(fb_.peek() == '>') break;
1921                         c = fb_.get();
1922                 }
1923                 dstLen -= this->trim3_;
1924                 if(dstLen < 0) dstLen = 0;
1925                 _setBegin (r.patFw, (Dna5*)r.patBufFw);
1926                 _setLength(r.patFw, dstLen);
1927                 r.trimmed3 = this->trim3_;
1928                 r.trimmed5 = mytrim5;
1929                 if(doquals) {
1930                         if(dstLen > 0) {
1931                                 parseQuals(r, qfb_, dstLen + r.trimmed3 + r.trimmed5,
1932                                                    r.trimmed3, r.trimmed5, intQuals_, phred64_,
1933                                                    solexa64_);
1934                         } else {
1935                                 // Bail
1936                                 qfb_.peekUptoNewline();
1937                                 qfb_.get();
1938                                 qfb_.resetLastN();
1939                                 fb_.resetLastN();
1940                                 // Count the read
1941                                 readCnt_++;
1942                                 patid = readCnt_-1;
1943                                 return;
1944                         }
1945                 }
1946                 _setBegin (r.qual,  r.qualBuf);
1947                 _setLength(r.qual,  dstLen);
1948                 // Set up a default name if one hasn't been set
1949                 if(nameLen == 0) {
1950                         itoa10(readCnt_, r.nameBuf);
1951                         _setBegin(r.name, r.nameBuf);
1952                         nameLen = strlen(r.nameBuf);
1953                         _setLength(r.name, nameLen);
1954                 }
1955                 assert_gt(nameLen, 0);
1956                 readCnt_++;
1957                 patid = readCnt_-1;
1958                 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
1959                 fb_.resetLastN();
1960                 if(doquals) {
1961                         r.qualOrigBufLen = qfb_.copyLastN(r.qualOrigBuf);
1962                         qfb_.resetLastN();
1963                         if(false) {
1964                                 cout << "Name: " << r.name << endl
1965                                          << " Seq: " << r.patFw << " (" << seqan::length(r.patFw) << ")" << endl
1966                                          << "Qual: " << r.qual  << " (" << seqan::length(r.qual) << ")" << endl
1967                                          << "Orig seq:" << endl;
1968                                 for(size_t i = 0; i < r.readOrigBufLen; i++) cout << r.readOrigBuf[i];
1969                                 cout << "Orig qual:" << endl;
1970                                 for(size_t i = 0; i < r.qualOrigBufLen; i++) cout << r.qualOrigBuf[i];
1971                                 cout << endl;
1972                         }
1973                 }
1974         }
1975         /// Read another pair of patterns from a FASTA input file
1976         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1977                 // (For now, we shouldn't ever be here)
1978                 cerr << "In FastaPatternSource.readPair()" << endl;
1979                 throw 1;
1980                 read(ra, patid);
1981                 readCnt_--;
1982                 read(rb, patid);
1983         }
1984         virtual void resetForNextFile() {
1985                 first_ = true;
1986         }
1987         virtual void dump(ostream& out,
1988                           const String<Dna5>& seq,
1989                           const String<char>& qual,
1990                           const String<char>& name)
1991         {
1992                 out << ">" << name << endl << seq << endl;
1993         }
1994 private:
1995         bool first_;
1996         bool color_;
1997         bool solexa64_;
1998         bool phred64_;
1999         bool intQuals_;
2000 };
2001
2002
2003 /**
2004  * Tokenize a line of space-separated integer quality values.
2005  */
2006 static inline bool tokenizeQualLine(FileBuf& filebuf, char *buf, size_t buflen, vector<string>& toks) {
2007         size_t rd = filebuf.gets(buf, buflen);
2008         if(rd == 0) return false;
2009         assert(NULL == strrchr(buf, '\n'));
2010         tokenize(string(buf), " ", toks);
2011         return true;
2012 }
2013
2014 /**
2015  * Synchronized concrete pattern source for a list of files with tab-
2016  * delimited name, seq, qual fields (or, for paired-end reads,
2017  * basename, seq1, qual1, seq2, qual2).
2018  */
2019 class TabbedPatternSource : public BufferedFilePatternSource {
2020 public:
2021         TabbedPatternSource(uint32_t seed,
2022                             const vector<string>& infiles,
2023                             bool color,
2024                             bool randomizeQuals = false,
2025                             bool useSpinlock = true,
2026                             const char *dumpfile = NULL,
2027                             bool verbose = false,
2028                             int trim3 = 0,
2029                             int trim5 = 0,
2030                             bool solQuals = false,
2031                             bool phred64Quals = false,
2032                             bool intQuals = false,
2033                             uint32_t skip = 0) :
2034                 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals,
2035                                           useSpinlock, dumpfile, verbose,
2036                                           trim3, trim5, skip),
2037                 color_(color),
2038                 solQuals_(solQuals),
2039                 phred64Quals_(phred64Quals),
2040                 intQuals_(intQuals)
2041         { }
2042
2043 protected:
2044
2045         /// Read another pattern from a FASTA input file
2046         virtual void read(ReadBuf& r, uint32_t& patid) {
2047                 r.color = color_;
2048                 int trim5 = this->trim5_;
2049                 // fb_ is about to dish out the first character of the
2050                 // name field
2051                 if(parseName(r, NULL, '\t') == -1) {
2052                         peekOverNewline(fb_); // skip rest of line
2053                         r.clearAll();
2054                         return;
2055                 }
2056                 assert_neq('\t', fb_.peek());
2057
2058                 // fb_ is about to dish out the first character of the
2059                 // sequence field
2060                 int charsRead = 0;
2061                 int dstLen = parseSeq(r, charsRead, trim5, '\t');
2062                 assert_neq('\t', fb_.peek());
2063                 if(dstLen <= 0) {
2064                         peekOverNewline(fb_); // skip rest of line
2065                         r.clearAll();
2066                         return;
2067                 }
2068
2069                 // fb_ is about to dish out the first character of the
2070                 // quality-string field
2071                 char ct = 0;
2072                 if(parseQuals(r, charsRead, dstLen, trim5, ct, '\n') <= 0) {
2073                         peekOverNewline(fb_); // skip rest of line
2074                         r.clearAll();
2075                         return;
2076                 }
2077                 r.trimmed3 = this->trim3_;
2078                 r.trimmed5 = trim5;
2079                 assert_eq(ct, '\n');
2080                 assert_neq('\n', fb_.peek());
2081                 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
2082                 fb_.resetLastN();
2083                 // The last character read in parseQuals should have been a
2084                 // '\n'
2085
2086                 readCnt_++;
2087                 patid = readCnt_-1;
2088         }
2089
2090         /// Read another pair of patterns from a FASTA input file
2091         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2092                 // fb_ is about to dish out the first character of the
2093                 // name field
2094                 int mytrim5_1 = this->trim5_;
2095                 if(parseName(ra, &rb, '\t') == -1) {
2096                         peekOverNewline(fb_); // skip rest of line
2097                         ra.clearAll();
2098                         rb.clearAll();
2099                         fb_.resetLastN();
2100                         return;
2101                 }
2102                 assert_neq('\t', fb_.peek());
2103
2104                 // fb_ is about to dish out the first character of the
2105                 // sequence field for the first mate
2106                 int charsRead1 = 0;
2107                 int dstLen1 = parseSeq(ra, charsRead1, mytrim5_1, '\t');
2108                 if(dstLen1 <= -1) {
2109                         peekOverNewline(fb_); // skip rest of line
2110                         ra.clearAll();
2111                         rb.clearAll();
2112                         fb_.resetLastN();
2113                         return;
2114                 }
2115                 assert_neq('\t', fb_.peek());
2116
2117                 // fb_ is about to dish out the first character of the
2118                 // quality-string field
2119                 char ct = 0;
2120                 if(parseQuals(ra, charsRead1, dstLen1, mytrim5_1, ct, '\t', '\n') <= 0) {
2121                         peekOverNewline(fb_); // skip rest of line
2122                         ra.clearAll();
2123                         rb.clearAll();
2124                         fb_.resetLastN();
2125                         return;
2126                 }
2127                 ra.trimmed3 = this->trim3_;
2128                 ra.trimmed5 = mytrim5_1;
2129                 assert(ct == '\t' || ct == '\n');
2130                 if(ct == '\n') {
2131                         // Unpaired record; return.
2132                         rb.clearAll();
2133                         peekOverNewline(fb_);
2134                         ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf);
2135                         fb_.resetLastN();
2136                         readCnt_++;
2137                         patid = readCnt_-1;
2138                         return;
2139                 }
2140                 assert_neq('\t', fb_.peek());
2141
2142                 // fb_ is about to dish out the first character of the
2143                 // sequence field for the second mate
2144                 int charsRead2 = 0;
2145                 int mytrim5_2 = this->trim5_;
2146                 int dstLen2 = parseSeq(rb, charsRead2, mytrim5_2, '\t');
2147                 if(dstLen2 <= 0) {
2148                         peekOverNewline(fb_); // skip rest of line
2149                         ra.clearAll();
2150                         rb.clearAll();
2151                         fb_.resetLastN();
2152                         return;
2153                 }
2154                 assert_neq('\t', fb_.peek());
2155
2156                 // fb_ is about to dish out the first character of the
2157                 // quality-string field
2158                 if(parseQuals(rb, charsRead2, dstLen2, mytrim5_2, ct, '\n') <= 0) {
2159                         peekOverNewline(fb_); // skip rest of line
2160                         ra.clearAll();
2161                         rb.clearAll();
2162                         fb_.resetLastN();
2163                         return;
2164                 }
2165                 assert_eq('\n', ct);
2166                 if(fb_.peek() == '\n') {
2167                         assert(false);
2168                 }
2169                 peekOverNewline(fb_);
2170                 ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf);
2171                 fb_.resetLastN();
2172
2173                 rb.trimmed3 = this->trim3_;
2174                 rb.trimmed5 = mytrim5_2;
2175
2176                 // The last character read in parseQuals should have been a
2177                 // '\n'
2178
2179                 readCnt_++;
2180                 patid = readCnt_-1;
2181         }
2182
2183         /**
2184          * Dump a FASTQ-style record for the read.
2185          */
2186         virtual void dump(ostream& out,
2187                           const String<Dna5>& seq,
2188                           const String<char>& qual,
2189                           const String<char>& name)
2190         {
2191                 out << "@" << name << endl << seq << endl
2192                     << "+" << endl << qual << endl;
2193         }
2194 private:
2195
2196         /**
2197          * Parse a name from fb_ and store in r.  Assume that the next
2198          * character obtained via fb_.get() is the first character of
2199          * the sequence and the string stops at the next char upto (could
2200          * be tab, newline, etc.).
2201          */
2202         int parseName(ReadBuf& r, ReadBuf* r2, char upto = '\t') {
2203                 // Read the name out of the first field
2204                 int c = 0;
2205                 int nameLen = 0;
2206                 while(true) {
2207                         if((c = fb_.get()) < 0) {
2208                                 return -1;
2209                         }
2210                         if(c == upto) {
2211                                 // Finished with first field
2212                                 break;
2213                         }
2214                         if(c == '\n' || c == '\r') {
2215                                 return -1;
2216                         }
2217                         if(r2 != NULL) (*r2).nameBuf[nameLen] = c;
2218                         r.nameBuf[nameLen++] = c;
2219                 }
2220                 _setBegin(r.name, r.nameBuf);
2221                 _setLength(r.name, nameLen);
2222                 if(r2 != NULL) {
2223                         _setBegin((*r2).name, (*r2).nameBuf);
2224                         _setLength((*r2).name, nameLen);
2225                 }
2226                 // Set up a default name if one hasn't been set
2227                 if(nameLen == 0) {
2228                         itoa10(readCnt_, r.nameBuf);
2229                         _setBegin(r.name, r.nameBuf);
2230                         nameLen = strlen(r.nameBuf);
2231                         _setLength(r.name, nameLen);
2232                         if(r2 != NULL) {
2233                                 itoa10(readCnt_, (*r2).nameBuf);
2234                                 _setBegin((*r2).name, (*r2).nameBuf);
2235                                 _setLength((*r2).name, nameLen);
2236                         }
2237                 }
2238                 assert_gt(nameLen, 0);
2239                 return nameLen;
2240         }
2241
2242         /**
2243          * Parse a single sequence from fb_ and store in r.  Assume
2244          * that the next character obtained via fb_.get() is the first
2245          * character of the sequence and the sequence stops at the next
2246          * char upto (could be tab, newline, etc.).
2247          */
2248         int parseSeq(ReadBuf& r, int& charsRead, int& trim5, char upto = '\t') {
2249                 int begin = 0;
2250                 int dstLen = 0;
2251                 int c = fb_.get();
2252                 assert(c != upto);
2253                 r.color = color_;
2254                 if(color_) {
2255                         // This may be a primer character.  If so, keep it in the
2256                         // 'primer' field of the read buf and parse the rest of the
2257                         // read without it.
2258                         c = toupper(c);
2259                         if(asc2dnacat[c] > 0) {
2260                                 // First char is a DNA char
2261                                 int c2 = toupper(fb_.peek());
2262                                 // Second char is a color char
2263                                 if(asc2colcat[c2] > 0) {
2264                                         r.primer = c;
2265                                         r.trimc = c2;
2266                                         trim5 += 2; // trim primer and first color
2267                                 }
2268                         }
2269                         if(c < 0) { return -1; }
2270                 }
2271                 while(c != upto) {
2272                         if(color_) {
2273                                 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
2274                         }
2275                         if(c == '.') c = 'N';
2276                         if(isalpha(c)) {
2277                                 assert_in(toupper(c), "ACGTN");
2278                                 if(begin++ >= trim5) {
2279                                         assert_neq(0, dna4Cat[c]);
2280                                         if(dstLen + 1 > 1024) {
2281                                                 cerr << "Input file contained a pattern more than 1024 characters long.  Please truncate" << endl
2282                                                          << "reads and re-run Bowtie" << endl;
2283                                                 throw 1;
2284                                         }
2285                                         r.patBufFw[dstLen] = charToDna5[c];
2286                                         dstLen++;
2287                                 }
2288                                 charsRead++;
2289                         }
2290                         if((c = fb_.get()) < 0) {
2291                                 return -1;
2292                         }
2293                 }
2294                 dstLen -= this->trim3_;
2295                 _setBegin (r.patFw,  (Dna5*)r.patBufFw);
2296                 _setLength(r.patFw,  dstLen);
2297                 return dstLen;
2298         }
2299
2300         /**
2301          * Parse a single quality string from fb_ and store in r.
2302          * Assume that the next character obtained via fb_.get() is
2303          * the first character of the quality string and the string stops
2304          * at the next char upto (could be tab, newline, etc.).
2305          */
2306         int parseQuals(ReadBuf& r, int charsRead, int dstLen, int trim5,
2307                        char& c2, char upto = '\t', char upto2 = -1)
2308         {
2309                 int qualsRead = 0;
2310                 int c = 0;
2311                 if (intQuals_) {
2312                         char buf[4096];
2313                         while (qualsRead < charsRead) {
2314                                 vector<string> s_quals;
2315                                 if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break;
2316                                 for (unsigned int j = 0; j < s_quals.size(); ++j) {
2317                                         char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_);
2318                                         assert_geq(c, 33);
2319                                         if (qualsRead >= trim5) {
2320                                                 size_t off = qualsRead - trim5;
2321                                                 if(off >= 1024) tooManyQualities(r.name);
2322                                                 r.qualBuf[off] = c;
2323                                         }
2324                                         ++qualsRead;
2325                                 }
2326                         } // done reading integer quality lines
2327                         if (charsRead > qualsRead) tooFewQualities(r.name);
2328                 } else {
2329                         // Non-integer qualities
2330                         while((qualsRead < dstLen + trim5) && c >= 0) {
2331                                 c = fb_.get();
2332                                 c2 = c;
2333                                 if (c == ' ') wrongQualityFormat(r.name);
2334                                 if(c < 0) {
2335                                         // EOF occurred in the middle of a read - abort
2336                                         return -1;
2337                                 }
2338                                 if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
2339                                         if (qualsRead >= trim5) {
2340                                                 size_t off = qualsRead - trim5;
2341                                                 if(off >= 1024) tooManyQualities(r.name);
2342                                                 c = charToPhred33(c, solQuals_, phred64Quals_);
2343                                                 assert_geq(c, 33);
2344                                                 r.qualBuf[off] = c;
2345                                         }
2346                                         qualsRead++;
2347                                 } else {
2348                                         break;
2349                                 }
2350                         }
2351                         if(qualsRead != dstLen + trim5) {
2352                                 assert(false);
2353                         }
2354                 }
2355                 _setBegin (r.qual, (char*)r.qualBuf);
2356                 _setLength(r.qual, dstLen);
2357                 while(c != upto && (upto2 == -1 || c != upto2)) {
2358                         c = fb_.get();
2359                         c2 = c;
2360                 }
2361                 return qualsRead;
2362         }
2363
2364         bool color_;
2365         bool solQuals_;
2366         bool phred64Quals_;
2367         bool intQuals_;
2368 };
2369
2370 /**
2371  * Synchronized concrete pattern source for a list of FASTA files where
2372  * reads need to be extracted from long continuous sequences.
2373  */
2374 class FastaContinuousPatternSource : public BufferedFilePatternSource {
2375 public:
2376         FastaContinuousPatternSource(
2377                         uint32_t seed,
2378                         const vector<string>& infiles,
2379                         size_t length,
2380                         size_t freq,
2381                         bool useSpinlock = true,
2382                         const char *dumpfile = NULL,
2383                         bool verbose = false,
2384                         uint32_t skip = 0) :
2385                 BufferedFilePatternSource(seed, infiles, NULL, false, useSpinlock,
2386                                           dumpfile, verbose, 0, 0, skip),
2387                 length_(length), freq_(freq),
2388                 eat_(length_-1), beginning_(true),
2389                 nameChars_(0), bufCur_(0), subReadCnt_(0llu)
2390         {
2391                 resetForNextFile();
2392                 assert_lt(length_, (size_t)ReadBuf::BUF_SIZE);
2393         }
2394
2395         virtual void reset() {
2396                 BufferedFilePatternSource::reset();
2397                 resetForNextFile();
2398         }
2399
2400 protected:
2401         /// Read another pattern from a FASTA input file
2402         virtual void read(ReadBuf& r, uint32_t& patid) {
2403                 while(true) {
2404                         int c = fb_.get();
2405                         if(c < 0) {
2406                                 seqan::clear(r.patFw);
2407                                 return;
2408                         }
2409                         if(c == '>') {
2410                                 resetForNextFile();
2411                                 c = fb_.peek();
2412                                 bool sawSpace = false;
2413                                 while(c != '\n' && c != '\r') {
2414                                         if(!sawSpace) {
2415                                                 sawSpace = isspace(c);
2416                                         }
2417                                         if(!sawSpace) {
2418                                                 nameBuf_[nameChars_++] = c;
2419                                         }
2420                                         fb_.get();
2421                                         c = fb_.peek();
2422                                 }
2423                                 while(c == '\n' || c == '\r') {
2424                                         fb_.get();
2425                                         c = fb_.peek();
2426                                 }
2427                                 nameBuf_[nameChars_++] = '_';
2428                         } else {
2429                                 int cat = dna4Cat[c];
2430                                 if(cat == 2) c = 'N';
2431                                 if(cat == 0) {
2432                                         // Encountered non-DNA, non-IUPAC char; skip it
2433                                         continue;
2434                                 } else {
2435                                         // DNA char
2436                                         buf_[bufCur_++] = c;
2437                                         if(bufCur_ == 1024) bufCur_ = 0;
2438                                         if(eat_ > 0) {
2439                                                 eat_--;
2440                                                 // Try to keep readCnt_ aligned with the offset
2441                                                 // into the reference; that let's us see where
2442                                                 // the sampling gaps are by looking at the read
2443                                                 // name
2444                                                 if(!beginning_) readCnt_++;
2445                                                 continue;
2446                                         }
2447                                         for(size_t i = 0; i < length_; i++) {
2448                                                 if(length_ - i <= bufCur_) {
2449                                                         c = buf_[bufCur_ - (length_ - i)];
2450                                                 } else {
2451                                                         // Rotate
2452                                                         c = buf_[bufCur_ - (length_ - i) + 1024];
2453                                                 }
2454                                                 r.patBufFw [i] = charToDna5[c];
2455                                                 r.qualBuf[i] = 'I';
2456                                         }
2457                                         _setBegin (r.patFw,  (Dna5*)r.patBufFw);
2458                                         _setLength(r.patFw,  length_);
2459                                         _setBegin (r.qual, r.qualBuf);
2460                                         _setLength(r.qual, length_);
2461                                         // Set up a default name if one hasn't been set
2462                                         for(size_t i = 0; i < nameChars_; i++) {
2463                                                 r.nameBuf[i] = nameBuf_[i];
2464                                         }
2465                                         itoa10(readCnt_ - subReadCnt_, &r.nameBuf[nameChars_]);
2466                                         _setBegin(r.name, r.nameBuf);
2467                                         _setLength(r.name, strlen(r.nameBuf));
2468                                         eat_ = freq_-1;
2469                                         readCnt_++;
2470                                         beginning_ = false;
2471                                         patid = readCnt_-1;
2472                                         break;
2473                                 }
2474                         }
2475                 }
2476         }
2477         /// Shouldn't ever be here; it's not sensible to obtain read pairs
2478         // from a continuous input.
2479         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2480                 cerr << "In FastaContinuousPatternSource.readPair()" << endl;
2481                 throw 1;
2482         }
2483
2484         /**
2485          * Reset state to be read for the next file.
2486          */
2487         virtual void resetForNextFile() {
2488                 eat_ = length_-1;
2489                 beginning_ = true;
2490                 bufCur_ = nameChars_ = 0;
2491                 subReadCnt_ = readCnt_;
2492         }
2493
2494 private:
2495         size_t length_;     /// length of reads to generate
2496         size_t freq_;       /// frequency to sample reads
2497         size_t eat_;        /// number of characters we need to skip before
2498                             /// we have flushed all of the ambiguous or
2499                             /// non-existent characters out of our read
2500                             /// window
2501         bool beginning_;    /// skipping over the first read length?
2502         char buf_[1024];    /// read buffer
2503         char nameBuf_[1024];/// read buffer for name of fasta record being
2504                             /// split into mers
2505         size_t nameChars_;  /// number of name characters copied into buf
2506         size_t bufCur_;     /// buffer cursor; points to where we should
2507                             /// insert the next character
2508         uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
2509                             /// the pat id to output (so it resets to 0 for
2510                             /// each new sequence)
2511 };
2512
2513 /**
2514  * Read a FASTQ-format file.
2515  * See: http://maq.sourceforge.net/fastq.shtml
2516  */
2517 class FastqPatternSource : public BufferedFilePatternSource {
2518 public:
2519         FastqPatternSource(uint32_t seed,
2520                            const vector<string>& infiles,
2521                            bool color,
2522                            bool randomizeQuals = false,
2523                            bool useSpinlock = true,
2524                            const char *dumpfile = NULL,
2525                            bool verbose = false,
2526                            int trim3 = 0,
2527                            int trim5 = 0,
2528                            bool solexa_quals = false,
2529                            bool phred64Quals = false,
2530                            bool integer_quals = false,
2531                            bool fuzzy = false,
2532                            uint32_t skip = 0) :
2533                 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals,
2534                                           useSpinlock, dumpfile, verbose,
2535                                           trim3, trim5, skip),
2536                 first_(true),
2537                 solQuals_(solexa_quals),
2538                 phred64Quals_(phred64Quals),
2539                 intQuals_(integer_quals),
2540                 fuzzy_(fuzzy),
2541                 color_(color)
2542         { }
2543         virtual void reset() {
2544                 first_ = true;
2545                 fb_.resetLastN();
2546                 BufferedFilePatternSource::reset();
2547         }
2548 protected:
2549         /**
2550          * Scan to the next FASTQ record (starting with @) and return the first
2551          * character of the record (which will always be @).  Since the quality
2552          * line may start with @, we keep scanning until we've seen a line
2553          * beginning with @ where the line two lines back began with +.
2554          */
2555         static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
2556                 int line = 0;
2557                 int plusLine = -1;
2558                 int c = in.get();
2559                 int firstc = c;
2560                 while(true) {
2561                         if(line > 20) {
2562                                 // If we couldn't find our desired '@' in the first 20
2563                                 // lines, it's time to give up
2564                                 if(firstc == '>') {
2565                                         // That firstc is '>' may be a hint that this is
2566                                         // actually a FASTA file, so return it intact
2567                                         return '>';
2568                                 }
2569                                 // Return an error
2570                                 return -1;
2571                         }
2572                         if(c == -1) return -1;
2573                         if(c == '\n') {
2574                                 c = in.get();
2575                                 if(c == '@' && sawPlus && plusLine == (line-2)) {
2576                                         return '@';
2577                                 }
2578                                 else if(c == '+') {
2579                                         // Saw a '+' at the beginning of a line; remember where
2580                                         // we saw it
2581                                         sawPlus = true;
2582                                         plusLine = line;
2583                                 }
2584                                 else if(c == -1) {
2585                                         return -1;
2586                                 }
2587                                 line++;
2588                         }
2589                         c = in.get();
2590                 }
2591         }
2592
2593         /// Read another pattern from a FASTQ input file
2594         virtual void read(ReadBuf& r, uint32_t& patid) {
2595                 const int bufSz = ReadBuf::BUF_SIZE;
2596                 while(true) {
2597                         int c;
2598                         int dstLen = 0;
2599                         int nameLen = 0;
2600                         r.fuzzy = fuzzy_;
2601                         r.color = color_;
2602                         r.primer = -1;
2603                         r.alts = 0;
2604                         // Pick off the first at
2605                         if(first_) {
2606                                 c = fb_.get();
2607                                 if(c != '@') {
2608                                         c = getOverNewline(fb_);
2609                                         if(c < 0) { bail(r); return; }
2610                                 }
2611                                 if(c != '@') {
2612                                         cerr << "Error: reads file does not look like a FASTQ file" << endl;
2613                                         throw 1;
2614                                 }
2615                                 assert_eq('@', c);
2616                                 first_ = false;
2617                         }
2618
2619                         // Read to the end of the id line, sticking everything after the '@'
2620                         // into *name
2621                         while(true) {
2622                                 c = fb_.get();
2623                                 if(c < 0) { bail(r); return; }
2624                                 if(c == '\n' || c == '\r') {
2625                                         // Break at end of line, after consuming all \r's, \n's
2626                                         while(c == '\n' || c == '\r') {
2627                                                 c = fb_.get();
2628                                                 if(c < 0) { bail(r); return; }
2629                                         }
2630                                         break;
2631                                 }
2632                                 r.nameBuf[nameLen++] = c;
2633                                 if(nameLen > bufSz-2) {
2634                                         // Too many chars in read name; print friendly error message
2635                                         _setBegin(r.name, r.nameBuf);
2636                                         _setLength(r.name, nameLen);
2637                                         cerr << "FASTQ read name is too long; read names must be " << (bufSz-2) << " characters or fewer." << endl;
2638                                         cerr << "Beginning of bad read name: " << r.name << endl;
2639                                         throw 1;
2640                                 }
2641                         }
2642                         _setBegin(r.name, r.nameBuf);
2643                         assert_leq(nameLen, bufSz-2);
2644                         _setLength(r.name, nameLen);
2645                         // c now holds the first character on the line after the
2646                         // @name line
2647
2648                         // fb_ now points just past the first character of a
2649                         // sequence line, and c holds the first character
2650                         int charsRead = 0;
2651                         uint8_t *sbuf = r.patBufFw;
2652                         int dstLens[] = {0, 0, 0, 0};
2653                         int *dstLenCur = &dstLens[0];
2654                         int mytrim5 = this->trim5_;
2655                         int altBufIdx = 0;
2656                         if(color_) {
2657                                 // This may be a primer character.  If so, keep it in the
2658                                 // 'primer' field of the read buf and parse the rest of the
2659                                 // read without it.
2660                                 c = toupper(c);
2661                                 if(asc2dnacat[c] > 0) {
2662                                         // First char is a DNA char
2663                                         int c2 = toupper(fb_.peek());
2664                                         // Second char is a color char
2665                                         if(asc2colcat[c2] > 0) {
2666                                                 r.primer = c;
2667                                                 r.trimc = c2;
2668                                                 mytrim5 += 2; // trim primer and first color
2669                                         }
2670                                 }
2671                                 if(c < 0) { bail(r); return; }
2672                         }
2673                         int trim5 = mytrim5;
2674                         if(c == '+') {
2675                                 // Read had length 0; print warning (if not quiet) and quit
2676                                 if(!quiet) {
2677                                         cerr << "Warning: Skipping read (" << r.name << ") because it had length 0" << endl;
2678                                 }
2679                                 peekToEndOfLine(fb_);
2680                                 fb_.get();
2681                                 continue;
2682                         }
2683                         while(c != '+') {
2684                                 // Convert color numbers to letters if necessary
2685                                 if(color_) {
2686                                         if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
2687                                 }
2688                                 if(c == '.') c = 'N';
2689                                 if(fuzzy_ && c == '-') c = 'A';
2690                                 if(isalpha(c)) {
2691                                         // If it's past the 5'-end trim point
2692                                         assert_in(toupper(c), "ACGTN");
2693                                         if(charsRead >= trim5) {
2694                                                 if((*dstLenCur) >= 1024) tooManySeqChars(r.name);
2695                                                 sbuf[(*dstLenCur)++] = charToDna5[c];
2696                                         }
2697                                         charsRead++;
2698                                 } else if(fuzzy_ && c == ' ') {
2699                                         trim5 = 0; // disable 5' trimming for now
2700                                         if(charsRead == 0) {
2701                                                 c = fb_.get();
2702                                                 continue;
2703                                         }
2704                                         charsRead = 0;
2705                                         if(altBufIdx >= 3) {
2706                                                 cerr << "At most 3 alternate sequence strings permitted; offending read: " << r.name << endl;
2707                                                 throw 1;
2708                                         }
2709                                         // Move on to the next alternate-sequence buffer
2710                                         sbuf = r.altPatBufFw[altBufIdx++];
2711                                         dstLenCur = &dstLens[altBufIdx];
2712                                 }
2713                                 c = fb_.get();
2714                                 if(c < 0) { bail(r); return; }
2715                         }
2716                         // Trim from 3' end
2717                         dstLen = dstLens[0];
2718                         charsRead = dstLen + mytrim5;
2719                         dstLen -= this->trim3_;
2720                         // Set trimmed bounds of buffers
2721                         _setBegin(r.patFw, (Dna5*)r.patBufFw);
2722                         _setLength(r.patFw, dstLen);
2723                         assert_eq('+', c);
2724
2725                         // Chew up the optional name on the '+' line
2726                         peekToEndOfLine(fb_);
2727
2728                         // Now read the qualities
2729                         if (intQuals_) {
2730                                 assert(!fuzzy_);
2731                                 int qualsRead = 0;
2732                                 char buf[4096];
2733                                 if(color_ && r.primer != -1) mytrim5--;
2734                                 while (qualsRead < charsRead) {
2735                                         vector<string> s_quals;
2736                                         if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break;
2737                                         for (unsigned int j = 0; j < s_quals.size(); ++j) {
2738                                                 char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_);
2739                                                 assert_geq(c, 33);
2740                                                 if (qualsRead >= mytrim5) {
2741                                                         size_t off = qualsRead - mytrim5;
2742                                                         if(off >= 1024) tooManyQualities(r.name);
2743                                                         r.qualBuf[off] = c;
2744                                                 }
2745                                                 ++qualsRead;
2746                                         }
2747                                 } // done reading integer quality lines
2748                                 if(color_ && r.primer != -1) mytrim5++;
2749                                 if(qualsRead < charsRead-mytrim5) {
2750                                         tooFewQualities(r.name);
2751                                 } else if(qualsRead > charsRead-mytrim5+1) {
2752                                         tooManyQualities(r.name);
2753                                 }
2754                                 if(qualsRead == charsRead-mytrim5+1 && color_ && r.primer != -1) {
2755                                         for(int i = 0; i < qualsRead-1; i++) {
2756                                                 r.qualBuf[i] = r.qualBuf[i+1];
2757                                         }
2758                                 }
2759                                 _setBegin(r.qual, (char*)r.qualBuf);
2760                                 _setLength(r.qual, dstLen);
2761                                 peekOverNewline(fb_);
2762                         } else {
2763                                 // Non-integer qualities
2764                                 char *qbuf = r.qualBuf;
2765                                 altBufIdx = 0;
2766                                 trim5 = mytrim5;
2767                                 if(color_ && r.primer != -1) trim5--;
2768                                 int itrim5 = trim5;
2769                                 int qualsRead[4] = {0, 0, 0, 0};
2770                                 int *qualsReadCur = &qualsRead[0];
2771                                 while(true) {
2772                                         c = fb_.get();
2773                                         if (!fuzzy_ && c == ' ') {
2774                                                 wrongQualityFormat(r.name);
2775                                         } else if(c == ' ') {
2776                                                 trim5 = 0; // disable 5' trimming for now
2777                                                 if((*qualsReadCur) == 0) continue;
2778                                                 if(altBufIdx >= 3) {
2779                                                         cerr << "At most 3 alternate quality strings permitted; offending read: " << r.name << endl;
2780                                                         throw 1;
2781                                                 }
2782                                                 qbuf = r.altQualBuf[altBufIdx++];
2783                                                 qualsReadCur = &qualsRead[altBufIdx];
2784                                                 continue;
2785                                         }
2786                                         if(c < 0) { bail(r); return; }
2787                                         if (c != '\r' && c != '\n') {
2788                                                 if (*qualsReadCur >= trim5) {
2789                                                         size_t off = (*qualsReadCur) - trim5;
2790                                                         if(off >= 1024) tooManyQualities(r.name);
2791                                                         c = charToPhred33(c, solQuals_, phred64Quals_);
2792                                                         assert_geq(c, 33);
2793                                                         qbuf[off] = c;
2794                                                 }
2795                                                 (*qualsReadCur)++;
2796                                         } else {
2797                                                 break;
2798                                         }
2799                                 }
2800                                 qualsRead[0] -= this->trim3_;
2801                                 int qRead = (int)(qualsRead[0] - itrim5);
2802                                 if(qRead < dstLen) {
2803                                         tooFewQualities(r.name);
2804                                 } else if(qRead > dstLen+1) {
2805                                         tooManyQualities(r.name);
2806                                 }
2807                                 if(qRead == dstLen+1 && color_ && r.primer != -1) {
2808                                         for(int i = 0; i < dstLen; i++) {
2809                                                 r.qualBuf[i] = r.qualBuf[i+1];
2810                                         }
2811                                 }
2812                                 _setBegin (r.qual, (char*)r.qualBuf);
2813                                 _setLength(r.qual, dstLen);
2814
2815                                 if(fuzzy_) {
2816                                         // Trim from 3' end of alternate basecall and quality strings
2817                                         if(this->trim3_ > 0) {
2818                                                 for(int i = 1; i < 4; i++) {
2819                                                         assert_eq(qualsRead[i], dstLens[i]);
2820                                                         qualsRead[i] = dstLens[i] =
2821                                                                 max<int>(0, dstLens[i] - this->trim3_);
2822                                                 }
2823                                         }
2824                                         // Shift to RHS, and install in Strings
2825                                         assert_eq(0, r.alts);
2826                                         for(int i = 1; i < 4; i++) {
2827                                                 if(qualsRead[i] == 0) continue;
2828                                                 if(qualsRead[i] > dstLen) {
2829                                                         // Shift everybody up
2830                                                         int shiftAmt = qualsRead[i] - dstLen;
2831                                                         for(int j = 0; j < dstLen; j++) {
2832                                                                 r.altQualBuf[i-1][j]  = r.altQualBuf[i-1][j+shiftAmt];
2833                                                                 r.altPatBufFw[i-1][j] = r.altPatBufFw[i-1][j+shiftAmt];
2834                                                         }
2835                                                 } else if (qualsRead[i] < dstLen) {
2836                                                         // Shift everybody down
2837                                                         int shiftAmt = dstLen - qualsRead[i];
2838                                                         for(int j = dstLen-1; j >= shiftAmt; j--) {
2839                                                                 r.altQualBuf[i-1][j]  = r.altQualBuf[i-1][j-shiftAmt];
2840                                                                 r.altPatBufFw[i-1][j] = r.altPatBufFw[i-1][j-shiftAmt];
2841                                                         }
2842                                                         // Fill in unset positions
2843                                                         for(int j = 0; j < shiftAmt; j++) {
2844                                                                 // '!' - indicates no alternate basecall at
2845                                                                 // this position
2846                                                                 r.altQualBuf[i-1][j] = (char)33;
2847                                                         }
2848                                                 }
2849                                                 _setBegin (r.altPatFw[i-1], (Dna5*)r.altPatBufFw[i-1]);
2850                                                 _setLength(r.altPatFw[i-1], dstLen);
2851                                                 _setBegin (r.altQual[i-1], (char*)r.altQualBuf[i-1]);
2852                                                 _setLength(r.altQual[i-1], dstLen);
2853                                                 r.alts++;
2854                                         }
2855                                 }
2856
2857                                 if(c == '\r' || c == '\n') {
2858                                         c = peekOverNewline(fb_);
2859                                 } else {
2860                                         c = peekToEndOfLine(fb_);
2861                                 }
2862                         }
2863                         r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
2864                         fb_.resetLastN();
2865
2866                         c = fb_.get();
2867                         assert(c == -1 || c == '@');
2868
2869                         // Set up a default name if one hasn't been set
2870                         if(nameLen == 0) {
2871                                 itoa10(readCnt_, r.nameBuf);
2872                                 _setBegin(r.name, r.nameBuf);
2873                                 nameLen = strlen(r.nameBuf);
2874                                 _setLength(r.name, nameLen);
2875                         }
2876                         r.trimmed3 = this->trim3_;
2877                         r.trimmed5 = mytrim5;
2878                         assert_gt(nameLen, 0);
2879                         readCnt_++;
2880                         patid = readCnt_-1;
2881                         return;
2882                 }
2883         }
2884         /// Read another read pair from a FASTQ input file
2885         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2886                 // (For now, we shouldn't ever be here)
2887                 cerr << "In FastqPatternSource.readPair()" << endl;
2888                 throw 1;
2889                 read(ra, patid);
2890                 readCnt_--;
2891                 read(rb, patid);
2892         }
2893         virtual void resetForNextFile() {
2894                 first_ = true;
2895         }
2896         virtual void dump(ostream& out,
2897                           const String<Dna5>& seq,
2898                           const String<char>& qual,
2899                           const String<char>& name)
2900         {
2901                 out << "@" << name << endl << seq << endl << "+" << endl << qual << endl;
2902         }
2903 private:
2904
2905         /**
2906          * Do things we need to do if we have to bail in the middle of a
2907          * read, usually because we reached the end of the input without
2908          * finishing.
2909          */
2910         void bail(ReadBuf& r) {
2911                 seqan::clear(r.patFw);
2912                 fb_.resetLastN();
2913         }
2914
2915         bool first_;
2916         bool solQuals_;
2917         bool phred64Quals_;
2918         bool intQuals_;
2919         bool fuzzy_;
2920         bool color_;
2921 };
2922
2923 /**
2924  * Read a Raw-format file (one sequence per line).  No quality strings
2925  * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
2926  * scale).
2927  */
2928 class RawPatternSource : public BufferedFilePatternSource {
2929 public:
2930         RawPatternSource(uint32_t seed,
2931                          const vector<string>& infiles,
2932                          bool color,
2933                          bool randomizeQuals = false,
2934                          bool useSpinlock = true,
2935                          const char *dumpfile = NULL,
2936                          bool verbose = false,
2937                          int trim3 = 0,
2938                          int trim5 = 0,
2939                          uint32_t skip = 0) :
2940                 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals, useSpinlock,
2941                                           dumpfile, verbose, trim3, trim5, skip),
2942                 first_(true), color_(color)
2943         { }
2944         virtual void reset() {
2945                 first_ = true;
2946                 BufferedFilePatternSource::reset();
2947         }
2948 protected:
2949         /// Read another pattern from a Raw input file
2950         virtual void read(ReadBuf& r, uint32_t& patid) {
2951                 int c;
2952                 int dstLen = 0;
2953                 int nameLen = 0;
2954                 c = getOverNewline(this->fb_);
2955                 if(c < 0) { bail(r); return; }
2956                 assert(!isspace(c));
2957                 r.color = color_;
2958                 int mytrim5 = this->trim5_;
2959                 if(first_) {
2960                         // Check that the first character is sane for a raw file
2961                         int cc = c;
2962                         if(color_) {
2963                                 if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
2964                                 if(cc == '.') cc = 'N';
2965                         }
2966                         if(dna4Cat[cc] == 0) {
2967                                 cerr << "Error: reads file does not look like a Raw file" << endl;
2968                                 if(c == '>') {
2969                                         cerr << "Reads file looks like a FASTA file; please use -f" << endl;
2970                                 }
2971                                 if(c == '@') {
2972                                         cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
2973                                 }
2974                                 throw 1;
2975                         }
2976                         first_ = false;
2977                 }
2978                 if(color_) {
2979                         // This may be a primer character.  If so, keep it in the
2980                         // 'primer' field of the read buf and parse the rest of the
2981                         // read without it.
2982                         c = toupper(c);
2983                         if(asc2dnacat[c] > 0) {
2984                                 // First char is a DNA char
2985                                 int c2 = toupper(fb_.peek());
2986                                 // Second char is a color char
2987                                 if(asc2colcat[c2] > 0) {
2988                                         r.primer = c;
2989                                         r.trimc = c2;
2990                                         mytrim5 += 2; // trim primer and first color
2991                                 }
2992                         }
2993                         if(c < 0) { bail(r); return; }
2994                 }
2995                 // _in now points just past the first character of a sequence
2996                 // line, and c holds the first character
2997                 while(!isspace(c) && c >= 0) {
2998                         if(color_) {
2999                                 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
3000                         }
3001                         if(c == '.') c = 'N';
3002                         if(isalpha(c) && dstLen >= mytrim5) {
3003                                 size_t len = dstLen - mytrim5;
3004                                 if(len >= 1024) tooManyQualities(String<char>("(no name)"));
3005                                 r.patBufFw [len] = charToDna5[c];
3006                                 r.qualBuf[len] = 'I';
3007                                 dstLen++;
3008                         } else if(isalpha(c)) dstLen++;
3009                         if(isspace(fb_.peek())) break;
3010                         c = fb_.get();
3011                 }
3012                 if(dstLen >= (this->trim3_ + mytrim5)) {
3013                         dstLen -= (this->trim3_ + mytrim5);
3014                 } else {
3015                         dstLen = 0;
3016                 }
3017                 _setBegin (r.patFw,  (Dna5*)r.patBufFw);
3018                 _setLength(r.patFw,  dstLen);
3019                 _setBegin (r.qual, r.qualBuf);
3020                 _setLength(r.qual, dstLen);
3021
3022                 c = peekToEndOfLine(fb_);
3023                 r.trimmed3 = this->trim3_;
3024                 r.trimmed5 = mytrim5;
3025                 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
3026                 fb_.resetLastN();
3027
3028                 // Set up name
3029                 itoa10(readCnt_, r.nameBuf);
3030                 _setBegin(r.name, r.nameBuf);
3031                 nameLen = strlen(r.nameBuf);
3032                 _setLength(r.name, nameLen);
3033                 readCnt_++;
3034
3035                 patid = readCnt_-1;
3036         }
3037         /// Read another read pair from a FASTQ input file
3038         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
3039                 // (For now, we shouldn't ever be here)
3040                 cerr << "In RawPatternSource.readPair()" << endl;
3041                 throw 1;
3042                 read(ra, patid);
3043                 readCnt_--;
3044                 read(rb, patid);
3045         }
3046         virtual void resetForNextFile() {
3047                 first_ = true;
3048         }
3049         virtual void dump(ostream& out,
3050                           const String<Dna5>& seq,
3051                           const String<char>& qual,
3052                           const String<char>& name)
3053         {
3054                 out << seq << endl;
3055         }
3056 private:
3057         /**
3058          * Do things we need to do if we have to bail in the middle of a
3059          * read, usually because we reached the end of the input without
3060          * finishing.
3061          */
3062         void bail(ReadBuf& r) {
3063                 seqan::clear(r.patFw);
3064                 fb_.resetLastN();
3065         }
3066         bool first_;
3067         bool color_;
3068 };
3069
3070 /**
3071  * Read a Raw-format file (one sequence per line).  No quality strings
3072  * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
3073  * scale).
3074  */
3075 class ChainPatternSource : public BufferedFilePatternSource {
3076 public:
3077         ChainPatternSource(uint32_t seed,
3078                            const vector<string>& infiles,
3079                            bool useSpinlock,
3080                            const char *dumpfile,
3081                            bool verbose,
3082                            uint32_t skip) :
3083         BufferedFilePatternSource(
3084                 seed, infiles, NULL, false, useSpinlock, dumpfile, verbose, 0, 0, skip) { }
3085
3086 protected:
3087
3088         /// Read another pattern from a Raw input file
3089         virtual void read(ReadBuf& r, uint32_t& patid) {
3090                 fb_.peek();
3091                 if(fb_.eof()) {
3092                         fb_.resetLastN();
3093                         seqan::clear(r.patFw);
3094                         return;
3095                 }
3096                 do {
3097                         r.hitset.deserialize(fb_);
3098                 } while(!r.hitset.initialized() && !fb_.eof());
3099                 if(!r.hitset.initialized()) {
3100                         fb_.resetLastN();
3101                         seqan::clear(r.patFw);
3102                         return;
3103                 }
3104                 // Now copy the name/sequence/quals into r.name/r.patFw/r.qualFw
3105                 _setBegin(r.name, (char*)r.nameBuf);
3106                 _setCapacity(r.name, seqan::length(r.hitset.name));
3107                 _setLength(r.name, seqan::length(r.hitset.name));
3108                 memcpy(r.nameBuf, seqan::begin(r.hitset.name), seqan::length(r.hitset.name));
3109                 _setBegin (r.patFw, (Dna5*)r.patBufFw);
3110                 _setCapacity(r.patFw, seqan::length(r.hitset.seq));
3111                 _setLength(r.patFw, seqan::length(r.hitset.seq));
3112                 memcpy(r.patBufFw, seqan::begin(r.hitset.seq), seqan::length(r.hitset.seq));
3113                 _setBegin (r.qual, r.qualBuf);
3114                 _setCapacity(r.qual, seqan::length(r.hitset.qual));
3115                 _setLength(r.qual, seqan::length(r.hitset.qual));
3116                 memcpy(r.qualBuf, seqan::begin(r.hitset.qual), seqan::length(r.hitset.qual));
3117
3118                 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
3119                 fb_.resetLastN();
3120
3121                 readCnt_++;
3122                 patid = readCnt_-1;
3123         }
3124
3125         /// Read another read pair
3126         virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
3127                 // (For now, we shouldn't ever be here)
3128                 cerr << "In ChainPatternSource.readPair()" << endl;
3129                 throw 1;
3130         }
3131 };
3132
3133 #endif /*PAT_H_*/