12 #include <seqan/sequence.h>
14 #include "assert_helpers.h"
16 #include "random_source.h"
18 #include "threading.h"
22 #include "search_globals.h"
25 * Classes and routines for reading reads from various input sources.
29 using namespace seqan;
31 /// Constructs string base-10 representation of integer 'value'
32 extern char* itoa10(int value, char* result);
35 * Calculate a per-read random seed based on a combination of
36 * the read data (incl. sequence, name, quals) and the global
37 * seed in '_randSeed'.
39 static inline uint32_t genRandSeed(const String<Dna5>& qry,
40 const String<char>& qual,
41 const String<char>& name,
44 // Calculate a per-read random seed based on a combination of
45 // the read data (incl. sequence, name, quals) and the global
47 uint32_t rseed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
48 size_t qlen = seqan::length(qry);
49 // Throw all the characters of the read into the random seed
50 for(size_t i = 0; i < qlen; i++) {
53 size_t off = ((i & 15) << 1);
56 // Throw all the quality values for the read into the random
58 for(size_t i = 0; i < qlen; i++) {
61 size_t off = ((i & 3) << 3);
64 // Throw all the characters in the read name into the random
66 size_t namelen = seqan::length(name);
67 for(size_t i = 0; i < namelen; i++) {
70 size_t off = ((i & 3) << 3);
77 * A buffer for keeping all relevant information about a single read.
78 * Each search thread has one.
81 ReadBuf() { reset(); }
85 // Prevent seqan from trying to free buffers
86 _setBegin(patFw, NULL);
87 _setBegin(patRc, NULL);
88 _setBegin(qual, NULL);
89 _setBegin(patFwRev, NULL);
90 _setBegin(patRcRev, NULL);
91 _setBegin(qualRev, NULL);
92 _setBegin(name, NULL);
93 for(int j = 0; j < 3; j++) {
94 _setBegin(altPatFw[j], NULL);
95 _setBegin(altPatFwRev[j], NULL);
96 _setBegin(altPatRc[j], NULL);
97 _setBegin(altPatRcRev[j], NULL);
98 _setBegin(altQual[j], NULL);
99 _setBegin(altQualRev[j], NULL);
103 #define RESET_BUF(str, buf, typ) _setBegin(str, (typ*)buf); _setLength(str, 0); _setCapacity(str, BUF_SIZE);
104 #define RESET_BUF_LEN(str, buf, len, typ) _setBegin(str, (typ*)buf); _setLength(str, len); _setCapacity(str, BUF_SIZE);
106 /// Point all Strings to the beginning of their respective buffers
107 /// and set all lengths to 0
113 trimmed5 = trimmed3 = 0;
119 RESET_BUF(patFw, patBufFw, Dna5);
120 RESET_BUF(patRc, patBufRc, Dna5);
121 RESET_BUF(qual, qualBuf, char);
122 RESET_BUF(patFwRev, patBufFwRev, Dna5);
123 RESET_BUF(patRcRev, patBufRcRev, Dna5);
124 RESET_BUF(qualRev, qualBufRev, char);
125 RESET_BUF(name, nameBuf, char);
126 for(int j = 0; j < 3; j++) {
127 RESET_BUF(altPatFw[j], altPatBufFw[j], Dna5);
128 RESET_BUF(altPatFwRev[j], altPatBufFwRev[j], Dna5);
129 RESET_BUF(altPatRc[j], altPatBufRc[j], Dna5);
130 RESET_BUF(altPatRcRev[j], altPatBufRcRev[j], Dna5);
131 RESET_BUF(altQual[j], altQualBuf[j], char);
132 RESET_BUF(altQualRev[j], altQualBufRev[j], char);
140 seqan::clear(patFwRev);
141 seqan::clear(patRcRev);
142 seqan::clear(qualRev);
144 for(int j = 0; j < 3; j++) {
145 seqan::clear(altPatFw[j]);
146 seqan::clear(altPatFwRev[j]);
147 seqan::clear(altPatRc[j]);
148 seqan::clear(altPatRcRev[j]);
149 seqan::clear(altQual[j]);
150 seqan::clear(altQualRev[j]);
152 trimmed5 = trimmed3 = 0;
155 color = fuzzy = false;
161 /// Return true iff the read (pair) is empty
163 return seqan::empty(patFw);
166 /// Return length of the read in the buffer
167 uint32_t length() const {
168 return seqan::length(patFw);
172 * Construct reverse complement of the pattern and the fuzzy
173 * alternative patters. If read is in colorspace, just reverse
176 void constructRevComps() {
177 uint32_t len = length();
179 RESET_BUF_LEN(patRc, patBufRc, len, Dna5);
180 for(int j = 0; j < alts; j++) {
181 RESET_BUF_LEN(altPatRc[j], altPatBufRc[j], len, Dna5);
184 for(uint32_t i = 0; i < len; i++) {
185 // Reverse the sequence
186 patBufRc[i] = patBufFw[len-i-1];
187 for(int j = 0; j < alts; j++) {
188 // Reverse the fuzzy alternatives
189 altPatBufRc[j][i] = altPatBufFw[j][len-i-1];
193 for(uint32_t i = 0; i < len; i++) {
194 // Reverse-complement the sequence
195 patBufRc[i] = (patBufFw[len-i-1] == 4) ? 4 : (patBufFw[len-i-1] ^ 3);
196 for(int j = 0; j < alts; j++) {
197 // Reverse-complement the fuzzy alternatives
198 altPatBufRc[j][i] = (altPatBufFw[j][len-i-1] == 4) ? 4 : (altPatBufFw[j][len-i-1] ^ 3);
205 * Given patFw, patRc, and qual, construct the *Rev versions in
206 * place. Assumes constructRevComps() was called previously.
208 void constructReverses() {
209 uint32_t len = length();
211 RESET_BUF_LEN(patFwRev, patBufFwRev, len, Dna5);
212 RESET_BUF_LEN(patRcRev, patBufRcRev, len, Dna5);
213 RESET_BUF_LEN(qualRev, qualBufRev, len, char);
214 for(int j = 0; j < alts; j++) {
215 RESET_BUF_LEN(altPatFwRev[j], altPatBufFwRev[j], len, Dna5);
216 RESET_BUF_LEN(altPatRcRev[j], altPatBufRcRev[j], len, Dna5);
217 RESET_BUF_LEN(altQualRev[j], altQualBufRev[j], len, char);
219 for(uint32_t i = 0; i < len; i++) {
220 patFwRev[i] = patFw[len-i-1];
221 patRcRev[i] = patRc[len-i-1];
222 qualRev[i] = qual[len-i-1];
223 for(int j = 0; j < alts; j++) {
224 altPatFwRev[j][i] = altPatFw[j][len-i-1];
225 altPatRcRev[j][i] = altPatRc[j][len-i-1];
226 altQualRev[j][i] = altQual[j][len-i-1];
232 * Append a "/1" or "/2" string onto the end of the name buf if
233 * it's not already there.
235 void fixMateName(int i) {
236 assert(i == 1 || i == 2);
237 size_t namelen = seqan::length(name);
240 // Name is too short to possibly have /1 or /2 on the end
244 // append = true iff mate name does not already end in /1
246 nameBuf[namelen-2] != '/' ||
247 nameBuf[namelen-1] != '1';
249 // append = true iff mate name does not already end in /2
251 nameBuf[namelen-2] != '/' ||
252 nameBuf[namelen-1] != '2';
256 assert_leq(namelen, BUF_SIZE-2);
257 _setLength(name, namelen + 2);
258 nameBuf[namelen] = '/';
259 nameBuf[namelen+1] = "012"[i];
264 * Dump basic information about this read to the given ostream.
266 void dump(std::ostream& os) const {
269 for(size_t i = 0; i < seqan::length(patFw); i++) {
270 os << "0123."[(int)patFw[i]];
276 // Print out the fuzzy alternative sequences
277 for(int j = 0; j < 3; j++) {
278 bool started = false;
279 if(seqan::length(altQual[j]) > 0) {
280 for(size_t i = 0; i < length(); i++) {
281 if(altQual[j][i] != '!') {
285 if(altQual[j][i] == '!') {
289 os << "0123."[(int)altPatFw[j][i]];
291 os << altPatFw[j][i];
300 // Print out the fuzzy alternative quality strings
301 for(int j = 0; j < 3; j++) {
302 bool started = false;
303 if(seqan::length(altQual[j]) > 0) {
304 for(size_t i = 0; i < length(); i++) {
305 if(altQual[j][i] != '!') {
322 * Write read details to a HitSet object.
324 void toHitSet(HitSet& hs) {
332 static const int BUF_SIZE = 1024;
334 String<Dna5> patFw; // forward-strand sequence
335 uint8_t patBufFw[BUF_SIZE]; // forward-strand sequence buffer
336 String<Dna5> patRc; // reverse-complement sequence
337 uint8_t patBufRc[BUF_SIZE]; // reverse-complement sequence buffer
338 String<char> qual; // quality values
339 char qualBuf[BUF_SIZE]; // quality value buffer
341 String<Dna5> altPatFw[3]; // forward-strand sequence
342 uint8_t altPatBufFw[3][BUF_SIZE]; // forward-strand sequence buffer
343 String<Dna5> altPatRc[3]; // reverse-complement sequence
344 uint8_t altPatBufRc[3][BUF_SIZE]; // reverse-complement sequence buffer
345 String<char> altQual[3]; // quality values for alternate basecalls
346 char altQualBuf[3][BUF_SIZE]; // quality value buffer for alternate basecalls
348 String<Dna5> patFwRev; // forward-strand sequence reversed
349 uint8_t patBufFwRev[BUF_SIZE]; // forward-strand sequence buffer reversed
350 String<Dna5> patRcRev; // reverse-complement sequence reversed
351 uint8_t patBufRcRev[BUF_SIZE]; // reverse-complement sequence buffer reversed
352 String<char> qualRev; // quality values reversed
353 char qualBufRev[BUF_SIZE]; // quality value buffer reversed
355 String<Dna5> altPatFwRev[3]; // forward-strand sequence reversed
356 uint8_t altPatBufFwRev[3][BUF_SIZE]; // forward-strand sequence buffer reversed
357 String<Dna5> altPatRcRev[3]; // reverse-complement sequence reversed
358 uint8_t altPatBufRcRev[3][BUF_SIZE]; // reverse-complement sequence buffer reversed
359 String<char> altQualRev[3]; // quality values for alternate basecalls reversed
360 char altQualBufRev[3][BUF_SIZE]; // quality value buffer for alternate basecalls reversed
362 // For remembering the exact input text used to define a read
363 char readOrigBuf[FileBuf::LASTN_BUF_SZ];
364 size_t readOrigBufLen;
366 // For when qualities are in a separate file
367 char qualOrigBuf[FileBuf::LASTN_BUF_SZ];
368 size_t qualOrigBufLen;
370 String<char> name; // read name
371 char nameBuf[BUF_SIZE]; // read name buffer
372 uint32_t patid; // unique 0-based id based on order in read file(s)
373 int mate; // 0 = single-end, 1 = mate1, 2 = mate2
374 uint32_t seed; // random seed
375 int alts; // number of alternatives
376 bool fuzzy; // whether to employ fuzziness
377 bool color; // whether read is in color space
378 char primer; // primer base, for csfasta files
379 char trimc; // trimmed color, for csfasta files
380 int trimmed5; // amount actually trimmed off 5' end
381 int trimmed3; // amount actually trimmed off 3' end
382 HitSet hitset; // holds previously-found hits; for chaining
386 * Encapsulates a synchronized source of patterns; usually a file.
387 * Handles dumping patterns to a logfile (useful for debugging). Also
388 * optionally reverses reads and quality strings before returning them,
389 * though that is usually more efficiently done by the concrete
390 * subclass. Concrete subclasses should delimit critical sections with
391 * calls to lock() and unlock().
393 class PatternSource {
395 PatternSource(uint32_t seed,
396 bool randomizeQuals = false,
397 bool useSpinlock = true,
398 const char *dumpfile = NULL,
399 bool verbose = false) :
405 useSpinlock_(useSpinlock),
406 randomizeQuals_(randomizeQuals),
410 // Open dumpfile, if specified
411 if(dumpfile_ != NULL) {
412 out_.open(dumpfile_, ios_base::out);
414 cerr << "Could not open pattern dump file \"" << dumpfile_ << "\" for writing" << endl;
421 virtual ~PatternSource() { }
424 * Call this whenever this PatternSource is wrapped by a new
425 * WrappedPatternSourcePerThread. This helps us keep track of
426 * whether locks will be contended.
433 * The main member function for dispensing patterns.
435 virtual void nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
436 // nextPatternImpl does the reading from the ultimate source;
437 // it is implemented in concrete subclasses
438 nextReadPairImpl(ra, rb, patid);
440 // Possibly randomize the qualities so that they're more
441 // scattered throughout the range of possible values
442 if(randomizeQuals_) {
448 // TODO: Perhaps bundle all of the following up into a
449 // finalize() member in the ReadBuf class?
451 // Construct the reversed versions of the fw and rc seqs
453 ra.constructRevComps();
454 ra.constructReverses();
456 rb.constructRevComps();
457 rb.constructReverses();
459 // Fill in the random-seed field using a combination of
460 // information from the user-specified seed and the read
461 // sequence, qualities, and name
462 ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
464 rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
466 // Output it, if desired
467 if(dumpfile_ != NULL) {
474 cout << "Parsed mate 1: "; ra.dump(cout);
475 cout << "Parsed mate 2: "; rb.dump(cout);
481 * The main member function for dispensing patterns.
483 virtual void nextRead(ReadBuf& r, uint32_t& patid) {
484 // nextPatternImpl does the reading from the ultimate source;
485 // it is implemented in concrete subclasses
486 nextReadImpl(r, patid);
488 // Possibly randomize the qualities so that they're more
489 // scattered throughout the range of possible values
490 if(randomizeQuals_) {
493 // Construct the reversed versions of the fw and rc seqs
495 r.constructRevComps();
496 r.constructReverses();
497 // Fill in the random-seed field using a combination of
498 // information from the user-specified seed and the read
499 // sequence, qualities, and name
500 r.seed = genRandSeed(r.patFw, r.qual, r.name, seed_);
501 // Output it, if desired
502 if(dumpfile_ != NULL) {
506 cout << "Parsed read: "; r.dump(cout);
512 * Implementation to be provided by concrete subclasses. An
513 * implementation for this member is only relevant for formats that
514 * can read in a pair of reads in a single transaction with a
515 * single input source. If paired-end input is given as a pair of
516 * parallel files, this member should throw an error and exit.
518 virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
521 * Implementation to be provided by concrete subclasses. An
522 * implementation for this member is only relevant for formats
523 * where individual input sources look like single-end-read
524 * sources, e.g., formats where paired-end reads are specified in
525 * parallel read files.
527 virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) = 0;
529 /// Reset state to start over again with the first read
530 virtual void reset() { readCnt_ = 0; }
533 * Concrete subclasses call lock() to enter a critical region.
534 * What constitutes a critical region depends on the subclass.
537 if(!doLocking_) return; // no contention
540 // User can ask to use the normal pthreads lock even if
541 // spinlocks are compiled in.
552 * Concrete subclasses call unlock() to exit a critical region
553 * What constitutes a critical region depends on the subclass.
556 if(!doLocking_) return; // no contention
559 // User can ask to use the normal pthreads lock even if
560 // spinlocks are compiled in.
571 * Return the number of reads attempted.
573 uint64_t readCnt() const { return readCnt_ - 1; }
578 * Mix up the quality values for ReadBuf r. There's probably a
579 * more (pseudo-)randomly rigorous way to do this; the output looks
582 void randomizeQuals(ReadBuf& r) {
583 const size_t rlen = r.length();
584 for(size_t i = 0; i < rlen; i++) {
586 r.qual[i] *= (r.qual[i+1] + 7);
589 r.qual[i] *= (r.qual[i-1] + 11);
591 // A user says that g++ complained here about "comparison
592 // is always false due to limited range of data type", but
593 // I can't see why. I added the (int) cast to try to avoid
595 if((int)r.qual[i] < 0) r.qual[i] = -(r.qual[i]+1);
597 assert_leq(r.qual[i], 40);
603 * Dump the contents of the ReadBuf to the dump file.
605 void dumpBuf(const ReadBuf& r) {
606 assert(dumpfile_ != NULL);
608 empty(r.qual) ? String<char>("(empty)") : r.qual,
609 empty(r.name) ? String<char>("(empty)") : r.name);
611 empty(r.qualRev) ? String<char>("(empty)") : r.qualRev,
612 empty(r.name) ? String<char>("(empty)") : r.name);
616 * Default format for dumping a read to an output stream. Concrete
617 * subclasses might want to do something fancier.
619 virtual void dump(ostream& out,
620 const String<Dna5>& seq,
621 const String<char>& qual,
622 const String<char>& name)
624 out << name << ": " << seq << " " << qual << endl;
629 /// The number of reads read by this PatternSource
632 const char *dumpfile_; /// dump patterns to this file before returning them
633 ofstream out_; /// output stream for dumpfile
634 int numWrappers_; /// # threads that own a wrapper for this PatternSource
635 bool doLocking_; /// override whether to lock (true = don't override)
636 /// User can ask to use the normal pthreads-style lock even if
637 /// spinlocks is enabled and compiled in. This is sometimes better
638 /// if we expect bad I/O latency on some reads.
640 bool randomizeQuals_; /// true -> mess up qualities in a random way
644 MUTEX_T lock_; /// mutex for locking critical regions
649 * Abstract parent class for synhconized sources of paired-end reads
650 * (and possibly also single-end reads).
652 class PairedPatternSource {
654 PairedPatternSource(uint32_t seed) {
658 virtual ~PairedPatternSource() { }
660 virtual void addWrapper() = 0;
661 virtual void reset() = 0;
662 virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
663 virtual pair<uint64_t,uint64_t> readCnt() const = 0;
666 * Lock this PairedPatternSource, usually because one of its shared
667 * fields is being updated.
678 * Unlock this PairedPatternSource.
693 MUTEX_T lock_; /// mutex for locking critical regions
698 * Encapsulates a synchronized source of both paired-end reads and
699 * unpaired reads, where the paired-end must come from parallel files.
701 class PairedSoloPatternSource : public PairedPatternSource {
705 PairedSoloPatternSource(const vector<PatternSource*>& src,
707 PairedPatternSource(seed), cur_(0), src_(src)
709 for(size_t i = 0; i < src_.size(); i++) {
710 assert(src_[i] != NULL);
714 virtual ~PairedSoloPatternSource() { }
717 * Call this whenever this PairedPatternSource is wrapped by a new
718 * WrappedPatternSourcePerThread. This helps us keep track of
719 * whether locks within PatternSources will be contended.
721 virtual void addWrapper() {
722 for(size_t i = 0; i < src_.size(); i++) {
723 src_[i]->addWrapper();
728 * Reset this object and all the PatternSources under it so that
729 * the next call to nextReadPair gets the very first read pair.
731 virtual void reset() {
732 for(size_t i = 0; i < src_.size(); i++) {
739 * The main member function for dispensing pairs of reads or
740 * singleton reads. Returns true iff ra and rb contain a new
741 * pair; returns false if ra contains a new unpaired read.
743 virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
745 while(cur < src_.size()) {
746 // Patterns from srca_[cur_] are unpaired
747 src_[cur]->nextReadPair(ra, rb, patid);
748 if(seqan::empty(ra.patFw)) {
749 // If patFw is empty, that's our signal that the
752 if(cur + 1 > cur_) cur_++;
755 continue; // on to next pair of PatternSources
757 ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
759 rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
766 return true; // paired
772 * Return the number of reads attempted.
774 virtual pair<uint64_t,uint64_t> readCnt() const {
776 vector<PatternSource*>::const_iterator it;
777 for(it = src_.begin(); it != src_.end(); it++) {
778 ret += (*it)->readCnt();
780 return make_pair(ret, 0llu);
785 volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
786 vector<PatternSource*> src_; /// PatternSources for paired-end reads
790 * Encapsulates a synchronized source of both paired-end reads and
791 * unpaired reads, where the paired-end must come from parallel files.
793 class PairedDualPatternSource : public PairedPatternSource {
797 PairedDualPatternSource(const vector<PatternSource*>& srca,
798 const vector<PatternSource*>& srcb,
800 PairedPatternSource(seed), cur_(0), srca_(srca), srcb_(srcb)
802 // srca_ and srcb_ must be parallel
803 assert_eq(srca_.size(), srcb_.size());
804 for(size_t i = 0; i < srca_.size(); i++) {
805 // Can't have NULL first-mate sources. Second-mate sources
806 // can be NULL, in the case when the corresponding first-
807 // mate source is unpaired.
808 assert(srca_[i] != NULL);
809 for(size_t j = 0; j < srcb_.size(); j++) {
810 assert_neq(srca_[i], srcb_[j]);
815 virtual ~PairedDualPatternSource() { }
818 * Call this whenever this PairedPatternSource is wrapped by a new
819 * WrappedPatternSourcePerThread. This helps us keep track of
820 * whether locks within PatternSources will be contended.
822 virtual void addWrapper() {
823 for(size_t i = 0; i < srca_.size(); i++) {
824 srca_[i]->addWrapper();
825 if(srcb_[i] != NULL) {
826 srcb_[i]->addWrapper();
832 * Reset this object and all the PatternSources under it so that
833 * the next call to nextReadPair gets the very first read pair.
835 virtual void reset() {
836 for(size_t i = 0; i < srca_.size(); i++) {
838 if(srcb_[i] != NULL) {
846 * The main member function for dispensing pairs of reads or
847 * singleton reads. Returns true iff ra and rb contain a new
848 * pair; returns false if ra contains a new unpaired read.
850 virtual bool nextReadPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
852 while(cur < srca_.size()) {
853 if(srcb_[cur] == NULL) {
854 // Patterns from srca_[cur_] are unpaired
855 srca_[cur]->nextRead(ra, patid);
856 if(seqan::empty(ra.patFw)) {
857 // If patFw is empty, that's our signal that the
860 if(cur + 1 > cur_) cur_++;
863 continue; // on to next pair of PatternSources
867 return false; // unpaired
869 // Patterns from srca_[cur_] and srcb_[cur_] are paired
870 uint32_t patid_a = 0;
871 uint32_t patid_b = 0;
872 // Lock to ensure that this thread gets parallel reads
873 // in the two mate files
875 srca_[cur]->nextRead(ra, patid_a);
876 srcb_[cur]->nextRead(rb, patid_b);
878 // Did the pair obtained fail to match up?
879 while(patid_a != patid_b) {
880 // Is either input exhausted? If so, bail.
881 if(seqan::empty(ra.patFw) || seqan::empty(rb.patFw)) {
882 seqan::clear(ra.patFw);
883 if(cur + 1 > cur_) cur_++;
888 if(patid_a < patid_b) {
889 srca_[cur]->nextRead(ra, patid_a);
892 srcb_[cur]->nextRead(rb, patid_b);
897 if(cont) continue; // on to next pair of PatternSources
900 if(seqan::empty(ra.patFw)) {
901 // If patFw is empty, that's our signal that the
904 if(cur + 1 > cur_) cur_++;
907 continue; // on to next pair of PatternSources
909 assert_eq(patid_a, patid_b);
915 return true; // paired
922 * Return the number of reads attempted.
924 virtual pair<uint64_t,uint64_t> readCnt() const {
925 uint64_t rets = 0llu, retp = 0llu;
926 for(size_t i = 0; i < srca_.size(); i++) {
927 if(srcb_[i] == NULL) {
928 rets += srca_[i]->readCnt();
930 assert_eq(srca_[i]->readCnt(), srcb_[i]->readCnt());
931 retp += srca_[i]->readCnt();
934 return make_pair(rets, retp);
939 volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
940 vector<PatternSource*> srca_; /// PatternSources for 1st mates and/or unpaired reads
941 vector<PatternSource*> srcb_; /// PatternSources for 2nd mates
945 * Encapsulates a single thread's interaction with the PatternSource.
946 * Most notably, this class holds the buffers into which the
947 * PatterSource will write sequences. This class is *not* threadsafe
948 * - it doesn't need to be since there's one per thread. PatternSource
951 class PatternSourcePerThread {
953 PatternSourcePerThread() :
954 buf1_(), buf2_(), patid_(0xffffffff) { }
956 virtual ~PatternSourcePerThread() { }
959 * Read the next read pair.
961 virtual void nextReadPair() { }
963 ReadBuf& bufa() { return buf1_; }
964 ReadBuf& bufb() { return buf2_; }
966 uint32_t patid() const { return patid_; }
967 virtual void reset() { patid_ = 0xffffffff; }
968 bool empty() const { return buf1_.empty(); }
969 uint32_t length(int mate) const {
970 return (mate == 1)? buf1_.length() : buf2_.length();
974 * Return true iff the buffers jointly contain a paired-end read.
977 bool ret = !buf2_.empty();
978 assert(!ret || !empty());
983 ReadBuf buf1_; // read buffer for mate a
984 ReadBuf buf2_; // read buffer for mate b
985 uint32_t patid_; // index of read just read
989 * Abstract parent factory for PatternSourcePerThreads.
991 class PatternSourcePerThreadFactory {
993 virtual ~PatternSourcePerThreadFactory() { }
994 virtual PatternSourcePerThread* create() const = 0;
995 virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const = 0;
997 /// Free memory associated with a pattern source
998 virtual void destroy(PatternSourcePerThread* patsrc) const {
999 assert(patsrc != NULL);
1000 // Free the PatternSourcePerThread
1004 /// Free memory associated with a pattern source list
1005 virtual void destroy(std::vector<PatternSourcePerThread*>* patsrcs) const {
1006 assert(patsrcs != NULL);
1007 // Free all of the PatternSourcePerThreads
1008 for(size_t i = 0; i < patsrcs->size(); i++) {
1009 if((*patsrcs)[i] != NULL) {
1010 delete (*patsrcs)[i];
1011 (*patsrcs)[i] = NULL;
1020 * A per-thread wrapper for a PairedPatternSource.
1022 class WrappedPatternSourcePerThread : public PatternSourcePerThread {
1024 WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
1027 patsrc_.addWrapper();
1031 * Get the next paired or unpaired read from the wrapped
1032 * PairedPatternSource.
1034 virtual void nextReadPair() {
1035 PatternSourcePerThread::nextReadPair();
1036 ASSERT_ONLY(uint32_t lastPatid = patid_);
1039 patsrc_.nextReadPair(buf1_, buf2_, patid_);
1040 assert(buf1_.empty() || patid_ != lastPatid);
1045 /// Container for obtaining paired reads from PatternSources
1046 PairedPatternSource& patsrc_;
1050 * Abstract parent factory for PatternSourcePerThreads.
1052 class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
1054 WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
1058 * Create a new heap-allocated WrappedPatternSourcePerThreads.
1060 virtual PatternSourcePerThread* create() const {
1061 return new WrappedPatternSourcePerThread(patsrc_);
1065 * Create a new heap-allocated vector of heap-allocated
1066 * WrappedPatternSourcePerThreads.
1068 virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const {
1069 std::vector<PatternSourcePerThread*>* v = new std::vector<PatternSourcePerThread*>;
1070 for(size_t i = 0; i < n; i++) {
1071 v->push_back(new WrappedPatternSourcePerThread(patsrc_));
1072 assert(v->back() != NULL);
1078 /// Container for obtaining paired reads from PatternSources
1079 PairedPatternSource& patsrc_;
1083 * Encapsualtes a source of patterns where each raw pattern is trimmed
1084 * by some user-defined amount on the 3' and 5' ends. Doesn't
1085 * implement the actual trimming - that's up to the concrete
1088 class TrimmingPatternSource : public PatternSource {
1090 TrimmingPatternSource(uint32_t seed,
1091 bool randomizeQuals = false,
1092 bool useSpinlock = true,
1093 const char *dumpfile = NULL,
1094 bool verbose = false,
1097 PatternSource(seed, randomizeQuals, useSpinlock, dumpfile, verbose),
1098 trim3_(trim3), trim5_(trim5) { }
1105 * A synchronized pattern source that simply returns random reads
1106 * without reading from the disk or storing lists of reads in memory.
1107 * Reads are generated with a RandomSource.
1109 class RandomPatternSource : public PatternSource {
1111 RandomPatternSource(uint32_t seed,
1112 uint32_t numReads = 2000000,
1114 bool useSpinlock = true,
1115 const char *dumpfile = NULL,
1116 bool verbose = false) :
1117 PatternSource(seed, false, useSpinlock, dumpfile, verbose),
1118 numReads_(numReads),
1122 if(length_ > 1024) {
1123 cerr << "Read length for RandomPatternSource may not exceed 1024; got " << length_ << endl;
1129 /** Get the next random read and set patid */
1130 virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1131 // Begin critical section
1133 if(readCnt_ >= numReads_) {
1138 uint32_t ra = rand_.nextU32();
1142 fillRandomRead(r, ra, length_, patid);
1145 /** Get the next random read and set patid */
1146 virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1147 // Begin critical section
1149 if(readCnt_ >= numReads_) {
1155 uint32_t rna = rand_.nextU32();
1156 uint32_t rnb = rand_.nextU32();
1160 fillRandomRead(ra, rna, length_, patid);
1161 fillRandomRead(rb, rnb, length_, patid);
1165 static void fillRandomRead(ReadBuf& r,
1170 // End critical section
1171 for(int i = 0; i < length; i++) {
1172 ra = RandomSource::nextU32(ra) >> 8;
1173 r.patBufFw[i] = (ra & 3);
1174 char c = 'I' - ((ra >> 2) & 31);
1177 _setBegin (r.patFw, (Dna5*)r.patBufFw);
1178 _setLength(r.patFw, length);
1179 _setBegin (r.qual, r.qualBuf);
1180 _setLength(r.qual, length);
1181 itoa10(patid, r.nameBuf);
1182 _setBegin(r.name, r.nameBuf);
1183 _setLength(r.name, strlen(r.nameBuf));
1186 /** Reset the pattern source to the beginning */
1187 virtual void reset() {
1188 PatternSource::reset();
1189 // reset pseudo-random generator; next string of calls to
1190 // nextU32() will return same pseudo-randoms as the last
1194 uint32_t numReads_; /// number of reads to dish out
1195 int length_; /// length of reads
1196 uint32_t seed_; /// seed for pseudo-randoms
1197 RandomSource rand_; /// pseudo-random generator
1201 * A version of PatternSourcePerThread that dishes out random patterns
1202 * without any synchronization.
1204 class RandomPatternSourcePerThread : public PatternSourcePerThread {
1206 RandomPatternSourcePerThread(uint32_t numreads,
1210 PatternSourcePerThread(),
1211 numreads_(numreads),
1213 numthreads_(numthreads),
1217 if(length_ > 1024) {
1218 cerr << "Read length for RandomPatternSourcePerThread may not exceed 1024; got " << length_ << endl;
1221 rand_.init(thread_);
1224 virtual void nextReadPair() {
1225 PatternSourcePerThread::nextReadPair();
1226 if(patid_ >= numreads_) {
1231 RandomPatternSource::fillRandomRead(
1232 buf1_, rand_.nextU32(), length_, patid_);
1233 RandomPatternSource::fillRandomRead(
1234 buf2_, rand_.nextU32(), length_, patid_);
1235 patid_ += numthreads_;
1238 virtual void reset() {
1239 PatternSourcePerThread::reset();
1241 rand_.init(thread_);
1253 * Abstract parent factory for PatternSourcePerThreads.
1255 class RandomPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
1257 RandomPatternSourcePerThreadFactory(
1262 numreads_(numreads),
1264 numthreads_(numthreads),
1268 * Create a new heap-allocated WrappedPatternSourcePerThreads.
1270 virtual PatternSourcePerThread* create() const {
1271 return new RandomPatternSourcePerThread(
1272 numreads_, length_, numthreads_, thread_);
1276 * Create a new heap-allocated vector of heap-allocated
1277 * WrappedPatternSourcePerThreads.
1279 virtual std::vector<PatternSourcePerThread*>* create(uint32_t n) const {
1280 std::vector<PatternSourcePerThread*>* v = new std::vector<PatternSourcePerThread*>;
1281 for(size_t i = 0; i < n; i++) {
1282 v->push_back(new RandomPatternSourcePerThread(
1283 numreads_, length_, numthreads_, thread_));
1284 assert(v->back() != NULL);
1296 /// Skip to the end of the current string of newline chars and return
1297 /// the first character after the newline chars, or -1 for EOF
1298 static inline int getOverNewline(FileBuf& in) {
1300 while(isspace(c = in.get()));
1304 /// Skip to the end of the current string of newline chars such that
1305 /// the next call to get() returns the first character after the
1307 static inline int peekOverNewline(FileBuf& in) {
1310 if(c != '\r' && c != '\n') {
1317 /// Skip to the end of the current line; return the first character
1318 /// of the next line or -1 for EOF
1319 static inline int getToEndOfLine(FileBuf& in) {
1321 int c = in.get(); if(c < 0) return -1;
1322 if(c == '\n' || c == '\r') {
1323 while(c == '\n' || c == '\r') {
1324 c = in.get(); if(c < 0) return -1;
1326 // c now holds first character of next line
1332 /// Skip to the end of the current line such that the next call to
1333 /// get() returns the first character on the next line
1334 static inline int peekToEndOfLine(FileBuf& in) {
1336 int c = in.get(); if(c < 0) return c;
1337 if(c == '\n' || c == '\r') {
1339 while(c == '\n' || c == '\r') {
1340 in.get(); if(c < 0) return c; // consume \r or \n
1343 // next get() gets first character of next line
1349 extern void wrongQualityFormat(const String<char>& read_name);
1350 extern void tooFewQualities(const String<char>& read_name);
1351 extern void tooManyQualities(const String<char>& read_name);
1352 extern void tooManySeqChars(const String<char>& read_name);
1355 * Encapsulates a source of patterns which is an in-memory vector.
1357 class VectorPatternSource : public TrimmingPatternSource {
1359 VectorPatternSource(uint32_t seed,
1360 const vector<string>& v,
1362 bool randomizeQuals = false,
1363 bool useSpinlock = true,
1364 const char *dumpfile = NULL,
1365 bool verbose = false,
1368 uint32_t skip = 0) :
1369 TrimmingPatternSource(seed, randomizeQuals, useSpinlock,
1370 dumpfile, verbose, trim3, trim5),
1371 color_(color), cur_(skip), skip_(skip), paired_(false), v_(),
1374 for(size_t i = 0; i < v.size(); i++) {
1376 tokenize(v[i], ":", ss, 2);
1377 assert_gt(ss.size(), 0);
1378 assert_leq(ss.size(), 2);
1381 int mytrim5 = this->trim5_;
1382 if(color_ && s.length() > 1) {
1383 // This may be a primer character. If so, keep it in the
1384 // 'primer' field of the read buf and parse the rest of the
1386 int c = toupper(s[0]);
1387 if(asc2dnacat[c] > 0) {
1388 // First char is a DNA char
1389 int c2 = toupper(s[1]);
1390 // Second char is a color char
1391 if(asc2colcat[c2] > 0) {
1392 mytrim5 += 2; // trim primer and first color
1397 // Convert '0'-'3' to 'A'-'T'
1398 for(size_t i = 0; i < s.length(); i++) {
1399 if(s[i] >= '0' && s[i] <= '4') {
1400 s[i] = "ACGTN"[(int)s[i] - '0'];
1402 if(s[i] == '.') s[i] = 'N';
1405 if(s.length() <= (size_t)(trim3_ + mytrim5)) {
1406 // Entire read is trimmed away
1409 // Trim on 5' (high-quality) end
1411 s.erase(0, mytrim5);
1413 // Trim on 3' (low-quality) end
1415 s.erase(s.length()-trim3_);
1420 if(ss.size() == 2) {
1424 if(vq.length() > (size_t)(trim3_ + mytrim5)) {
1425 // Trim on 5' (high-quality) end
1427 vq.erase(0, mytrim5);
1429 // Trim on 3' (low-quality) end
1431 vq.erase(vq.length()-trim3_);
1434 // Pad quals with Is if necessary; this shouldn't happen
1435 while(vq.length() < length(s)) {
1438 // Truncate quals to match length of read if necessary;
1439 // this shouldn't happen
1440 if(vq.length() > length(s)) {
1441 vq.erase(length(s));
1443 assert_eq(vq.length(), length(s));
1445 quals_.push_back(vq);
1446 trimmed3_.push_back(trim3_);
1447 trimmed5_.push_back(mytrim5);
1449 os << (names_.size());
1450 names_.push_back(os.str());
1452 assert_eq(v_.size(), quals_.size());
1454 virtual ~VectorPatternSource() { }
1455 virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1456 // Let Strings begin at the beginning of the respective bufs
1459 if(cur_ >= v_.size()) {
1461 // Clear all the Strings, as a signal to the caller that
1462 // we're out of reads
1467 // Copy v_*, quals_* strings into the respective Strings
1470 r.qual = quals_[cur_];
1471 r.trimmed3 = trimmed3_[cur_];
1472 r.trimmed5 = trimmed5_[cur_];
1482 * This is unused, but implementation is given for completeness.
1484 virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1485 // Let Strings begin at the beginning of the respective bufs
1493 if(cur_ >= v_.size()-1) {
1495 // Clear all the Strings, as a signal to the caller that
1496 // we're out of reads
1503 // Copy v_*, quals_* strings into the respective Strings
1504 ra.patFw = v_[cur_];
1505 ra.qual = quals_[cur_];
1506 ra.trimmed3 = trimmed3_[cur_];
1507 ra.trimmed5 = trimmed5_[cur_];
1509 rb.patFw = v_[cur_];
1510 rb.qual = quals_[cur_];
1511 rb.trimmed3 = trimmed3_[cur_];
1512 rb.trimmed5 = trimmed5_[cur_];
1517 ra.color = rb.color = color_;
1523 virtual void reset() {
1524 TrimmingPatternSource::reset();
1533 vector<String<Dna5> > v_; /// forward sequences
1534 vector<String<char> > quals_; /// quality values parallel to v_
1535 vector<String<char> > names_; /// names
1536 vector<int> trimmed3_; // names
1537 vector<int> trimmed5_; // names
1543 class BufferedFilePatternSource : public TrimmingPatternSource {
1545 BufferedFilePatternSource(uint32_t seed,
1546 const vector<string>& infiles,
1547 const vector<string>* qinfiles,
1548 bool randomizeQuals = false,
1549 bool useSpinlock = true,
1550 const char *dumpfile = NULL,
1551 bool verbose = false,
1554 uint32_t skip = 0) :
1555 TrimmingPatternSource(seed, randomizeQuals, useSpinlock,
1556 dumpfile, verbose, trim3, trim5),
1565 if(qinfiles != NULL) qinfiles_ = *qinfiles;
1566 assert_gt(infiles.size(), 0);
1567 errs_.resize(infiles_.size(), false);
1568 if(qinfiles_.size() > 0 &&
1569 qinfiles_.size() != infiles_.size())
1571 cerr << "Error: Different numbers of input FASTA/quality files ("
1572 << infiles_.size() << "/" << qinfiles_.size() << ")" << endl;
1575 assert(!fb_.isOpen());
1576 assert(!qfb_.isOpen());
1577 open(); // open first file in the list
1581 virtual ~BufferedFilePatternSource() {
1582 if(fb_.isOpen()) fb_.close();
1584 assert_gt(qinfiles_.size(), 0);
1590 * Fill ReadBuf with the sequence, quality and name for the next
1591 * read in the list of read files. This function gets called by
1592 * all the search threads, so we must handle synchronization.
1594 virtual void nextReadImpl(ReadBuf& r, uint32_t& patid) {
1595 // We are entering a critical region, because we're
1596 // manipulating our file handle and filecur_ state
1598 bool notDone = true;
1601 // Try again if r is empty (indicating an error) and input
1602 // is not yet exhausted, OR if we have more reads to skip
1604 notDone = seqan::empty(r.patFw) && !fb_.eof();
1605 } while(notDone || (!fb_.eof() && patid < skip_));
1609 assert(seqan::empty(r.patFw));
1612 if(first_ && seqan::empty(r.patFw) /* && !quiet_ */) {
1613 // No reads could be extracted from the first _infile
1614 cerr << "Warning: Could not find any reads in \"" << infiles_[0] << "\"" << endl;
1617 while(seqan::empty(r.patFw) && filecur_ < infiles_.size()) {
1620 resetForNextFile(); // reset state to handle a fresh file
1623 } while((seqan::empty(r.patFw) && !fb_.eof()));
1624 assert_geq(patid, skip_);
1625 if(seqan::empty(r.patFw) /*&& !quiet_ */) {
1626 // No reads could be extracted from this _infile
1627 cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl;
1631 // Leaving critical region
1633 // If r.patFw is empty, then the caller knows that we are
1634 // finished with the reads
1639 virtual void nextReadPairImpl(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1640 // We are entering a critical region, because we're
1641 // manipulating our file handle and filecur_ state
1643 bool notDone = true;
1645 readPair(ra, rb, patid);
1646 // Try again if ra is empty (indicating an error) and input
1647 // is not yet exhausted, OR if we have more reads to skip
1649 notDone = seqan::empty(ra.patFw) && !fb_.eof();
1650 } while(notDone || (!fb_.eof() && patid < skip_));
1655 assert(seqan::empty(ra.patFw));
1658 if(first_ && seqan::empty(ra.patFw) /*&& !quiet_*/) {
1659 // No reads could be extracted from the first _infile
1660 cerr << "Warning: Could not find any read pairs in \"" << infiles_[0] << "\"" << endl;
1663 while(seqan::empty(ra.patFw) && filecur_ < infiles_.size()) {
1666 resetForNextFile(); // reset state to handle a fresh file
1668 readPair(ra, rb, patid);
1669 } while((seqan::empty(ra.patFw) && !fb_.eof()));
1670 assert_geq(patid, skip_);
1671 if(seqan::empty(ra.patFw) /*&& !quiet_*/) {
1672 // No reads could be extracted from this _infile
1673 cerr << "Warning: Could not find any reads in \"" << infiles_[filecur_] << "\"" << endl;
1677 // Leaving critical region
1679 // If ra.patFw is empty, then the caller knows that we are
1680 // finished with the reads
1683 * Reset state so that we read start reading again from the
1684 * beginning of the first file. Should only be called by the
1687 virtual void reset() {
1688 TrimmingPatternSource::reset();
1694 /// Read another pattern from the input file; this is overridden
1695 /// to deal with specific file formats
1696 virtual void read(ReadBuf& r, uint32_t& patid) = 0;
1697 /// Read another pattern pair from the input file; this is
1698 /// overridden to deal with specific file formats
1699 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) = 0;
1700 /// Reset state to handle a fresh file
1701 virtual void resetForNextFile() { }
1703 if(fb_.isOpen()) fb_.close();
1704 if(qfb_.isOpen()) qfb_.close();
1705 while(filecur_ < infiles_.size()) {
1708 if(infiles_[filecur_] == "-") {
1710 } else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
1711 if(!errs_[filecur_]) {
1712 cerr << "Warning: Could not open read file \"" << infiles_[filecur_] << "\" for reading; skipping..." << endl;
1713 errs_[filecur_] = true;
1720 if(!qinfiles_.empty()) {
1722 if(qinfiles_[filecur_] == "-") {
1724 } else if((in = fopen(qinfiles_[filecur_].c_str(), "rb")) == NULL) {
1725 if(!errs_[filecur_]) {
1726 cerr << "Warning: Could not open quality file \"" << qinfiles_[filecur_] << "\" for reading; skipping..." << endl;
1727 errs_[filecur_] = true;
1738 vector<string> infiles_; /// filenames for read files
1739 vector<string> qinfiles_; /// filenames for quality files
1740 vector<bool> errs_; /// whether we've already printed an error for each file
1741 size_t filecur_; /// index into infiles_ of next file to read
1742 FileBuf fb_; /// read file currently being read from
1743 FileBuf qfb_; /// quality file currently being read from
1744 uint32_t skip_; /// number of reads to skip
1749 * Parse a single quality string from fb and store qualities in r.
1750 * Assume the next character obtained via fb.get() is the first
1751 * character of the quality string. When returning, the next
1752 * character returned by fb.peek() or fb.get() should be the first
1753 * character of the following line.
1755 int parseQuals(ReadBuf& r,
1765 * Synchronized concrete pattern source for a list of FASTA or CSFASTA
1766 * (if color = true) files.
1768 class FastaPatternSource : public BufferedFilePatternSource {
1770 FastaPatternSource(uint32_t seed,
1771 const vector<string>& infiles,
1772 const vector<string>* qinfiles,
1774 bool randomizeQuals,
1775 bool useSpinlock = true,
1776 const char *dumpfile = NULL,
1777 bool verbose = false,
1780 bool solexa64 = false,
1781 bool phred64 = false,
1782 bool intQuals = false,
1783 uint32_t skip = 0) :
1784 BufferedFilePatternSource(seed, infiles, qinfiles, randomizeQuals,
1785 useSpinlock, dumpfile, verbose, trim3,
1787 first_(true), color_(color), solexa64_(solexa64),
1788 phred64_(phred64), intQuals_(intQuals)
1790 virtual void reset() {
1792 BufferedFilePatternSource::reset();
1796 * Scan to the next FASTA record (starting with >) and return the first
1797 * character of the record (which will always be >).
1799 static int skipToNextFastaRecord(FileBuf& in) {
1801 while((c = in.get()) != '>') {
1802 if(in.eof()) return -1;
1807 /// Called when we have to bail without having parsed a read.
1808 void bail(ReadBuf& r) {
1814 /// Read another pattern from a FASTA input file
1815 virtual void read(ReadBuf& r, uint32_t& patid) {
1819 bool doquals = qinfiles_.size() > 0;
1820 assert(!doquals || qfb_.isOpen());
1821 assert(fb_.isOpen());
1824 // Skip over between-read comments. Note that SOLiD's comments use #s
1826 if(c < 0) { bail(r); return; }
1827 while(c == '#' || c == ';') {
1828 c = fb_.peekUptoNewline();
1832 assert_eq(1, fb_.lastNCur());
1835 if(qc < 0) { bail(r); return; }
1836 while(qc == '#' || qc == ';') {
1837 qc = qfb_.peekUptoNewline();
1841 assert_eq(1, qfb_.lastNCur());
1844 // Pick off the first carat
1847 cerr << "Error: reads file does not look like a FASTA file" << endl;
1850 if(doquals && qc != '>') {
1851 cerr << "Error: quality file does not look like a FASTA quality file" << endl;
1857 if(doquals) assert_eq('>', qc);
1858 c = fb_.get(); // get next char after '>'
1859 if(doquals) qc = qfb_.get();
1861 // Read to the end of the id line, sticking everything after the '>'
1863 bool warning = false;
1865 if(c < 0 || qc < 0) { bail(r); return; }
1866 if(c == '\n' || c == '\r') {
1867 // Break at end of line, after consuming all \r's, \n's
1868 while(c == '\n' || c == '\r') {
1870 if(doquals) qc = qfb_.get();
1871 if(c < 0 || qc < 0) { bail(r); return; }
1875 if(doquals && c != qc) {
1876 cerr << "Warning: one or more mismatched read names between FASTA and quality files" << endl;
1879 r.nameBuf[nameLen++] = c;
1881 if(doquals) qc = qfb_.get();
1883 _setBegin(r.name, r.nameBuf);
1884 _setLength(r.name, nameLen);
1886 cerr << " Offending read name: \"" << r.name << "\"" << endl;
1889 // _in now points just past the first character of a sequence
1890 // line, and c holds the first character
1892 int mytrim5 = this->trim5_;
1894 // This is the primer character, keep it in the
1895 // 'primer' field of the read buf and keep parsing
1897 if(asc2dnacat[c] > 0) {
1898 // First char is a DNA char
1899 int c2 = toupper(fb_.peek());
1900 if(asc2colcat[c2] > 0) {
1901 // Second char is a color char
1907 if(c < 0) { bail(r); return; }
1909 while(c != '>' && c >= 0) {
1911 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
1912 if(c == '.') c = 'N';
1914 if(asc2dnacat[c] > 0 && begin++ >= mytrim5) {
1915 if(dstLen + 1 > 1024) tooManySeqChars(r.name);
1916 r.patBufFw[dstLen] = charToDna5[c];
1917 if(!doquals) r.qualBuf[dstLen] = 'I';
1920 if(fb_.peek() == '>') break;
1923 dstLen -= this->trim3_;
1924 if(dstLen < 0) dstLen = 0;
1925 _setBegin (r.patFw, (Dna5*)r.patBufFw);
1926 _setLength(r.patFw, dstLen);
1927 r.trimmed3 = this->trim3_;
1928 r.trimmed5 = mytrim5;
1931 parseQuals(r, qfb_, dstLen + r.trimmed3 + r.trimmed5,
1932 r.trimmed3, r.trimmed5, intQuals_, phred64_,
1936 qfb_.peekUptoNewline();
1946 _setBegin (r.qual, r.qualBuf);
1947 _setLength(r.qual, dstLen);
1948 // Set up a default name if one hasn't been set
1950 itoa10(readCnt_, r.nameBuf);
1951 _setBegin(r.name, r.nameBuf);
1952 nameLen = strlen(r.nameBuf);
1953 _setLength(r.name, nameLen);
1955 assert_gt(nameLen, 0);
1958 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
1961 r.qualOrigBufLen = qfb_.copyLastN(r.qualOrigBuf);
1964 cout << "Name: " << r.name << endl
1965 << " Seq: " << r.patFw << " (" << seqan::length(r.patFw) << ")" << endl
1966 << "Qual: " << r.qual << " (" << seqan::length(r.qual) << ")" << endl
1967 << "Orig seq:" << endl;
1968 for(size_t i = 0; i < r.readOrigBufLen; i++) cout << r.readOrigBuf[i];
1969 cout << "Orig qual:" << endl;
1970 for(size_t i = 0; i < r.qualOrigBufLen; i++) cout << r.qualOrigBuf[i];
1975 /// Read another pair of patterns from a FASTA input file
1976 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
1977 // (For now, we shouldn't ever be here)
1978 cerr << "In FastaPatternSource.readPair()" << endl;
1984 virtual void resetForNextFile() {
1987 virtual void dump(ostream& out,
1988 const String<Dna5>& seq,
1989 const String<char>& qual,
1990 const String<char>& name)
1992 out << ">" << name << endl << seq << endl;
2004 * Tokenize a line of space-separated integer quality values.
2006 static inline bool tokenizeQualLine(FileBuf& filebuf, char *buf, size_t buflen, vector<string>& toks) {
2007 size_t rd = filebuf.gets(buf, buflen);
2008 if(rd == 0) return false;
2009 assert(NULL == strrchr(buf, '\n'));
2010 tokenize(string(buf), " ", toks);
2015 * Synchronized concrete pattern source for a list of files with tab-
2016 * delimited name, seq, qual fields (or, for paired-end reads,
2017 * basename, seq1, qual1, seq2, qual2).
2019 class TabbedPatternSource : public BufferedFilePatternSource {
2021 TabbedPatternSource(uint32_t seed,
2022 const vector<string>& infiles,
2024 bool randomizeQuals = false,
2025 bool useSpinlock = true,
2026 const char *dumpfile = NULL,
2027 bool verbose = false,
2030 bool solQuals = false,
2031 bool phred64Quals = false,
2032 bool intQuals = false,
2033 uint32_t skip = 0) :
2034 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals,
2035 useSpinlock, dumpfile, verbose,
2036 trim3, trim5, skip),
2038 solQuals_(solQuals),
2039 phred64Quals_(phred64Quals),
2045 /// Read another pattern from a FASTA input file
2046 virtual void read(ReadBuf& r, uint32_t& patid) {
2048 int trim5 = this->trim5_;
2049 // fb_ is about to dish out the first character of the
2051 if(parseName(r, NULL, '\t') == -1) {
2052 peekOverNewline(fb_); // skip rest of line
2056 assert_neq('\t', fb_.peek());
2058 // fb_ is about to dish out the first character of the
2061 int dstLen = parseSeq(r, charsRead, trim5, '\t');
2062 assert_neq('\t', fb_.peek());
2064 peekOverNewline(fb_); // skip rest of line
2069 // fb_ is about to dish out the first character of the
2070 // quality-string field
2072 if(parseQuals(r, charsRead, dstLen, trim5, ct, '\n') <= 0) {
2073 peekOverNewline(fb_); // skip rest of line
2077 r.trimmed3 = this->trim3_;
2079 assert_eq(ct, '\n');
2080 assert_neq('\n', fb_.peek());
2081 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
2083 // The last character read in parseQuals should have been a
2090 /// Read another pair of patterns from a FASTA input file
2091 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2092 // fb_ is about to dish out the first character of the
2094 int mytrim5_1 = this->trim5_;
2095 if(parseName(ra, &rb, '\t') == -1) {
2096 peekOverNewline(fb_); // skip rest of line
2102 assert_neq('\t', fb_.peek());
2104 // fb_ is about to dish out the first character of the
2105 // sequence field for the first mate
2107 int dstLen1 = parseSeq(ra, charsRead1, mytrim5_1, '\t');
2109 peekOverNewline(fb_); // skip rest of line
2115 assert_neq('\t', fb_.peek());
2117 // fb_ is about to dish out the first character of the
2118 // quality-string field
2120 if(parseQuals(ra, charsRead1, dstLen1, mytrim5_1, ct, '\t', '\n') <= 0) {
2121 peekOverNewline(fb_); // skip rest of line
2127 ra.trimmed3 = this->trim3_;
2128 ra.trimmed5 = mytrim5_1;
2129 assert(ct == '\t' || ct == '\n');
2131 // Unpaired record; return.
2133 peekOverNewline(fb_);
2134 ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf);
2140 assert_neq('\t', fb_.peek());
2142 // fb_ is about to dish out the first character of the
2143 // sequence field for the second mate
2145 int mytrim5_2 = this->trim5_;
2146 int dstLen2 = parseSeq(rb, charsRead2, mytrim5_2, '\t');
2148 peekOverNewline(fb_); // skip rest of line
2154 assert_neq('\t', fb_.peek());
2156 // fb_ is about to dish out the first character of the
2157 // quality-string field
2158 if(parseQuals(rb, charsRead2, dstLen2, mytrim5_2, ct, '\n') <= 0) {
2159 peekOverNewline(fb_); // skip rest of line
2165 assert_eq('\n', ct);
2166 if(fb_.peek() == '\n') {
2169 peekOverNewline(fb_);
2170 ra.readOrigBufLen = fb_.copyLastN(ra.readOrigBuf);
2173 rb.trimmed3 = this->trim3_;
2174 rb.trimmed5 = mytrim5_2;
2176 // The last character read in parseQuals should have been a
2184 * Dump a FASTQ-style record for the read.
2186 virtual void dump(ostream& out,
2187 const String<Dna5>& seq,
2188 const String<char>& qual,
2189 const String<char>& name)
2191 out << "@" << name << endl << seq << endl
2192 << "+" << endl << qual << endl;
2197 * Parse a name from fb_ and store in r. Assume that the next
2198 * character obtained via fb_.get() is the first character of
2199 * the sequence and the string stops at the next char upto (could
2200 * be tab, newline, etc.).
2202 int parseName(ReadBuf& r, ReadBuf* r2, char upto = '\t') {
2203 // Read the name out of the first field
2207 if((c = fb_.get()) < 0) {
2211 // Finished with first field
2214 if(c == '\n' || c == '\r') {
2217 if(r2 != NULL) (*r2).nameBuf[nameLen] = c;
2218 r.nameBuf[nameLen++] = c;
2220 _setBegin(r.name, r.nameBuf);
2221 _setLength(r.name, nameLen);
2223 _setBegin((*r2).name, (*r2).nameBuf);
2224 _setLength((*r2).name, nameLen);
2226 // Set up a default name if one hasn't been set
2228 itoa10(readCnt_, r.nameBuf);
2229 _setBegin(r.name, r.nameBuf);
2230 nameLen = strlen(r.nameBuf);
2231 _setLength(r.name, nameLen);
2233 itoa10(readCnt_, (*r2).nameBuf);
2234 _setBegin((*r2).name, (*r2).nameBuf);
2235 _setLength((*r2).name, nameLen);
2238 assert_gt(nameLen, 0);
2243 * Parse a single sequence from fb_ and store in r. Assume
2244 * that the next character obtained via fb_.get() is the first
2245 * character of the sequence and the sequence stops at the next
2246 * char upto (could be tab, newline, etc.).
2248 int parseSeq(ReadBuf& r, int& charsRead, int& trim5, char upto = '\t') {
2255 // This may be a primer character. If so, keep it in the
2256 // 'primer' field of the read buf and parse the rest of the
2259 if(asc2dnacat[c] > 0) {
2260 // First char is a DNA char
2261 int c2 = toupper(fb_.peek());
2262 // Second char is a color char
2263 if(asc2colcat[c2] > 0) {
2266 trim5 += 2; // trim primer and first color
2269 if(c < 0) { return -1; }
2273 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
2275 if(c == '.') c = 'N';
2277 assert_in(toupper(c), "ACGTN");
2278 if(begin++ >= trim5) {
2279 assert_neq(0, dna4Cat[c]);
2280 if(dstLen + 1 > 1024) {
2281 cerr << "Input file contained a pattern more than 1024 characters long. Please truncate" << endl
2282 << "reads and re-run Bowtie" << endl;
2285 r.patBufFw[dstLen] = charToDna5[c];
2290 if((c = fb_.get()) < 0) {
2294 dstLen -= this->trim3_;
2295 _setBegin (r.patFw, (Dna5*)r.patBufFw);
2296 _setLength(r.patFw, dstLen);
2301 * Parse a single quality string from fb_ and store in r.
2302 * Assume that the next character obtained via fb_.get() is
2303 * the first character of the quality string and the string stops
2304 * at the next char upto (could be tab, newline, etc.).
2306 int parseQuals(ReadBuf& r, int charsRead, int dstLen, int trim5,
2307 char& c2, char upto = '\t', char upto2 = -1)
2313 while (qualsRead < charsRead) {
2314 vector<string> s_quals;
2315 if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break;
2316 for (unsigned int j = 0; j < s_quals.size(); ++j) {
2317 char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_);
2319 if (qualsRead >= trim5) {
2320 size_t off = qualsRead - trim5;
2321 if(off >= 1024) tooManyQualities(r.name);
2326 } // done reading integer quality lines
2327 if (charsRead > qualsRead) tooFewQualities(r.name);
2329 // Non-integer qualities
2330 while((qualsRead < dstLen + trim5) && c >= 0) {
2333 if (c == ' ') wrongQualityFormat(r.name);
2335 // EOF occurred in the middle of a read - abort
2338 if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
2339 if (qualsRead >= trim5) {
2340 size_t off = qualsRead - trim5;
2341 if(off >= 1024) tooManyQualities(r.name);
2342 c = charToPhred33(c, solQuals_, phred64Quals_);
2351 if(qualsRead != dstLen + trim5) {
2355 _setBegin (r.qual, (char*)r.qualBuf);
2356 _setLength(r.qual, dstLen);
2357 while(c != upto && (upto2 == -1 || c != upto2)) {
2371 * Synchronized concrete pattern source for a list of FASTA files where
2372 * reads need to be extracted from long continuous sequences.
2374 class FastaContinuousPatternSource : public BufferedFilePatternSource {
2376 FastaContinuousPatternSource(
2378 const vector<string>& infiles,
2381 bool useSpinlock = true,
2382 const char *dumpfile = NULL,
2383 bool verbose = false,
2384 uint32_t skip = 0) :
2385 BufferedFilePatternSource(seed, infiles, NULL, false, useSpinlock,
2386 dumpfile, verbose, 0, 0, skip),
2387 length_(length), freq_(freq),
2388 eat_(length_-1), beginning_(true),
2389 nameChars_(0), bufCur_(0), subReadCnt_(0llu)
2392 assert_lt(length_, (size_t)ReadBuf::BUF_SIZE);
2395 virtual void reset() {
2396 BufferedFilePatternSource::reset();
2401 /// Read another pattern from a FASTA input file
2402 virtual void read(ReadBuf& r, uint32_t& patid) {
2406 seqan::clear(r.patFw);
2412 bool sawSpace = false;
2413 while(c != '\n' && c != '\r') {
2415 sawSpace = isspace(c);
2418 nameBuf_[nameChars_++] = c;
2423 while(c == '\n' || c == '\r') {
2427 nameBuf_[nameChars_++] = '_';
2429 int cat = dna4Cat[c];
2430 if(cat == 2) c = 'N';
2432 // Encountered non-DNA, non-IUPAC char; skip it
2436 buf_[bufCur_++] = c;
2437 if(bufCur_ == 1024) bufCur_ = 0;
2440 // Try to keep readCnt_ aligned with the offset
2441 // into the reference; that let's us see where
2442 // the sampling gaps are by looking at the read
2444 if(!beginning_) readCnt_++;
2447 for(size_t i = 0; i < length_; i++) {
2448 if(length_ - i <= bufCur_) {
2449 c = buf_[bufCur_ - (length_ - i)];
2452 c = buf_[bufCur_ - (length_ - i) + 1024];
2454 r.patBufFw [i] = charToDna5[c];
2457 _setBegin (r.patFw, (Dna5*)r.patBufFw);
2458 _setLength(r.patFw, length_);
2459 _setBegin (r.qual, r.qualBuf);
2460 _setLength(r.qual, length_);
2461 // Set up a default name if one hasn't been set
2462 for(size_t i = 0; i < nameChars_; i++) {
2463 r.nameBuf[i] = nameBuf_[i];
2465 itoa10(readCnt_ - subReadCnt_, &r.nameBuf[nameChars_]);
2466 _setBegin(r.name, r.nameBuf);
2467 _setLength(r.name, strlen(r.nameBuf));
2477 /// Shouldn't ever be here; it's not sensible to obtain read pairs
2478 // from a continuous input.
2479 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2480 cerr << "In FastaContinuousPatternSource.readPair()" << endl;
2485 * Reset state to be read for the next file.
2487 virtual void resetForNextFile() {
2490 bufCur_ = nameChars_ = 0;
2491 subReadCnt_ = readCnt_;
2495 size_t length_; /// length of reads to generate
2496 size_t freq_; /// frequency to sample reads
2497 size_t eat_; /// number of characters we need to skip before
2498 /// we have flushed all of the ambiguous or
2499 /// non-existent characters out of our read
2501 bool beginning_; /// skipping over the first read length?
2502 char buf_[1024]; /// read buffer
2503 char nameBuf_[1024];/// read buffer for name of fasta record being
2505 size_t nameChars_; /// number of name characters copied into buf
2506 size_t bufCur_; /// buffer cursor; points to where we should
2507 /// insert the next character
2508 uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
2509 /// the pat id to output (so it resets to 0 for
2510 /// each new sequence)
2514 * Read a FASTQ-format file.
2515 * See: http://maq.sourceforge.net/fastq.shtml
2517 class FastqPatternSource : public BufferedFilePatternSource {
2519 FastqPatternSource(uint32_t seed,
2520 const vector<string>& infiles,
2522 bool randomizeQuals = false,
2523 bool useSpinlock = true,
2524 const char *dumpfile = NULL,
2525 bool verbose = false,
2528 bool solexa_quals = false,
2529 bool phred64Quals = false,
2530 bool integer_quals = false,
2532 uint32_t skip = 0) :
2533 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals,
2534 useSpinlock, dumpfile, verbose,
2535 trim3, trim5, skip),
2537 solQuals_(solexa_quals),
2538 phred64Quals_(phred64Quals),
2539 intQuals_(integer_quals),
2543 virtual void reset() {
2546 BufferedFilePatternSource::reset();
2550 * Scan to the next FASTQ record (starting with @) and return the first
2551 * character of the record (which will always be @). Since the quality
2552 * line may start with @, we keep scanning until we've seen a line
2553 * beginning with @ where the line two lines back began with +.
2555 static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
2562 // If we couldn't find our desired '@' in the first 20
2563 // lines, it's time to give up
2565 // That firstc is '>' may be a hint that this is
2566 // actually a FASTA file, so return it intact
2572 if(c == -1) return -1;
2575 if(c == '@' && sawPlus && plusLine == (line-2)) {
2579 // Saw a '+' at the beginning of a line; remember where
2593 /// Read another pattern from a FASTQ input file
2594 virtual void read(ReadBuf& r, uint32_t& patid) {
2595 const int bufSz = ReadBuf::BUF_SIZE;
2604 // Pick off the first at
2608 c = getOverNewline(fb_);
2609 if(c < 0) { bail(r); return; }
2612 cerr << "Error: reads file does not look like a FASTQ file" << endl;
2619 // Read to the end of the id line, sticking everything after the '@'
2623 if(c < 0) { bail(r); return; }
2624 if(c == '\n' || c == '\r') {
2625 // Break at end of line, after consuming all \r's, \n's
2626 while(c == '\n' || c == '\r') {
2628 if(c < 0) { bail(r); return; }
2632 r.nameBuf[nameLen++] = c;
2633 if(nameLen > bufSz-2) {
2634 // Too many chars in read name; print friendly error message
2635 _setBegin(r.name, r.nameBuf);
2636 _setLength(r.name, nameLen);
2637 cerr << "FASTQ read name is too long; read names must be " << (bufSz-2) << " characters or fewer." << endl;
2638 cerr << "Beginning of bad read name: " << r.name << endl;
2642 _setBegin(r.name, r.nameBuf);
2643 assert_leq(nameLen, bufSz-2);
2644 _setLength(r.name, nameLen);
2645 // c now holds the first character on the line after the
2648 // fb_ now points just past the first character of a
2649 // sequence line, and c holds the first character
2651 uint8_t *sbuf = r.patBufFw;
2652 int dstLens[] = {0, 0, 0, 0};
2653 int *dstLenCur = &dstLens[0];
2654 int mytrim5 = this->trim5_;
2657 // This may be a primer character. If so, keep it in the
2658 // 'primer' field of the read buf and parse the rest of the
2661 if(asc2dnacat[c] > 0) {
2662 // First char is a DNA char
2663 int c2 = toupper(fb_.peek());
2664 // Second char is a color char
2665 if(asc2colcat[c2] > 0) {
2668 mytrim5 += 2; // trim primer and first color
2671 if(c < 0) { bail(r); return; }
2673 int trim5 = mytrim5;
2675 // Read had length 0; print warning (if not quiet) and quit
2677 cerr << "Warning: Skipping read (" << r.name << ") because it had length 0" << endl;
2679 peekToEndOfLine(fb_);
2684 // Convert color numbers to letters if necessary
2686 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
2688 if(c == '.') c = 'N';
2689 if(fuzzy_ && c == '-') c = 'A';
2691 // If it's past the 5'-end trim point
2692 assert_in(toupper(c), "ACGTN");
2693 if(charsRead >= trim5) {
2694 if((*dstLenCur) >= 1024) tooManySeqChars(r.name);
2695 sbuf[(*dstLenCur)++] = charToDna5[c];
2698 } else if(fuzzy_ && c == ' ') {
2699 trim5 = 0; // disable 5' trimming for now
2700 if(charsRead == 0) {
2705 if(altBufIdx >= 3) {
2706 cerr << "At most 3 alternate sequence strings permitted; offending read: " << r.name << endl;
2709 // Move on to the next alternate-sequence buffer
2710 sbuf = r.altPatBufFw[altBufIdx++];
2711 dstLenCur = &dstLens[altBufIdx];
2714 if(c < 0) { bail(r); return; }
2717 dstLen = dstLens[0];
2718 charsRead = dstLen + mytrim5;
2719 dstLen -= this->trim3_;
2720 // Set trimmed bounds of buffers
2721 _setBegin(r.patFw, (Dna5*)r.patBufFw);
2722 _setLength(r.patFw, dstLen);
2725 // Chew up the optional name on the '+' line
2726 peekToEndOfLine(fb_);
2728 // Now read the qualities
2733 if(color_ && r.primer != -1) mytrim5--;
2734 while (qualsRead < charsRead) {
2735 vector<string> s_quals;
2736 if(!tokenizeQualLine(fb_, buf, 4096, s_quals)) break;
2737 for (unsigned int j = 0; j < s_quals.size(); ++j) {
2738 char c = intToPhred33(atoi(s_quals[j].c_str()), solQuals_);
2740 if (qualsRead >= mytrim5) {
2741 size_t off = qualsRead - mytrim5;
2742 if(off >= 1024) tooManyQualities(r.name);
2747 } // done reading integer quality lines
2748 if(color_ && r.primer != -1) mytrim5++;
2749 if(qualsRead < charsRead-mytrim5) {
2750 tooFewQualities(r.name);
2751 } else if(qualsRead > charsRead-mytrim5+1) {
2752 tooManyQualities(r.name);
2754 if(qualsRead == charsRead-mytrim5+1 && color_ && r.primer != -1) {
2755 for(int i = 0; i < qualsRead-1; i++) {
2756 r.qualBuf[i] = r.qualBuf[i+1];
2759 _setBegin(r.qual, (char*)r.qualBuf);
2760 _setLength(r.qual, dstLen);
2761 peekOverNewline(fb_);
2763 // Non-integer qualities
2764 char *qbuf = r.qualBuf;
2767 if(color_ && r.primer != -1) trim5--;
2769 int qualsRead[4] = {0, 0, 0, 0};
2770 int *qualsReadCur = &qualsRead[0];
2773 if (!fuzzy_ && c == ' ') {
2774 wrongQualityFormat(r.name);
2775 } else if(c == ' ') {
2776 trim5 = 0; // disable 5' trimming for now
2777 if((*qualsReadCur) == 0) continue;
2778 if(altBufIdx >= 3) {
2779 cerr << "At most 3 alternate quality strings permitted; offending read: " << r.name << endl;
2782 qbuf = r.altQualBuf[altBufIdx++];
2783 qualsReadCur = &qualsRead[altBufIdx];
2786 if(c < 0) { bail(r); return; }
2787 if (c != '\r' && c != '\n') {
2788 if (*qualsReadCur >= trim5) {
2789 size_t off = (*qualsReadCur) - trim5;
2790 if(off >= 1024) tooManyQualities(r.name);
2791 c = charToPhred33(c, solQuals_, phred64Quals_);
2800 qualsRead[0] -= this->trim3_;
2801 int qRead = (int)(qualsRead[0] - itrim5);
2802 if(qRead < dstLen) {
2803 tooFewQualities(r.name);
2804 } else if(qRead > dstLen+1) {
2805 tooManyQualities(r.name);
2807 if(qRead == dstLen+1 && color_ && r.primer != -1) {
2808 for(int i = 0; i < dstLen; i++) {
2809 r.qualBuf[i] = r.qualBuf[i+1];
2812 _setBegin (r.qual, (char*)r.qualBuf);
2813 _setLength(r.qual, dstLen);
2816 // Trim from 3' end of alternate basecall and quality strings
2817 if(this->trim3_ > 0) {
2818 for(int i = 1; i < 4; i++) {
2819 assert_eq(qualsRead[i], dstLens[i]);
2820 qualsRead[i] = dstLens[i] =
2821 max<int>(0, dstLens[i] - this->trim3_);
2824 // Shift to RHS, and install in Strings
2825 assert_eq(0, r.alts);
2826 for(int i = 1; i < 4; i++) {
2827 if(qualsRead[i] == 0) continue;
2828 if(qualsRead[i] > dstLen) {
2829 // Shift everybody up
2830 int shiftAmt = qualsRead[i] - dstLen;
2831 for(int j = 0; j < dstLen; j++) {
2832 r.altQualBuf[i-1][j] = r.altQualBuf[i-1][j+shiftAmt];
2833 r.altPatBufFw[i-1][j] = r.altPatBufFw[i-1][j+shiftAmt];
2835 } else if (qualsRead[i] < dstLen) {
2836 // Shift everybody down
2837 int shiftAmt = dstLen - qualsRead[i];
2838 for(int j = dstLen-1; j >= shiftAmt; j--) {
2839 r.altQualBuf[i-1][j] = r.altQualBuf[i-1][j-shiftAmt];
2840 r.altPatBufFw[i-1][j] = r.altPatBufFw[i-1][j-shiftAmt];
2842 // Fill in unset positions
2843 for(int j = 0; j < shiftAmt; j++) {
2844 // '!' - indicates no alternate basecall at
2846 r.altQualBuf[i-1][j] = (char)33;
2849 _setBegin (r.altPatFw[i-1], (Dna5*)r.altPatBufFw[i-1]);
2850 _setLength(r.altPatFw[i-1], dstLen);
2851 _setBegin (r.altQual[i-1], (char*)r.altQualBuf[i-1]);
2852 _setLength(r.altQual[i-1], dstLen);
2857 if(c == '\r' || c == '\n') {
2858 c = peekOverNewline(fb_);
2860 c = peekToEndOfLine(fb_);
2863 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
2867 assert(c == -1 || c == '@');
2869 // Set up a default name if one hasn't been set
2871 itoa10(readCnt_, r.nameBuf);
2872 _setBegin(r.name, r.nameBuf);
2873 nameLen = strlen(r.nameBuf);
2874 _setLength(r.name, nameLen);
2876 r.trimmed3 = this->trim3_;
2877 r.trimmed5 = mytrim5;
2878 assert_gt(nameLen, 0);
2884 /// Read another read pair from a FASTQ input file
2885 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
2886 // (For now, we shouldn't ever be here)
2887 cerr << "In FastqPatternSource.readPair()" << endl;
2893 virtual void resetForNextFile() {
2896 virtual void dump(ostream& out,
2897 const String<Dna5>& seq,
2898 const String<char>& qual,
2899 const String<char>& name)
2901 out << "@" << name << endl << seq << endl << "+" << endl << qual << endl;
2906 * Do things we need to do if we have to bail in the middle of a
2907 * read, usually because we reached the end of the input without
2910 void bail(ReadBuf& r) {
2911 seqan::clear(r.patFw);
2924 * Read a Raw-format file (one sequence per line). No quality strings
2925 * allowed. All qualities are assumed to be 'I' (40 on the Phred-33
2928 class RawPatternSource : public BufferedFilePatternSource {
2930 RawPatternSource(uint32_t seed,
2931 const vector<string>& infiles,
2933 bool randomizeQuals = false,
2934 bool useSpinlock = true,
2935 const char *dumpfile = NULL,
2936 bool verbose = false,
2939 uint32_t skip = 0) :
2940 BufferedFilePatternSource(seed, infiles, NULL, randomizeQuals, useSpinlock,
2941 dumpfile, verbose, trim3, trim5, skip),
2942 first_(true), color_(color)
2944 virtual void reset() {
2946 BufferedFilePatternSource::reset();
2949 /// Read another pattern from a Raw input file
2950 virtual void read(ReadBuf& r, uint32_t& patid) {
2954 c = getOverNewline(this->fb_);
2955 if(c < 0) { bail(r); return; }
2956 assert(!isspace(c));
2958 int mytrim5 = this->trim5_;
2960 // Check that the first character is sane for a raw file
2963 if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
2964 if(cc == '.') cc = 'N';
2966 if(dna4Cat[cc] == 0) {
2967 cerr << "Error: reads file does not look like a Raw file" << endl;
2969 cerr << "Reads file looks like a FASTA file; please use -f" << endl;
2972 cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
2979 // This may be a primer character. If so, keep it in the
2980 // 'primer' field of the read buf and parse the rest of the
2983 if(asc2dnacat[c] > 0) {
2984 // First char is a DNA char
2985 int c2 = toupper(fb_.peek());
2986 // Second char is a color char
2987 if(asc2colcat[c2] > 0) {
2990 mytrim5 += 2; // trim primer and first color
2993 if(c < 0) { bail(r); return; }
2995 // _in now points just past the first character of a sequence
2996 // line, and c holds the first character
2997 while(!isspace(c) && c >= 0) {
2999 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
3001 if(c == '.') c = 'N';
3002 if(isalpha(c) && dstLen >= mytrim5) {
3003 size_t len = dstLen - mytrim5;
3004 if(len >= 1024) tooManyQualities(String<char>("(no name)"));
3005 r.patBufFw [len] = charToDna5[c];
3006 r.qualBuf[len] = 'I';
3008 } else if(isalpha(c)) dstLen++;
3009 if(isspace(fb_.peek())) break;
3012 if(dstLen >= (this->trim3_ + mytrim5)) {
3013 dstLen -= (this->trim3_ + mytrim5);
3017 _setBegin (r.patFw, (Dna5*)r.patBufFw);
3018 _setLength(r.patFw, dstLen);
3019 _setBegin (r.qual, r.qualBuf);
3020 _setLength(r.qual, dstLen);
3022 c = peekToEndOfLine(fb_);
3023 r.trimmed3 = this->trim3_;
3024 r.trimmed5 = mytrim5;
3025 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
3029 itoa10(readCnt_, r.nameBuf);
3030 _setBegin(r.name, r.nameBuf);
3031 nameLen = strlen(r.nameBuf);
3032 _setLength(r.name, nameLen);
3037 /// Read another read pair from a FASTQ input file
3038 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
3039 // (For now, we shouldn't ever be here)
3040 cerr << "In RawPatternSource.readPair()" << endl;
3046 virtual void resetForNextFile() {
3049 virtual void dump(ostream& out,
3050 const String<Dna5>& seq,
3051 const String<char>& qual,
3052 const String<char>& name)
3058 * Do things we need to do if we have to bail in the middle of a
3059 * read, usually because we reached the end of the input without
3062 void bail(ReadBuf& r) {
3063 seqan::clear(r.patFw);
3071 * Read a Raw-format file (one sequence per line). No quality strings
3072 * allowed. All qualities are assumed to be 'I' (40 on the Phred-33
3075 class ChainPatternSource : public BufferedFilePatternSource {
3077 ChainPatternSource(uint32_t seed,
3078 const vector<string>& infiles,
3080 const char *dumpfile,
3083 BufferedFilePatternSource(
3084 seed, infiles, NULL, false, useSpinlock, dumpfile, verbose, 0, 0, skip) { }
3088 /// Read another pattern from a Raw input file
3089 virtual void read(ReadBuf& r, uint32_t& patid) {
3093 seqan::clear(r.patFw);
3097 r.hitset.deserialize(fb_);
3098 } while(!r.hitset.initialized() && !fb_.eof());
3099 if(!r.hitset.initialized()) {
3101 seqan::clear(r.patFw);
3104 // Now copy the name/sequence/quals into r.name/r.patFw/r.qualFw
3105 _setBegin(r.name, (char*)r.nameBuf);
3106 _setCapacity(r.name, seqan::length(r.hitset.name));
3107 _setLength(r.name, seqan::length(r.hitset.name));
3108 memcpy(r.nameBuf, seqan::begin(r.hitset.name), seqan::length(r.hitset.name));
3109 _setBegin (r.patFw, (Dna5*)r.patBufFw);
3110 _setCapacity(r.patFw, seqan::length(r.hitset.seq));
3111 _setLength(r.patFw, seqan::length(r.hitset.seq));
3112 memcpy(r.patBufFw, seqan::begin(r.hitset.seq), seqan::length(r.hitset.seq));
3113 _setBegin (r.qual, r.qualBuf);
3114 _setCapacity(r.qual, seqan::length(r.hitset.qual));
3115 _setLength(r.qual, seqan::length(r.hitset.qual));
3116 memcpy(r.qualBuf, seqan::begin(r.hitset.qual), seqan::length(r.hitset.qual));
3118 r.readOrigBufLen = fb_.copyLastN(r.readOrigBuf);
3125 /// Read another read pair
3126 virtual void readPair(ReadBuf& ra, ReadBuf& rb, uint32_t& patid) {
3127 // (For now, we shouldn't ever be here)
3128 cerr << "In ChainPatternSource.readPair()" << endl;