4 * Reads past the next ambiguous or unambiguous stretch of sequence
5 * from the given FASTA file and returns its length. Does not do
6 * anything with the sequence characters themselves; this is purely for
9 RefRecord fastaRefReadSize(FileBuf& in,
10 const RefReadInParams& rparms,
12 BitpairOutFileBuf* bpout)
15 static int lastc = '>'; // last character seen
18 size_t len = 0; // 'len' counts toward total length
19 // 'off' counts number of ambiguous characters before first
20 // unambiguous character
23 // Pick off the first carat and any preceding whitespace
27 c = in.getPastWhitespace();
29 // Got eof right away; emit warning
30 cerr << "Warning: Empty input file" << endl;
32 return RefRecord(0, 0, true);
38 // Skip to the end of the id line; if the next line is either
39 // another id line or a comment line, keep skipping
41 // Skip to the end of the name line
43 if((c = in.getPastNewline()) == -1) {
45 cerr << "Warning: Encountered empty reference sequence" << endl;
47 return RefRecord(0, 0, true);
50 cerr << "Warning: Encountered empty reference sequence" << endl;
52 // continue until a non-name, non-comment line
55 first = false; // not the first in a sequence
56 off = 1; // The gap has already been consumed, so count it
57 if((c = in.get()) == -1) {
58 // Don't emit a warning, since this might legitimately be
59 // a gap on the end of the final sequence in the file
61 return RefRecord(off, len, first);
65 // Now skip to the first DNA character, counting gap characters
67 int lc = -1; // last-DNA char variable for color conversion
70 if(rparms.nsToAs && cat == 2) c = 'A';
72 // This is a DNA character
75 // Got two consecutive unambiguous DNAs
76 break; // to read-in loop
78 // Keep going; we need two consecutive unambiguous DNAs
79 lc = charToDna5[(int)c];
80 // The 'if(off > 0)' takes care of the case where
81 // the reference is entirely unambiguous and we don't
82 // want to incorrectly increment off.
85 break; // to read-in loop
88 if(lc != -1 && off == 0) off++;
90 off++; // skip over gap character and increment
92 if(off > 0 && lastc == '>') {
93 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
94 } else if(lastc == '>') {
95 cerr << "Warning: Encountered empty reference sequence" << endl;
98 return RefRecord(off, 0, first);
103 if(off > 0 && lastc == '>') {
104 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
105 } else if(lastc == '>') {
106 cerr << "Warning: Encountered empty reference sequence" << endl;
109 return RefRecord(off, 0, first);
112 assert(!rparms.color || (lc != -1));
113 assert_eq(1, dna4Cat[c]); // C must be unambiguous base
114 if(off > 0 && rparms.color && first) {
115 // Handle the case where the first record has ambiguous
116 // characters but we're in color space; one of those counts is
121 // in now points just past the first character of a sequence
122 // line, and c holds the first character
123 while(c != -1 && c != '>') {
124 if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
125 uint8_t cat = dna4Cat[c];
127 if(rparms.bisulfite && cc == 'C') c = cc = 'T';
129 // It's a DNA character
130 assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
137 bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
138 } else if(!rparms.color) {
140 bpout->write(charToDna5[c]);
143 lc = charToDna5[(int)c];
144 } else if(cat == 2) {
145 // It's an N or a gap
147 assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
148 return RefRecord(off, len, first);
150 // Not DNA and not a gap, ignore it
153 cerr << "Unexpected character in sequence: ";
155 cerr << ((char)c) << endl;
157 cerr << "(" << c << ")" << endl;
165 return RefRecord(off, len, first);
169 printRecords(ostream& os, const vector<RefRecord>& l) {
170 for(size_t i = 0; i < l.size(); i++) {
171 os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
176 * Reverse the 'src' list of RefRecords into the 'dst' list. Don't
179 void reverseRefRecords(const vector<RefRecord>& src,
180 vector<RefRecord>& dst,
186 vector<RefRecord> cur;
187 for(int i = src.size()-1; i >= 0; i--) {
188 bool first = (i == (int)src.size()-1 || src[i+1].first);
190 cur.push_back(RefRecord(0, src[i].len, first));
193 if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
196 for(int i = 0; i < (int)cur.size(); i++) {
198 assert(cur[i].off == 0 || cur[i].len == 0);
199 if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
200 dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
204 dst.push_back(cur[i]);
209 cout << "Source: " << endl;
210 printRecords(cout, src);
211 cout << "Dest: " << endl;
212 printRecords(cout, dst);
216 vector<RefRecord> tmp;
217 reverseRefRecords(dst, tmp, true);
218 assert_eq(tmp.size(), src.size());
219 for(size_t i = 0; i < src.size(); i++) {
220 assert_eq(src[i].len, tmp[i].len);
221 assert_eq(src[i].off, tmp[i].off);
222 assert_eq(src[i].first, tmp[i].first);
229 * Calculate a vector containing the sizes of all of the patterns in
230 * all of the given input files, in order. Returns the total size of
231 * all references combined. Rewinds each istream before returning.
233 std::pair<size_t, size_t>
234 fastaRefReadSizes(vector<FileBuf*>& in,
235 vector<RefRecord>& recs,
236 vector<uint32_t>& plens,
237 const RefReadInParams& rparms,
238 BitpairOutFileBuf* bpout,
241 uint32_t unambigTot = 0;
242 uint32_t bothTot = 0;
243 RefReadInParams rpcp = rparms;
244 assert_gt(in.size(), 0);
245 uint32_t both = 0, unambig = 0;
246 // For each input istream
247 for(size_t i = 0; i < in.size(); i++) {
249 assert(!in[i]->eof());
250 // For each pattern in this istream
251 while(!in[i]->eof()) {
252 RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
256 plens.push_back(both);
261 #ifndef ACCOUNT_FOR_ALL_GAP_REFS
262 if(rec.len == 0) rec.first = false;
264 if((unambigTot + rec.len) < unambigTot) {
265 cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl
266 << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
267 << "and index each independently." << endl;
270 // Add the length of this record.
271 if(rec.first) numSeqs++;
272 unambigTot += rec.len; unambig += rec.len;
273 bothTot += rec.len; both += rec.len;
274 bothTot += rec.off; both += rec.off;
276 if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
279 // Reset the input stream
281 assert(!in[i]->eof());
283 // Check that it's really reset
284 int c = in[i]->get();
287 assert(!in[i]->eof());
290 assert_geq(bothTot, 0);
291 assert_geq(unambigTot, 0);
293 plens.push_back(both);
296 unambigTot, // total number of unambiguous DNA characters read
297 bothTot); // total number of DNA characters read, incl. ambiguous ones