Commit patch to not break on spaces.
[bowtie.git] / ref_read.cpp
1 #include "ref_read.h"
2
3 /**
4  * Reads past the next ambiguous or unambiguous stretch of sequence
5  * from the given FASTA file and returns its length.  Does not do
6  * anything with the sequence characters themselves; this is purely for
7  * measuring lengths.
8  */
9 RefRecord fastaRefReadSize(FileBuf& in,
10                            const RefReadInParams& rparms,
11                            bool first,
12                            BitpairOutFileBuf* bpout)
13 {
14         int c;
15         static int lastc = '>'; // last character seen
16
17         // RefRecord params
18         size_t len = 0; // 'len' counts toward total length
19         // 'off' counts number of ambiguous characters before first
20         // unambiguous character
21         size_t off = 0;
22
23         // Pick off the first carat and any preceding whitespace
24         if(first) {
25                 assert(!in.eof());
26                 lastc = '>';
27                 c = in.getPastWhitespace();
28                 if(in.eof()) {
29                         // Got eof right away; emit warning
30                         cerr << "Warning: Empty input file" << endl;
31                         lastc = -1;
32                         return RefRecord(0, 0, true);
33                 }
34                 assert(c == '>');
35         }
36
37         first = true;
38         // Skip to the end of the id line; if the next line is either
39         // another id line or a comment line, keep skipping
40         if(lastc == '>') {
41                 // Skip to the end of the name line
42                 do {
43                         if((c = in.getPastNewline()) == -1) {
44                                 // No more input
45                                 cerr << "Warning: Encountered empty reference sequence" << endl;
46                                 lastc = -1;
47                                 return RefRecord(0, 0, true);
48                         }
49                         if(c == '>') {
50                                 cerr << "Warning: Encountered empty reference sequence" << endl;
51                         }
52                         // continue until a non-name, non-comment line
53                 } while (c == '>');
54         } else {
55                 first = false; // not the first in a sequence
56                 off = 1; // The gap has already been consumed, so count it
57                 if((c = in.get()) == -1) {
58                         // Don't emit a warning, since this might legitimately be
59                         // a gap on the end of the final sequence in the file
60                         lastc = -1;
61                         return RefRecord(off, len, first);
62                 }
63         }
64
65         // Now skip to the first DNA character, counting gap characters
66         // as we go
67         int lc = -1; // last-DNA char variable for color conversion
68         while(true) {
69                 int cat = dna4Cat[c];
70                 if(rparms.nsToAs && cat == 2) c = 'A';
71                 if(cat == 1) {
72                         // This is a DNA character
73                         if(rparms.color) {
74                                 if(lc != -1) {
75                                         // Got two consecutive unambiguous DNAs
76                                         break; // to read-in loop
77                                 }
78                                 // Keep going; we need two consecutive unambiguous DNAs
79                                 lc = charToDna5[(int)c];
80                                 // The 'if(off > 0)' takes care of the case where
81                                 // the reference is entirely unambiguous and we don't
82                                 // want to incorrectly increment off.
83                                 if(off > 0) off++;
84                         } else {
85                                 break; // to read-in loop
86                         }
87                 } else if(cat == 2) {
88                         if(lc != -1 && off == 0) off++;
89                         lc = -1;
90                         off++; // skip over gap character and increment
91                 } else if(c == '>') {
92                         if(off > 0 && lastc == '>') {
93                                 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
94                         } else if(lastc == '>') {
95                                 cerr << "Warning: Encountered empty reference sequence" << endl;
96                         }
97                         lastc = '>';
98                         return RefRecord(off, 0, first);
99                 }
100                 c = in.get();
101                 if(c == -1) {
102                         // End-of-file
103                         if(off > 0 && lastc == '>') {
104                                 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
105                         } else if(lastc == '>') {
106                                 cerr << "Warning: Encountered empty reference sequence" << endl;
107                         }
108                         lastc = -1;
109                         return RefRecord(off, 0, first);
110                 }
111         }
112         assert(!rparms.color || (lc != -1));
113         assert_eq(1, dna4Cat[c]); // C must be unambiguous base
114         if(off > 0 && rparms.color && first) {
115                 // Handle the case where the first record has ambiguous
116                 // characters but we're in color space; one of those counts is
117                 // spurious
118                 off--;
119         }
120
121         // in now points just past the first character of a sequence
122         // line, and c holds the first character
123         while(c != -1 && c != '>') {
124                 if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
125                 uint8_t cat = dna4Cat[c];
126                 int cc = toupper(c);
127                 if(rparms.bisulfite && cc == 'C') c = cc = 'T';
128                 if(cat == 1) {
129                         // It's a DNA character
130                         assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
131                         // Consume it
132                         len++;
133                         // Output it
134                         if(bpout != NULL) {
135                                 if(rparms.color) {
136                                         // output color
137                                         bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
138                                 } else if(!rparms.color) {
139                                         // output nucleotide
140                                         bpout->write(charToDna5[c]);
141                                 }
142                         }
143                         lc = charToDna5[(int)c];
144                 } else if(cat == 2) {
145                         // It's an N or a gap
146                         lastc = c;
147                         assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
148                         return RefRecord(off, len, first);
149                 } else {
150                         // Not DNA and not a gap, ignore it
151 #ifndef NDEBUG
152                         if(!isspace(c)) {
153                                 cerr << "Unexpected character in sequence: ";
154                                 if(isprint(c)) {
155                                         cerr << ((char)c) << endl;
156                                 } else {
157                                         cerr << "(" << c << ")" << endl;
158                                 }
159                         }
160 #endif
161                 }
162                 c = in.get();
163         }
164         lastc = c;
165         return RefRecord(off, len, first);
166 }
167
168 static void
169 printRecords(ostream& os, const vector<RefRecord>& l) {
170         for(size_t i = 0; i < l.size(); i++) {
171                 os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
172         }
173 }
174
175 /**
176  * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
177  * modify 'src'.
178  */
179 void reverseRefRecords(const vector<RefRecord>& src,
180                                            vector<RefRecord>& dst,
181                                            bool recursive,
182                                            bool verbose)
183 {
184         dst.clear();
185         {
186                 vector<RefRecord> cur;
187                 for(int i = src.size()-1; i >= 0; i--) {
188                         bool first = (i == (int)src.size()-1 || src[i+1].first);
189                         if(src[i].len) {
190                                 cur.push_back(RefRecord(0, src[i].len, first));
191                                 first = false;
192                         }
193                         if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
194                 }
195                 bool mergedLast;
196                 for(int i = 0; i < (int)cur.size(); i++) {
197                         mergedLast = false;
198                         assert(cur[i].off == 0 || cur[i].len == 0);
199                         if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
200                                 dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
201                                 i++;
202                                 mergedLast = true;
203                         } else {
204                                 dst.push_back(cur[i]);
205                         }
206                 }
207         }
208         if(verbose) {
209                 cout << "Source: " << endl;
210                 printRecords(cout, src);
211                 cout << "Dest: " << endl;
212                 printRecords(cout, dst);
213         }
214 #ifndef NDEBUG
215         if(!recursive) {
216                 vector<RefRecord> tmp;
217                 reverseRefRecords(dst, tmp, true);
218                 assert_eq(tmp.size(), src.size());
219                 for(size_t i = 0; i < src.size(); i++) {
220                         assert_eq(src[i].len, tmp[i].len);
221                         assert_eq(src[i].off, tmp[i].off);
222                         assert_eq(src[i].first, tmp[i].first);
223                 }
224         }
225 #endif
226 }
227
228 /**
229  * Calculate a vector containing the sizes of all of the patterns in
230  * all of the given input files, in order.  Returns the total size of
231  * all references combined.  Rewinds each istream before returning.
232  */
233 std::pair<size_t, size_t>
234 fastaRefReadSizes(vector<FileBuf*>& in,
235                   vector<RefRecord>& recs,
236                   vector<uint32_t>& plens,
237                   const RefReadInParams& rparms,
238                   BitpairOutFileBuf* bpout,
239                   int& numSeqs)
240 {
241         uint32_t unambigTot = 0;
242         uint32_t bothTot = 0;
243         RefReadInParams rpcp = rparms;
244         assert_gt(in.size(), 0);
245         uint32_t both = 0, unambig = 0;
246         // For each input istream
247         for(size_t i = 0; i < in.size(); i++) {
248                 bool first = true;
249                 assert(!in[i]->eof());
250                 // For each pattern in this istream
251                 while(!in[i]->eof()) {
252                         RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
253                         // Update plens
254                         if(rec.first) {
255                                 if(unambig > 0) {
256                                         plens.push_back(both);
257                                 }
258                                 both = 0;
259                                 unambig = 0;
260                         }
261 #ifndef ACCOUNT_FOR_ALL_GAP_REFS
262                         if(rec.len == 0) rec.first = false;
263 #endif
264                         if((unambigTot + rec.len) < unambigTot) {
265                                 cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
266                                      << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
267                                      << "and index each independently." << endl;
268                                 throw 1;
269                         }
270                         // Add the length of this record.
271                         if(rec.first) numSeqs++;
272                         unambigTot += rec.len; unambig += rec.len;
273                         bothTot += rec.len;    both += rec.len;
274                         bothTot += rec.off;    both += rec.off;
275                         first = false;
276                         if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
277                         recs.push_back(rec);
278                 }
279                 // Reset the input stream
280                 in[i]->reset();
281                 assert(!in[i]->eof());
282 #ifndef NDEBUG
283                 // Check that it's really reset
284                 int c = in[i]->get();
285                 assert_eq('>', c);
286                 in[i]->reset();
287                 assert(!in[i]->eof());
288 #endif
289         }
290         assert_geq(bothTot, 0);
291         assert_geq(unambigTot, 0);
292         if(unambig > 0) {
293                 plens.push_back(both);
294         }
295         return make_pair(
296                 unambigTot, // total number of unambiguous DNA characters read
297                 bothTot); // total number of DNA characters read, incl. ambiguous ones
298 }