11 #include <seqan/sequence.h>
13 #include "assert_helpers.h"
18 using namespace seqan;
21 * Encapsulates a stretch of the reference containing only unambiguous
22 * characters. From an ordered list of RefRecords, one can (almost)
23 * deduce the "shape" of the reference sequences (almost because we
24 * lose information about stretches of ambiguous characters at the end
25 * of reference sequences).
28 RefRecord() : off(), len(), first() { }
29 RefRecord(uint32_t _off, uint32_t _len, bool _first) :
30 off(_off), len(_len), first(_first)
33 RefRecord(FILE *in, bool swap) {
35 if(!fread(&off, 4, 1, in)) {
36 cerr << "Error reading RefRecord offset from FILE" << endl;
39 if(swap) off = endianSwapU32(off);
40 if(!fread(&len, 4, 1, in)) {
41 cerr << "Error reading RefRecord offset from FILE" << endl;
44 if(swap) len = endianSwapU32(len);
45 first = fgetc(in) ? true : false;
48 RefRecord(int in, bool swap) {
49 off = readU32(in, swap);
50 len = readU32(in, swap);
52 if(!read(in, &c, 1)) {
53 cerr << "Error reading RefRecord 'first' flag" << endl;
56 first = (c ? true : false);
59 void write(std::ostream& out, bool be) {
60 writeU32(out, off, be);
61 writeU32(out, len, be);
62 out.put(first ? 1 : 0);
65 uint32_t off; /// Offset of the first character in the record
66 uint32_t len; /// Length of the record
67 bool first; /// Whether this record is the first for a reference sequence
71 REF_READ_FORWARD = 0, // don't reverse reference sequence
72 REF_READ_REVERSE, // reverse entire reference sequence
73 REF_READ_REVERSE_EACH // reverse each unambiguous stretch of reference
77 * Parameters governing treatment of references as they're read in.
79 struct RefReadInParams {
80 RefReadInParams(bool col, int r, bool nsToA, bool bisulf) :
81 color(col), reverse(r), nsToAs(nsToA), bisulfite(bisulf) { }
82 // extract colors from reference
84 // reverse each reference sequence before passing it along
86 // convert ambiguous characters to As
88 // bisulfite-convert the reference
95 const RefReadInParams& rparms,
97 BitpairOutFileBuf* bpout = NULL);
99 extern std::pair<size_t, size_t>
101 vector<FileBuf*>& in,
102 vector<RefRecord>& recs,
103 vector<uint32_t>& plens,
104 const RefReadInParams& rparms,
105 BitpairOutFileBuf* bpout,
110 const vector<RefRecord>& src,
111 vector<RefRecord>& dst,
112 bool recursive = false,
113 bool verbose = false);
116 * Reads the next sequence from the given FASTA file and appends it to
117 * the end of dst, optionally reversing it.
119 template <typename TStr>
120 static RefRecord fastaRefReadAppend(FileBuf& in,
123 RefReadInParams& rparms,
126 typedef typename Value<TStr>::Type TVal;
128 static int lastc = '>';
130 c = in.getPastWhitespace();
132 cerr << "Reference file does not seem to be a FASTA file" << endl;
137 assert_neq(-1, lastc);
144 size_t ilen = length(dst);
146 // Chew up the id line; if the next line is either
147 // another id line or a comment line, keep chewing
148 int lc = -1; // last-DNA char variable for color conversion
150 if(c == '>' || c == '#') {
153 if((c = in.getPastNewline()) == -1) {
165 if(c == '\n' || c == '\r') {
166 while(c == '\r' || c == '\n') c = in.get();
173 if (name) name->push_back(c);
175 // c holds the first character on the line after the name
177 } while (c == '>' || c == '#');
179 ASSERT_ONLY(int cc = toupper(c));
180 assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
184 // Skip over an initial stretch of gaps or ambiguous characters.
185 // For colorspace we skip until we see two consecutive unambiguous
186 // characters (i.e. the first unambiguous color).
188 int cat = dna4Cat[c];
189 if(rparms.nsToAs && cat == 2) {
193 if(rparms.bisulfite && cc == 'C') c = cc = 'T';
195 // This is a DNA character
198 // Got two consecutive unambiguous DNAs
199 break; // to read-in loop
201 // Keep going; we need two consecutive unambiguous DNAs
202 lc = charToDna5[(int)c];
203 // The 'if(off > 0)' takes care of the case where
204 // the reference is entirely unambiguous and we don't
205 // want to incorrectly increment off.
208 break; // to read-in loop
210 } else if(cat == 2) {
211 if(lc != -1 && off == 0) {
216 } else if(c == '>') {
226 if(first && rparms.color && off > 0) {
227 // Handle the case where the first record has ambiguous
228 // characters but we're in color space; one of those counts is
232 assert(!rparms.color || lc != -1);
233 assert_eq(1, dna4Cat[c]);
235 // in now points just past the first character of a sequence
236 // line, and c holds the first character
238 // Note: can't have a comment in the middle of a sequence,
239 // though a comment can end a sequence
240 int cat = dna4Cat[c];
244 if(!rparms.color || lc != -1) len++;
245 // Add it to referenece buffer
247 appendValue(dst, (Dna)dinuc2color[charToDna5[(int)c]][lc]);
248 } else if(!rparms.color) {
249 appendValue(dst, (Dna)(char)c);
251 assert_lt((uint8_t)(Dna)dst[length(dst)-1], 4);
252 lc = charToDna5[(int)c];
255 if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
256 if (c == -1 || c == '>' || c == '#' || dna4Cat[c] == 2) {
260 if(rparms.bisulfite && toupper(c) == 'C') c = 'T';
264 // Optionally reverse the portion that we just appended.
265 // ilen = length of buffer before this last sequence was appended.
266 if(rparms.reverse == REF_READ_REVERSE_EACH) {
267 // Find limits of the portion we just appended
268 size_t nlen = length(dst);
269 assert_eq(nlen - ilen, len);
271 size_t halfway = ilen + (len>>1);
272 // Reverse it in-place
273 for(size_t i = ilen; i < halfway; i++) {
274 size_t diff = i-ilen;
275 size_t j = nlen-diff-1;
282 return RefRecord(off, len, first);
285 #endif /*ndef REF_READ_H_*/