ref_read.cpp

   1 #include "ref_read.h"
   2
   3 /**
   4  * Reads past the next ambiguous or unambiguous stretch of sequence
   5  * from the given FASTA file and returns its length.  Does not do
   6  * anything with the sequence characters themselves; this is purely for
   7  * measuring lengths.
   8  */
   9 RefRecord fastaRefReadSize(FileBuf& in,
  10                            const RefReadInParams& rparms,
  11                            bool first,
  12                            BitpairOutFileBuf* bpout)
  13 {
  14         int c;
  15         static int lastc = '>'; // last character seen
  16
  17         // RefRecord params
  18         size_t len = 0; // 'len' counts toward total length
  19         // 'off' counts number of ambiguous characters before first
  20         // unambiguous character
  21         size_t off = 0;
  22
  23         // Pick off the first carat and any preceding whitespace
  24         if(first) {
  25                 assert(!in.eof());
  26                 lastc = '>';
  27                 c = in.getPastWhitespace();
  28                 if(in.eof()) {
  29                         // Got eof right away; emit warning
  30                         cerr << "Warning: Empty input file" << endl;
  31                         lastc = -1;
  32                         return RefRecord(0, 0, true);
  33                 }
  34                 assert(c == '>');
  35         }
  36
  37         first = true;
  38         // Skip to the end of the id line; if the next line is either
  39         // another id line or a comment line, keep skipping
  40         if(lastc == '>') {
  41                 // Skip to the end of the name line
  42                 do {
  43                         if((c = in.getPastNewline()) == -1) {
  44                                 // No more input
  45                                 cerr << "Warning: Encountered empty reference sequence" << endl;
  46                                 lastc = -1;
  47                                 return RefRecord(0, 0, true);
  48                         }
  49                         if(c == '>') {
  50                                 cerr << "Warning: Encountered empty reference sequence" << endl;
  51                         }
  52                         // continue until a non-name, non-comment line
  53                 } while (c == '>');
  54         } else {
  55                 first = false; // not the first in a sequence
  56                 off = 1; // The gap has already been consumed, so count it
  57                 if((c = in.get()) == -1) {
  58                         // Don't emit a warning, since this might legitimately be
  59                         // a gap on the end of the final sequence in the file
  60                         lastc = -1;
  61                         return RefRecord(off, len, first);
  62                 }
  63         }
  64
  65         // Now skip to the first DNA character, counting gap characters
  66         // as we go
  67         int lc = -1; // last-DNA char variable for color conversion
  68         while(true) {
  69                 int cat = dna4Cat[c];
  70                 if(rparms.nsToAs && cat == 2) c = 'A';
  71                 if(cat == 1) {
  72                         // This is a DNA character
  73                         if(rparms.color) {
  74                                 if(lc != -1) {
  75                                         // Got two consecutive unambiguous DNAs
  76                                         break; // to read-in loop
  77                                 }
  78                                 // Keep going; we need two consecutive unambiguous DNAs
  79                                 lc = charToDna5[(int)c];
  80                                 // The 'if(off > 0)' takes care of the case where
  81                                 // the reference is entirely unambiguous and we don't
  82                                 // want to incorrectly increment off.
  83                                 if(off > 0) off++;
  84                         } else {
  85                                 break; // to read-in loop
  86                         }
  87                 } else if(cat == 2) {
  88                         if(lc != -1 && off == 0) off++;
  89                         lc = -1;
  90                         off++; // skip over gap character and increment
  91                 } else if(c == '>') {
  92                         if(off > 0 && lastc == '>') {
  93                                 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
  94                         } else if(lastc == '>') {
  95                                 cerr << "Warning: Encountered empty reference sequence" << endl;
  96                         }
  97                         lastc = '>';
  98                         return RefRecord(off, 0, first);
  99                 }
 100                 c = in.get();
 101                 if(c == -1) {
 102                         // End-of-file
 103                         if(off > 0 && lastc == '>') {
 104                                 cerr << "Warning: Encountered reference sequence with only gaps" << endl;
 105                         } else if(lastc == '>') {
 106                                 cerr << "Warning: Encountered empty reference sequence" << endl;
 107                         }
 108                         lastc = -1;
 109                         return RefRecord(off, 0, first);
 110                 }
 111         }
 112         assert(!rparms.color || (lc != -1));
 113         assert_eq(1, dna4Cat[c]); // C must be unambiguous base
 114         if(off > 0 && rparms.color && first) {
 115                 // Handle the case where the first record has ambiguous
 116                 // characters but we're in color space; one of those counts is
 117                 // spurious
 118                 off--;
 119         }
 120
 121         // in now points just past the first character of a sequence
 122         // line, and c holds the first character
 123         while(c != -1 && c != '>') {
 124                 if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
 125                 uint8_t cat = dna4Cat[c];
 126                 int cc = toupper(c);
 127                 if(rparms.bisulfite && cc == 'C') c = cc = 'T';
 128                 if(cat == 1) {
 129                         // It's a DNA character
 130                         assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
 131                         // Consume it
 132                         len++;
 133                         // Output it
 134                         if(bpout != NULL) {
 135                                 if(rparms.color) {
 136                                         // output color
 137                                         bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
 138                                 } else if(!rparms.color) {
 139                                         // output nucleotide
 140                                         bpout->write(charToDna5[c]);
 141                                 }
 142                         }
 143                         lc = charToDna5[(int)c];
 144                 } else if(cat == 2) {
 145                         // It's an N or a gap
 146                         lastc = c;
 147                         assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
 148                         return RefRecord(off, len, first);
 149                 } else {
 150                         // Not DNA and not a gap, ignore it
 151 #ifndef NDEBUG
 152                         if(!isspace(c)) {
 153                                 cerr << "Unexpected character in sequence: ";
 154                                 if(isprint(c)) {
 155                                         cerr << ((char)c) << endl;
 156                                 } else {
 157                                         cerr << "(" << c << ")" << endl;
 158                                 }
 159                         }
 160 #endif
 161                 }
 162                 c = in.get();
 163         }
 164         lastc = c;
 165         return RefRecord(off, len, first);
 166 }
 167
 168 static void
 169 printRecords(ostream& os, const vector<RefRecord>& l) {
 170         for(size_t i = 0; i < l.size(); i++) {
 171                 os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
 172         }
 173 }
 174
 175 /**
 176  * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
 177  * modify 'src'.
 178  */
 179 void reverseRefRecords(const vector<RefRecord>& src,
 180                                            vector<RefRecord>& dst,
 181                                            bool recursive,
 182                                            bool verbose)
 183 {
 184         dst.clear();
 185         {
 186                 vector<RefRecord> cur;
 187                 for(int i = src.size()-1; i >= 0; i--) {
 188                         bool first = (i == (int)src.size()-1 || src[i+1].first);
 189                         if(src[i].len) {
 190                                 cur.push_back(RefRecord(0, src[i].len, first));
 191                                 first = false;
 192                         }
 193                         if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
 194                 }
 195                 bool mergedLast;
 196                 for(int i = 0; i < (int)cur.size(); i++) {
 197                         mergedLast = false;
 198                         assert(cur[i].off == 0 || cur[i].len == 0);
 199                         if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
 200                                 dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
 201                                 i++;
 202                                 mergedLast = true;
 203                         } else {
 204                                 dst.push_back(cur[i]);
 205                         }
 206                 }
 207         }
 208         if(verbose) {
 209                 cout << "Source: " << endl;
 210                 printRecords(cout, src);
 211                 cout << "Dest: " << endl;
 212                 printRecords(cout, dst);
 213         }
 214 #ifndef NDEBUG
 215         if(!recursive) {
 216                 vector<RefRecord> tmp;
 217                 reverseRefRecords(dst, tmp, true);
 218                 assert_eq(tmp.size(), src.size());
 219                 for(size_t i = 0; i < src.size(); i++) {
 220                         assert_eq(src[i].len, tmp[i].len);
 221                         assert_eq(src[i].off, tmp[i].off);
 222                         assert_eq(src[i].first, tmp[i].first);
 223                 }
 224         }
 225 #endif
 226 }
 227
 228 /**
 229  * Calculate a vector containing the sizes of all of the patterns in
 230  * all of the given input files, in order.  Returns the total size of
 231  * all references combined.  Rewinds each istream before returning.
 232  */
 233 std::pair<size_t, size_t>
 234 fastaRefReadSizes(vector<FileBuf*>& in,
 235                   vector<RefRecord>& recs,
 236                   vector<uint32_t>& plens,
 237                   const RefReadInParams& rparms,
 238                   BitpairOutFileBuf* bpout,
 239                   int& numSeqs)
 240 {
 241         uint32_t unambigTot = 0;
 242         uint32_t bothTot = 0;
 243         RefReadInParams rpcp = rparms;
 244         assert_gt(in.size(), 0);
 245         uint32_t both = 0, unambig = 0;
 246         // For each input istream
 247         for(size_t i = 0; i < in.size(); i++) {
 248                 bool first = true;
 249                 assert(!in[i]->eof());
 250                 // For each pattern in this istream
 251                 while(!in[i]->eof()) {
 252                         RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
 253                         // Update plens
 254                         if(rec.first) {
 255                                 if(unambig > 0) {
 256                                         plens.push_back(both);
 257                                 }
 258                                 both = 0;
 259                                 unambig = 0;
 260                         }
 261 #ifndef ACCOUNT_FOR_ALL_GAP_REFS
 262                         if(rec.len == 0) rec.first = false;
 263 #endif
 264                         if((unambigTot + rec.len) < unambigTot) {
 265                                 cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
 266                                      << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
 267                                      << "and index each independently." << endl;
 268                                 throw 1;
 269                         }
 270                         // Add the length of this record.
 271                         if(rec.first) numSeqs++;
 272                         unambigTot += rec.len; unambig += rec.len;
 273                         bothTot += rec.len;    both += rec.len;
 274                         bothTot += rec.off;    both += rec.off;
 275                         first = false;
 276                         if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
 277                         recs.push_back(rec);
 278                 }
 279                 // Reset the input stream
 280                 in[i]->reset();
 281                 assert(!in[i]->eof());
 282 #ifndef NDEBUG
 283                 // Check that it's really reset
 284                 int c = in[i]->get();
 285                 assert_eq('>', c);
 286                 in[i]->reset();
 287                 assert(!in[i]->eof());
 288 #endif
 289         }
 290         assert_geq(bothTot, 0);
 291         assert_geq(unambigTot, 0);
 292         if(unambig > 0) {
 293                 plens.push_back(both);
 294         }
 295         return make_pair(
 296                 unambigTot, // total number of unambiguous DNA characters read
 297                 bothTot); // total number of DNA characters read, incl. ambiguous ones
 298 }