htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp

   1 /*
   2  * getReadsInSnps:
   3  * This program takes a set of snps in a custom tab format, and a set of short mapped reads, and evaluates
   4  * the sequencing overlap over those snps. Additionally, a mixture model is fit and used to classify the
   5  * snps as homozygous or heterozygous.
   6  *
   7  * In the final report, the output is:
   8  * <snp id> <chromosome> <position> <reference base> <a count> <c count> <g count> <t count> <total count> <snp call>
   9  * where snp call is one of:
  10  * -1: no call was made (not enough examples to make a call)
  11  * 0: the snp is homozygous
  12  * 1: the snp is heterozygous
  13  *
  14  */
#include <sys/types.h>
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <string>
#include <vector>
  24
  25 #include <gsl/gsl_statistics.h>
  26
  27 #include "chrom_list.h"
  28 #include "util.h"
  29
  30 #define WINDOW 25
  31 #define PI 3.14159265358979323846
  32
  33 #define DEBUG
  34
  35 #ifdef DEBUG
  36 //#include "duma.h"
  37 #endif
  38
  39 using namespace std;
  40
  41 void strrevcomp(string& output, const string& input);
  42
  43 double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); }
  44
  45 class Loci {
  46   public:
  47     string chr;
  48     unsigned int pos;
  49
  50     Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; }
  51     Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; }
  52     Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; }
  53
  54     bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } }
  55     bool operator<=(const Loci& a) const { if(this->chr == a.chr) { return this->pos <= a.pos; } else { return this->chr < a.chr; } }
  56
  57     bool operator>=(const Loci& a) const { if(this->chr == a.chr) { return this->pos >= a.pos; } else { return this->chr > a.chr; } }
  58     bool operator>(const Loci& a) const { if(this->chr == a.chr) { return this->pos > a.pos; } else { return this->chr > a.chr; } }
  59
  60     int operator-(const Loci& a) const { if(this->chr == a.chr) { return this->pos - a.pos; } else { return INT_MAX; } }
  61
  62 };
  63
  64
  65 class Read : public Loci {
  66   public:
  67     string seq;
  68
  69     unsigned int length() const { return seq.length(); }
  70
  71     Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; }
  72     Read(const Read& r) : Loci(r) { this->seq = r.seq; }
  73     Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;}
  74
  75     char operator[](size_t off) const {
  76       if(off < seq.length()) { return seq[off]; } else { return -1; }
  77     }
  78 };
  79
  80 typedef vector<Read> Reads;
  81
  82 class Nuc {
  83   protected:
  84     unsigned int n[4];
  85
  86     //background nucleotide probabilities
  87     //
  88     //can change this to a background model class later if needed
  89     static double qA;
  90     static double qC;
  91     static double qG;
  92     static double qT;
  93
  94     // pseudocount to avoid divide-by-zero errors
  95     static double pseudocount;
  96
  97   public:
  98
  99     Nuc() {
 100       n[0] = 0;
 101       n[1] = 0;
 102       n[2] = 0;
 103       n[3] = 0;
 104     }
 105
 106     Nuc(const Nuc& n) {
 107       this->n[0] = n.at(0);
 108       this->n[1] = n.at(1);
 109       this->n[2] = n.at(2);
 110       this->n[3] = n.at(3);
 111     }
 112
 113     Nuc& operator=(const Nuc& n) {
 114       if (this != &n) {
 115         this->n[0] = n.at(0);
 116         this->n[1] = n.at(1);
 117         this->n[2] = n.at(2);
 118         this->n[3] = n.at(3);
 119       }
 120       return *this;
 121     }
 122
 123     void add_nuc(char b) {
 124       switch(b) {
 125         case 'a': case 'A': n[0]++; break;
 126         case 'c': case 'C': n[1]++; break;
 127         case 'g': case 'G': n[2]++; break;
 128         case 't': case 'T': n[3]++; break;
 129       };
 130     }
 131
 132     char nth_nuc(unsigned int i) {
 133       if(i >= size()) { return 'N'; }
 134       else if(i < n[0]) { return 'A'; }
 135       else if(i < n[0] + n[1]) { return 'C'; }
 136       else if(i < n[0] + n[1] + n[2]) { return 'G'; }
 137       else { return 'T'; }
 138     }
 139
 140     unsigned int size() { return n[0] + n[1] + n[2] + n[3]; }
 141
 142     unsigned int& operator[](size_t b) { return n[b]; }
 143     unsigned int at(size_t b) const { return n[b]; }
 144
 145     double RE() {
 146
 147       /*
 148       double total = n[0] + n[1] + n[2] + n[3] + 4*Nuc::pseudocount;
 149       double pA = (Nuc::pseudocount + n[0]) / total;
 150       double pC = (Nuc::pseudocount + n[1]) / total;
 151       double pG = (Nuc::pseudocount + n[2]) / total;
 152       double pT = (Nuc::pseudocount + n[3]) / total;
 153
 154       return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT);
 155       */
 156
 157       unsigned int max = 0; unsigned int max_idx = 0;
 158       for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
 159       unsigned int max2 = 0; unsigned int max_idx2 = 0;
 160       for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } }
 161
 162       if(max_idx == max_idx2) { max_idx2++; }
 163
 164       double total = n[max_idx] + n[max_idx2];
 165       double p1 = (Nuc::pseudocount + n[max_idx]) / total;
 166       double p2 = (Nuc::pseudocount + n[max_idx2]) / total;
 167
 168       return p1*log2(p1/Nuc::qA) + p2*log2(p2/Nuc::qC);
 169     }
 170
 171     char consensus() {
 172       unsigned int max = 0; unsigned int max_idx = 0;
 173       for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
 174
 175       unsigned int max2 = 0; unsigned int max_idx2 = 0;
 176       for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } }
 177
 178       //For now pick arbitrary zygosity thresholds. Later, update to use mixture model.
 179       char c = '\0';
 180       if(RE() >= 1.25) {
 181         //homozygous
 182         switch(max_idx) {
 183           case 0: c = 'A'; break;
 184           case 1: c = 'C'; break;
 185           case 2: c = 'G'; break;
 186           case 3: c = 'T'; break;
 187         }
 188       } else {
 189         switch(max_idx | max_idx2) {
 190           case 1: c = 'M'; break;  //A,C
 191           case 2: c = 'R'; break;  //A,G
 192           case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G
 193           case 4: c = 'Y'; break; //C,T
 194           case 5: c = 'K'; break; //G,T
 195         }
 196       }
 197
 198       unsigned int N = size();
 199       if(N == 0) { return ' '; } else if(N < 10) { return tolower(c); } else { return c; }
 200     }
 201 };
 202
 203 double Nuc::pseudocount = 1e-10;
 204 double Nuc::qA = 0.25;
 205 double Nuc::qC = 0.25;
 206 double Nuc::qG = 0.25;
 207 double Nuc::qT = 0.25;
 208
 209 class Window : public Loci {
 210   public:
 211     //optional name for the window
 212     string name;
 213
 214     //the consensus sequence
 215     string sequence;
 216     unsigned int length;
 217     vector<Nuc> seq;
 218
 219     unsigned int reads;
 220
 221     Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) {
 222       this->name = name;
 223       this->length = length;
 224       this->sequence = "";
 225       seq.resize(length);
 226
 227       this->reads = 0;
 228     }
 229
 230     ~Window() {
 231       seq.clear();
 232     }
 233
 234     Window(const Window& r) : Loci(r) {
 235       this->name = r.name;
 236       this->length = r.length;
 237       this->seq = r.seq;
 238       this->sequence = r.sequence;
 239       this->reads = r.reads;
 240     }
 241
 242     Window& operator=(const Window& r) {
 243       Loci::operator=(r);
 244       this->name = r.name;
 245       this->length = r.length;
 246       this->sequence = r.sequence;
 247       this->seq = r.seq;
 248       this->reads = r.reads;
 249       return *this;
 250     }
 251
 252     void set_sequence(string s) {
 253       this->sequence = s;
 254       unsigned int a;
 255       //clear out endlines
 256       while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); }
 257     }
 258
 259    string get_sequence() {
 260      return this->sequence;
 261    }
 262
 263     void add_read(const Read& r) {
 264       if(this->chr != r.chr) return;
 265       int offset = r - (*this);
 266       this->reads++;
 267       for(unsigned int i = 0; i < r.length(); i++) {
 268         int seq_idx = offset + i;
 269         if(seq_idx < 0 || (seq_idx >= 0 && (unsigned)seq_idx > this->length) ) { continue; }
 270         seq[offset + i].add_nuc(r[i]);
 271       }
 272     }
 273
 274     void print_consensus(ostream& o) {
 275       unsigned int line_len = 100;
 276       o << ">Consensus for: " << name << " (" << this->chr << ":" << this->pos << "-" << this->pos+this->length << ")" << endl;
 277
 278       for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
 279         unsigned int max_len = sequence.length() - offset;
 280         unsigned int len = (line_len > max_len)?max_len:line_len;
 281         o << sequence.substr(offset,len) << endl;
 282         for(unsigned int i = offset; i < offset+len; i++) {
 283           char ref = toupper(sequence[i]);
 284           char con = toupper(seq[i].consensus());
 285           if(con == ' ') {
 286             o << ' ';
 287           } else if(con == ref) {
 288             o << '|';
 289           } else {
 290             o << '*';
 291           }
 292         }
 293         o << endl;
 294         for(unsigned int i = offset; i < offset+len; i++) { o << seq[i].consensus(); }
 295         o << endl << endl;
 296       }
 297     }
 298
 299     void print_fasta(ostream& o) {
 300       unsigned int line_len = 100;
 301
 302       string output = "";
 303       vector<string> variants;
 304
 305       for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
 306         unsigned int max_len = sequence.length() - offset;
 307         unsigned int len = (line_len > max_len)?max_len:line_len;
 308         for(unsigned int i = offset; i < offset+len; i++) {
 309           char con = toupper(seq[i].consensus());
 310           // weak consensus if lowercase.
 311           bool weak_con = seq[i].consensus() != con;
 312           if(con == ' ' || weak_con || toupper(con) == toupper(sequence[i])) {
 313             output += sequence[i];
 314           } else {
 315             output += con;
 316             char buff[128];
 317             sprintf(buff,"%d:%c>%c",i,sequence[i],con);
 318             string var = buff;
 319             variants.push_back(var);
 320           }
 321         }
 322         //output += '\n';
 323       }
 324       o << ">" << this->chr << ":" << this->pos << "-" << this->pos+this->length << "|";
 325       for(vector<string>::iterator i = variants.begin(); i != variants.end(); ++i) {
 326         o << (*i);
 327         if(i+1 != variants.end()) o << "|";
 328       }
 329       o << endl << output << endl;
 330     }
 331
 332     void print_RE(ostream& o) {
 333       for(unsigned int i = 0; i < sequence.length(); i++) {
 334           char ref = toupper(sequence[i]);
 335           char con = toupper(seq[i].consensus());
 336           if(con != ' ' && con != ref) {
 337             o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i][0] << "," << seq[i][1] << "," << seq[i][2] << "," << seq[i][3] << "]" << endl;
 338           }
 339       }
 340     }
 341
 342     void print_logo(ostream& o) {
 343       unsigned int max = 0;
 344       for(unsigned int i = 0; i < sequence.length(); i++) {
 345         if(seq[i].size() > max) { max = seq[i].size(); }
 346       }
 347
 348       for(unsigned int i = 0; i < max; i++) {
 349         for(unsigned int j = 0; j < sequence.length(); j++) {
 350           o << seq[j].nth_nuc(i);
 351         }
 352         o << endl;
 353       }
 354     }
 355 };
 356
 357 typedef vector<Window> Windows;
 358
 359 class SNP : public Loci {
 360   public:
 361
 362     string name;
 363     char reference_base;
 364     char consensus[4]; // represent the consensus sequence in order. Most often, only the first 1 or 2 will matter.
 365     unsigned int A;
 366     unsigned int C;
 367     unsigned int G;
 368     unsigned int T;
 369     unsigned int N;
 370     unsigned int total;
 371
 372     SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) {
 373       this->name = name;
 374       this->A = 0;
 375       this->C = 0;
 376       this->G = 0;
 377       this->T = 0;
 378       this->N = 0;
 379
 380       this->reference_base = reference_base;
 381     }
 382
 383     SNP(const SNP& h) : Loci(h) {
 384       this->name = h.name;
 385       this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
 386       this->reference_base = h.reference_base;
 387     }
 388
 389     SNP& operator=(const SNP& h) {
 390       this->name = h.name;
 391       this->chr = h.chr;
 392       this->pos = h.pos;
 393       this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
 394       this->reference_base = h.reference_base;
 395       return *this;
 396     }
 397
 398     void eval_consensus() {
 399       // if A is the max
 400       if(A >= C & A >= G & A >= T) { consensus[0] = 'A';
 401         if(C >= G & C >= T) { consensus[1] = 'C';
 402           if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
 403           else       { consensus[2] = 'T'; consensus[3] = 'G'; }
 404         } else if(G >= C & G >= T) { consensus[1] = 'G';
 405           if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
 406           else       { consensus[2] = 'T'; consensus[3] = 'C'; }
 407         } else { consensus[1] = 'T';
 408           if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
 409           else       { consensus[2] = 'G'; consensus[3] = 'C'; }
 410         }
 411
 412
 413       // if C is the max
 414       } else if(C >= A & C >= G & C >= T) { consensus[0] = 'C';
 415         if(A >= G & A >= T) { consensus[1] = 'A';
 416           if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
 417           else       { consensus[2] = 'T'; consensus[3] = 'G'; }
 418         } else if(G >= A & G >= T) { consensus[1] = 'G';
 419           if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
 420           else       { consensus[2] = 'T'; consensus[3] = 'A'; }
 421         } else { consensus[1] = 'T';
 422           if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
 423           else       { consensus[2] = 'G'; consensus[3] = 'A'; }
 424         }
 425       } else if(G >= A & G >= C & G >= T) { consensus[0] = 'G';
 426         if(A >= C & A >= T) { consensus[1] = 'A';
 427           if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
 428           else       { consensus[2] = 'T'; consensus[3] = 'C'; }
 429         } else if(C >= A & C >= T) { consensus[1] = 'C';
 430           if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
 431           else       { consensus[2] = 'T'; consensus[3] = 'A'; }
 432         } else { consensus[1] = 'T';
 433           if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
 434           else       { consensus[2] = 'C'; consensus[3] = 'A'; }
 435         }
 436       } else { consensus[0] = 'T';
 437         if(A >= C & A >= G) { consensus[1] = 'A';
 438           if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
 439           else       { consensus[2] = 'G'; consensus[3] = 'C'; }
 440         } else if(C >= A & C >= G) { consensus[1] = 'C';
 441           if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
 442           else       { consensus[2] = 'G'; consensus[3] = 'A'; }
 443         } else { consensus[1] = 'G';
 444           if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
 445           else       { consensus[2] = 'C'; consensus[3] = 'A'; }
 446         }
 447       }
 448     }
 449
 450     void add_read(char nuc) {
 451       switch(nuc) {
 452         case 'a':
 453         case 'A':
 454           A++; break;
 455         case 'c':
 456         case 'C':
 457           C++; break;
 458         case 'g':
 459         case 'G':
 460           G++; break;
 461         case 't':
 462         case 'T':
 463           T++; break;
 464         default:
 465           N++; break;
 466       }
 467       total++;
 468     }
 469
 470   void clean(unsigned int threshold) {
 471     if(A <= threshold) { A = 0; }
 472     if(C <= threshold) { C = 0; }
 473     if(G <= threshold) { G = 0; }
 474     if(T <= threshold) { T = 0; }
 475     total = A + C + G + T;
 476     eval_consensus();
 477   }
 478
 479   double RE(unsigned int th = 2) {
 480     if(total == 0) { return 0.0; }
 481
 482     double pA = (double)( ((A<th)?A:0)+1e-10)/(double)total;
 483     double pC = (double)( ((C<th)?C:0)+1e-10)/(double)total;
 484     double pG = (double)( ((G<th)?G:0)+1e-10)/(double)total;
 485     double pT = (double)( ((T<th)?T:0)+1e-10)/(double)total;
 486
 487     //assume equal distribution of A,C,G,T
 488     double l2 = log(2);
 489     return pA*log(pA/0.25)/l2 + pC*log(pC/0.25)/l2 + pG*log(pG/0.25)/l2 + pT*log(pT/0.25)/l2;
 490   }
 491 };
 492
 493 typedef vector<SNP> SNPs;
 494
 495 //Class to calulate mixture model. Very not general right now, but should be easy enough to make more general
 496 //if the need arises
 497 class GaussianMixture {
 498
 499 public:
 500   double p;
 501   double u1;
 502   double s1;
 503   double u2;
 504   double s2;
 505   double Q;
 506
 507   unsigned int N;
 508
 509   double delta;
 510
 511   GaussianMixture(SNPs& snps, double delta = 1e-10) {
 512     //initialize model
 513     this->p = 0.5;
 514     //model 1: heterozygous
 515     this->u1 = 1.0;
 516     this->s1 = 0.5;
 517
 518     //model 2: homozygous
 519     this->u2 = 2.0;
 520     this->s2 = 0.5;
 521
 522     this->delta = delta;
 523   }
 524
 525   bool classify(double x) {
 526     return(norm_prob(x,u1,s1) >= norm_prob(x,u2,s2)) ;
 527   }
 528
 529   // Use EM to fit gaussian mixture model to discern heterozygous from homozygous snps
 530   void fit(SNPs& snps, unsigned int count_th) {
 531     //initialize relative entropy and probabilities
 532     vector<double> RE;
 533     vector<double> pr;
 534     for(unsigned int i = 0; i < snps.size(); ++i) {
 535       if(snps[i].total >= 8) {
 536         RE.push_back(snps[i].RE(count_th));
 537         pr.push_back(0.5);
 538       }
 539     }
 540
 541     this->N = RE.size();
 542
 543     cerr << this->N << " snps checked\n";
 544
 545     //calculate initial expectation
 546     this->Q = 0.0;
 547     for(unsigned int i = 0; i < N; ++i) {
 548       Q +=    pr[i]    * (log( this->p ) - log(sqrt(2.0*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
 549       Q += (1.0-pr[i]) * (log(1-this->p) - log(sqrt(2.0*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
 550     }
 551
 552     cerr << "Q: " << this->Q << endl;
 553
 554     double Q_new = 0;
 555     //expectation maximization to iteratively update pi's and parameters until Q settles down.
 556     while(1) {
 557       cerr << "loop Q: " << Q << endl;
 558       Q_new = 0.0;
 559
 560       double p_sum = 0.0, q_sum = 0.0, u1_sum = 0.0, u2_sum = 0.0;
 561       for(unsigned int i = 0; i < N; ++i) {
 562         pr[i] = pr[i]*norm_prob(RE[i],this->u1,this->s1) /
 563                 (pr[i]*norm_prob(RE[i],this->u1,this->s1) + (1.0 - pr[i])*(norm_prob(RE[i],this->u2,this->s2)));
 564
 565         p_sum += pr[i];
 566         q_sum += (1.0 - pr[i]);
 567
 568         u1_sum += pr[i]*RE[i];
 569         u2_sum += (1.0 - pr[i])*RE[i];
 570
 571         Q_new += pr[i]      * (log( this->p ) - log(sqrt(2*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
 572         Q_new += (1.0-pr[i])* (log(1-this->p) - log(sqrt(2*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
 573       }
 574
 575       //update variables of the distributions (interwoven with pi loop to save cpu)
 576       this->p  = p_sum / this->N;
 577       this->u1 = u1_sum / p_sum;
 578       this->u2 = u2_sum / q_sum;
 579
 580       double s1_sum = 0.0, s2_sum = 0.0;
 581       for(unsigned int i = 0; i < N; ++i) {
 582         s1_sum +=    pr[i]    * (RE[i] - this->u1)*(RE[i] - this->u1);
 583         s2_sum += (1.0-pr[i]) * (RE[i] - this->u2)*(RE[i] - this->u2);
 584       }
 585
 586       this->s1 = sqrt(s1_sum/p_sum);
 587       this->s2 = sqrt(s2_sum/q_sum);
 588
 589       if(fabs(this->Q - Q_new) < 1e-5) { break; }
 590       this->Q = Q_new;
 591     }
 592     cerr << "Q: " << Q << endl;
 593   }
 594
 595   void print_model() {
 596     cout << "Q: " << Q << " p: " << p << " norm(" << u1 << "," << s1 << ");norm(" << u2 << "," << s2 << ")" << endl;
 597   }
 598 };
 599
 600
 601 ostream &operator<<( ostream &out, const SNP &h ) {
 602   out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total;
 603
 604   return out;
 605 }
 606
 607
 608 void read_snps(const char* filename, SNPs& snps) {
 609   string delim("\t");
 610
 611   ifstream feat(filename);
 612   size_t N = 0;
 613   while(feat.peek() != EOF) {
 614     char line[1024];
 615     feat.getline(line,1024,'\n');
 616     N++;
 617     string line_str(line);
 618     vector<string> fields;
 619     split(line_str, delim, fields);
 620     if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
 621
 622     string name = fields[0];
 623     string chr = fields[1];
 624     unsigned int pos = atoi(fields[2].c_str());
 625     char base = (fields[3])[0];
 626
 627     SNP snp(name,chr,pos,base);
 628     snps.push_back(snp);
 629   }
 630
 631   //sort the features so we can run through it once
 632   std::stable_sort(snps.begin(),snps.end());
 633   feat.close();
 634
 635   cerr << "Found and sorted " << snps.size() << " snps." << endl;
 636 }
 637
 638 void read_align_file(char* filename, Reads& features) {
 639   string delim(" \n");
 640   string location_delim(":");
 641   char strand_str[2]; strand_str[1] = '\0';
 642   ifstream seqs(filename);
 643   string name("");
 644   while(seqs.peek() != EOF) {
 645     char line[2048];
 646     seqs.getline(line,2048,'\n');
 647
 648     string line_str(line);
 649     vector<string> fields;
 650     split(line_str, delim, fields);
 651     if(fields.size() != 7) { continue; }
 652
 653     vector<string> location; split(fields[3], location_delim, location);
 654     string chr = location[0];
 655     if(chr == "newcontam") { continue; }
 656     if(chr == "NA") { continue; }
 657
 658     int pos = atoi(location[1].c_str());
 659     bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
 660
 661     string seq;
 662     if(strand == 0) { seq = fields[0]; } else { strrevcomp(seq,fields[0]); }
 663     Read read(chr,pos,seq);
 664     features.push_back(read);
 665   }
 666   seqs.close();
 667
 668   //sort the data so we can run through it once
 669   std::sort(features.begin(),features.end());
 670   cerr << "Found and sorted " << features.size() << " reads." << endl;
 671 }
 672
 673 void read_window_file(const char* filename, Windows& ws) {
 674   string delim("\t");
 675
 676   ifstream win_file(filename);
 677
 678   unsigned int N = 0;
 679   while(win_file.peek() != EOF) {
 680     char line[1024];
 681     win_file.getline(line,1024,'\n');
 682     N++;
 683     string line_str(line);
 684     vector<string> fields;
 685     split(line_str, delim, fields);
 686     if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
 687
 688     string name = fields[0];
 689     string chr = fields[1];
 690     if(chr == "NA") { continue; }
 691     if(chr == "contam") { continue; }
 692     int start = atoi(fields[2].c_str());
 693     int stop = atoi(fields[3].c_str());
 694
 695     Window w(name,chr,start,stop-start+1);
 696     ws.push_back(w);
 697   }
 698
 699   //sort the features so we can run through it once
 700   std::stable_sort(ws.begin(),ws.end());
 701   win_file.close();
 702
 703   cerr << "Found and sorted " << ws.size() << " windows." << endl;
 704 }
 705
 706 void count_read_in_features(Windows& windows, Reads& data) {
 707   Windows::iterator wind_it = windows.begin();
 708
 709   for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
 710     //skip to first feature after read
 711     string start_chr = wind_it->chr;
 712     while(wind_it != windows.end() && (wind_it->chr < i->chr || (wind_it->chr == i->chr && wind_it->pos + wind_it->length < i->pos) )) {
 713       wind_it++;
 714     }
 715
 716     //stop if we have run out of features.
 717     if(wind_it == windows.end()) { break; }
 718
 719     if(i->pos + i->length > wind_it->pos && i->pos < (wind_it->pos + wind_it->length)) {
 720       wind_it->add_read(*i);
 721     }
 722   }
 723 }
 724
 725 void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) {
 726         char temp[1024];
 727
 728         string chrom = peaks[0].chr;
 729         string chrom_filename = chrom_filenames[chrom];
 730         ifstream chrom_file(chrom_filename.c_str());
 731         chrom_file.getline(temp, 1024);
 732         size_t offset = chrom_file.gcount();
 733         for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
 734           if(i->chr != chrom) {
 735             chrom = i->chr;
 736             chrom_filename = chrom_filenames[chrom];
 737             chrom_file.close(); chrom_file.open(chrom_filename.c_str());
 738             chrom_file.getline(temp, 1024);
 739             offset = chrom_file.gcount();
 740           }
 741           unsigned int begin = i->pos - 1;
 742           unsigned int end   = i->pos+i->length-2;
 743
 744           unsigned int begin_pos = offset + (int)begin/50 + begin;
 745           unsigned int end_pos = offset + (int)end/50 + end;
 746
 747           unsigned int read_len = end_pos - begin_pos;
 748           char buffer[read_len+1];
 749           chrom_file.seekg(begin_pos, ios_base::beg);
 750           chrom_file.read(buffer, read_len);
 751           buffer[read_len] = '\0';
 752           i->set_sequence(buffer);
 753         }
 754         chrom_file.close();
 755 }
 756
 757
 758 int main(int argc, char** argv) {
 759   if(argc != 4) { cerr << "Usage: " << argv[0] << " read_file window_file chromosome_file\n"; exit(1); }
 760
 761   char read_filename[1024]; strcpy(read_filename,argv[1]);
 762   char window_filename[1024]; strcpy(window_filename,argv[2]);
 763   char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
 764
 765   Windows windows; read_window_file(window_filename, windows);
 766   ChromList reference_seq(chromosome_filename);
 767
 768   retrieveSequenceData(reference_seq, windows);
 769
 770   cerr << "Established reference sequences\n";
 771
 772   Reads reads; read_align_file(read_filename, reads);
 773
 774   count_read_in_features(windows, reads);
 775
 776   for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) {
 777     //w->print_consensus(cout);
 778     //w->print_logo(cout);
 779     w->print_RE(cerr);
 780     w->print_fasta(cout);
 781   }
 782 }
 783
 784 void strrevcomp(string& output, const string& input)
 785 {
 786   output = input;
 787   unsigned int i;
 788
 789   for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; }
 790
 791   for (unsigned int p1 = 0; p1 < output.length(); ++p1) {
 792     if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; }
 793     else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; }
 794     else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; }
 795     else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; }
 796   }
 797 }
 798