3 * This program takes a set of snps in a custom tab format, and a set of short mapped reads, and evaluates
4 * the sequencing overlap over those snps. Additionally, a miaxture model is fit and used to classify the
5 * snps as homozygous or heterozygous.
7 * In the final report, the output is:
8 * <snp id> <chromosome> <position> <reference base> <a count> <c count> <g count> <t count> <total count> <snp call>
9 * where snp call is one of:
10 * -1: no call was made (not enough examples to make a call)
11 * 0: the snp is homozygous
12 * 1: the snp is heterozygous
15 #include <sys/types.h>
25 #include <gsl/gsl_statistics.h>
27 #include "chrom_list.h"
31 #define PI 3.14159265358979323846
41 void strrevcomp(string& output, const string& input);
43 double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); }
50 Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; }
51 Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; }
52 Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; }
54 bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } }
55 bool operator<=(const Loci& a) const { if(this->chr == a.chr) { return this->pos <= a.pos; } else { return this->chr < a.chr; } }
57 bool operator>=(const Loci& a) const { if(this->chr == a.chr) { return this->pos >= a.pos; } else { return this->chr > a.chr; } }
58 bool operator>(const Loci& a) const { if(this->chr == a.chr) { return this->pos > a.pos; } else { return this->chr > a.chr; } }
60 int operator-(const Loci& a) const { if(this->chr == a.chr) { return this->pos - a.pos; } else { return INT_MAX; } }
65 class Read : public Loci {
69 unsigned int length() const { return seq.length(); }
71 Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; }
72 Read(const Read& r) : Loci(r) { this->seq = r.seq; }
73 Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;}
75 char operator[](size_t off) const {
76 if(off < seq.length()) { return seq[off]; } else { return -1; }
80 typedef vector<Read> Reads;
86 //background nucleotide probabilities
88 //can change this to a background model class later if needed
94 // pseudocount to avoid divide-by-zero errors
95 static double pseudocount;
107 this->n[0] = n.at(0);
108 this->n[1] = n.at(1);
109 this->n[2] = n.at(2);
110 this->n[3] = n.at(3);
113 Nuc& operator=(const Nuc& n) {
115 this->n[0] = n.at(0);
116 this->n[1] = n.at(1);
117 this->n[2] = n.at(2);
118 this->n[3] = n.at(3);
123 void add_nuc(char b) {
125 case 'a': case 'A': n[0]++; break;
126 case 'c': case 'C': n[1]++; break;
127 case 'g': case 'G': n[2]++; break;
128 case 't': case 'T': n[3]++; break;
132 char nth_nuc(unsigned int i) {
133 if(i >= size()) { return 'N'; }
134 else if(i < n[0]) { return 'A'; }
135 else if(i < n[0] + n[1]) { return 'C'; }
136 else if(i < n[0] + n[1] + n[2]) { return 'G'; }
140 unsigned int size() { return n[0] + n[1] + n[2] + n[3]; }
142 unsigned int& operator[](size_t b) { return n[b]; }
143 unsigned int at(size_t b) const { return n[b]; }
148 double total = n[0] + n[1] + n[2] + n[3] + 4*Nuc::pseudocount;
149 double pA = (Nuc::pseudocount + n[0]) / total;
150 double pC = (Nuc::pseudocount + n[1]) / total;
151 double pG = (Nuc::pseudocount + n[2]) / total;
152 double pT = (Nuc::pseudocount + n[3]) / total;
154 return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT);
157 unsigned int max = 0; unsigned int max_idx = 0;
158 for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
159 unsigned int max2 = 0; unsigned int max_idx2 = 0;
160 for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } }
162 if(max_idx == max_idx2) { max_idx2++; }
164 double total = n[max_idx] + n[max_idx2];
165 double p1 = (Nuc::pseudocount + n[max_idx]) / total;
166 double p2 = (Nuc::pseudocount + n[max_idx2]) / total;
168 return p1*log2(p1/Nuc::qA) + p2*log2(p2/Nuc::qC);
172 unsigned int N = size();
173 if(N == 0) { return ' '; }
174 unsigned int max = 0; unsigned int max_idx = 0;
175 for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
177 unsigned int max2 = 0; unsigned int max_idx2 = 0;
178 for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } }
180 //For now pick arbitrary zygosity thresholds. Later, update to use mixture model.
185 case 0: c = 'A'; break;
186 case 1: c = 'C'; break;
187 case 2: c = 'G'; break;
188 case 3: c = 'T'; break;
191 switch(max_idx | max_idx2) {
192 case 1: c = 'M'; break; //A,C
193 case 2: c = 'R'; break; //A,G
194 case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G
195 case 4: c = 'Y'; break; //C,T
196 case 5: c = 'K'; break; //G,T
200 if(N < 10) { return tolower(c); } else { return c; }
204 double Nuc::pseudocount = 1e-10;
205 double Nuc::qA = 0.25;
206 double Nuc::qC = 0.25;
207 double Nuc::qG = 0.25;
208 double Nuc::qT = 0.25;
210 class Window : public Loci {
212 //optional name for the window
215 //the consensus sequence
222 Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) {
224 this->length = length;
227 if(length > 10000) { cerr << "ERROR: window of size " << length << endl; exit(1); }
237 Window(const Window& r) : Loci(r) {
239 this->length = r.length;
241 this->sequence = r.sequence;
242 this->reads = r.reads;
245 Window& operator=(const Window& r) {
246 if (this == &r) return *this;
249 this->length = r.length;
250 this->sequence = r.sequence;
252 this->reads = r.reads;
256 void set_sequence(string s) {
260 while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); }
263 string get_sequence() {
264 return this->sequence;
267 void add_read(const Read& r) {
268 if(this->chr != r.chr) return;
269 int offset = r - (*this);
271 for(unsigned int i = 0; i < r.length(); i++) {
272 int seq_idx = offset + i;
273 if(seq_idx > 0 && (unsigned)seq_idx >= seq.size()) { cerr << "Error: offset: " << offset << " seq_idx " << seq_idx << " size: " << seq.size() << " seqlen: " << sequence.length() << endl; }
274 if(seq_idx < 0 || (seq_idx >= 0 && (unsigned)seq_idx > this->length) ) { continue; }
275 seq[offset + i].add_nuc(r[i]);
279 void print_consensus(ostream& o) {
280 unsigned int line_len = 100;
281 o << ">Consensus for: " << name << " (" << this->chr << ":" << this->pos << "-" << this->pos+this->length << ")" << endl;
283 for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
284 unsigned int max_len = sequence.length() - offset;
285 unsigned int len = (line_len > max_len)?max_len:line_len;
286 o << sequence.substr(offset,len) << endl;
287 for(unsigned int i = offset; i < offset+len; i++) {
288 char ref = toupper(sequence[i]);
289 char con = toupper(seq[i].consensus());
292 } else if(con == ref) {
299 for(unsigned int i = offset; i < offset+len; i++) { o << seq[i].consensus(); }
304 void print_fasta(ostream& o) {
305 unsigned int line_len = 100;
308 vector<string> variants;
310 for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
311 unsigned int max_len = sequence.length() - offset;
312 unsigned int len = (line_len > max_len)?max_len:line_len;
313 for(unsigned int i = offset; i < offset+len; i++) {
314 if(i >= seq.size()) {
315 cerr << "Error: offset of " << i << " exceeds read data: " << seq.size() << " in string of " << sequence.length() << endl;
318 char con = toupper(seq[i].consensus());
319 // weak consensus if lowercase.
320 bool weak_con = seq[i].consensus() != con;
321 if(con == ' ' || weak_con || toupper(con) == toupper(sequence[i])) {
322 output.append(1,sequence[i]);
324 output.append(1,con);
326 sprintf(buff,"%d:%c>%c",i,sequence[i],con);
328 variants.push_back(var);
333 o << ">" << this->chr << ":" << this->pos << "-" << this->pos+this->length << "|";
334 for(vector<string>::iterator i = variants.begin(); i != variants.end(); ++i) {
336 if(i+1 != variants.end()) o << "|";
338 o << endl << output << endl;
341 void print_RE(ostream& o) {
342 for(unsigned int i = 0; i < sequence.length(); i++) {
343 char ref = toupper(sequence[i]);
344 char con = toupper(seq[i].consensus());
345 if(con != ' ' && con != ref) {
346 o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i][0] << "," << seq[i][1] << "," << seq[i][2] << "," << seq[i][3] << "]" << endl;
351 void print_logo(ostream& o) {
352 unsigned int max = 0;
353 for(unsigned int i = 0; i < sequence.length(); i++) {
354 if(seq[i].size() > max) { max = seq[i].size(); }
357 for(unsigned int i = 0; i < max; i++) {
358 for(unsigned int j = 0; j < sequence.length(); j++) {
359 o << seq[j].nth_nuc(i);
366 typedef vector<Window> Windows;
368 class SNP : public Loci {
373 char consensus[4]; // represent the consensus sequence in order. Most often, only the first 1 or 2 will matter.
381 SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) {
389 this->reference_base = reference_base;
392 SNP(const SNP& h) : Loci(h) {
394 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
395 this->reference_base = h.reference_base;
398 SNP& operator=(const SNP& h) {
402 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
403 this->reference_base = h.reference_base;
407 void eval_consensus() {
409 if(A >= C & A >= G & A >= T) { consensus[0] = 'A';
410 if(C >= G & C >= T) { consensus[1] = 'C';
411 if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
412 else { consensus[2] = 'T'; consensus[3] = 'G'; }
413 } else if(G >= C & G >= T) { consensus[1] = 'G';
414 if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
415 else { consensus[2] = 'T'; consensus[3] = 'C'; }
416 } else { consensus[1] = 'T';
417 if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
418 else { consensus[2] = 'G'; consensus[3] = 'C'; }
423 } else if(C >= A & C >= G & C >= T) { consensus[0] = 'C';
424 if(A >= G & A >= T) { consensus[1] = 'A';
425 if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
426 else { consensus[2] = 'T'; consensus[3] = 'G'; }
427 } else if(G >= A & G >= T) { consensus[1] = 'G';
428 if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
429 else { consensus[2] = 'T'; consensus[3] = 'A'; }
430 } else { consensus[1] = 'T';
431 if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
432 else { consensus[2] = 'G'; consensus[3] = 'A'; }
434 } else if(G >= A & G >= C & G >= T) { consensus[0] = 'G';
435 if(A >= C & A >= T) { consensus[1] = 'A';
436 if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
437 else { consensus[2] = 'T'; consensus[3] = 'C'; }
438 } else if(C >= A & C >= T) { consensus[1] = 'C';
439 if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
440 else { consensus[2] = 'T'; consensus[3] = 'A'; }
441 } else { consensus[1] = 'T';
442 if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
443 else { consensus[2] = 'C'; consensus[3] = 'A'; }
445 } else { consensus[0] = 'T';
446 if(A >= C & A >= G) { consensus[1] = 'A';
447 if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
448 else { consensus[2] = 'G'; consensus[3] = 'C'; }
449 } else if(C >= A & C >= G) { consensus[1] = 'C';
450 if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
451 else { consensus[2] = 'G'; consensus[3] = 'A'; }
452 } else { consensus[1] = 'G';
453 if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
454 else { consensus[2] = 'C'; consensus[3] = 'A'; }
459 void add_read(char nuc) {
479 void clean(unsigned int threshold) {
480 if(A <= threshold) { A = 0; }
481 if(C <= threshold) { C = 0; }
482 if(G <= threshold) { G = 0; }
483 if(T <= threshold) { T = 0; }
484 total = A + C + G + T;
488 double RE(unsigned int th = 2) {
489 if(total == 0) { return 0.0; }
491 double pA = (double)( ((A<th)?A:0)+1e-10)/(double)total;
492 double pC = (double)( ((C<th)?C:0)+1e-10)/(double)total;
493 double pG = (double)( ((G<th)?G:0)+1e-10)/(double)total;
494 double pT = (double)( ((T<th)?T:0)+1e-10)/(double)total;
496 //assume equal distribution of A,C,G,T
498 return pA*log(pA/0.25)/l2 + pC*log(pC/0.25)/l2 + pG*log(pG/0.25)/l2 + pT*log(pT/0.25)/l2;
502 typedef vector<SNP> SNPs;
504 //Class to calulate mixture model. Very not general right now, but should be easy enough to make more general
506 class GaussianMixture {
520 GaussianMixture(SNPs& snps, double delta = 1e-10) {
523 //model 1: heterozygous
527 //model 2: homozygous
534 bool classify(double x) {
535 return(norm_prob(x,u1,s1) >= norm_prob(x,u2,s2)) ;
538 // Use EM to fit gaussian mixture model to discern heterozygous from homozygous snps
539 void fit(SNPs& snps, unsigned int count_th) {
540 //initialize relative entropy and probabilities
543 for(unsigned int i = 0; i < snps.size(); ++i) {
544 if(snps[i].total >= 8) {
545 RE.push_back(snps[i].RE(count_th));
552 cerr << this->N << " snps checked\n";
554 //calculate initial expectation
556 for(unsigned int i = 0; i < N; ++i) {
557 Q += pr[i] * (log( this->p ) - log(sqrt(2.0*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
558 Q += (1.0-pr[i]) * (log(1-this->p) - log(sqrt(2.0*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
561 cerr << "Q: " << this->Q << endl;
564 //expectation maximization to iteratively update pi's and parameters until Q settles down.
566 cerr << "loop Q: " << Q << endl;
569 double p_sum = 0.0, q_sum = 0.0, u1_sum = 0.0, u2_sum = 0.0;
570 for(unsigned int i = 0; i < N; ++i) {
571 pr[i] = pr[i]*norm_prob(RE[i],this->u1,this->s1) /
572 (pr[i]*norm_prob(RE[i],this->u1,this->s1) + (1.0 - pr[i])*(norm_prob(RE[i],this->u2,this->s2)));
575 q_sum += (1.0 - pr[i]);
577 u1_sum += pr[i]*RE[i];
578 u2_sum += (1.0 - pr[i])*RE[i];
580 Q_new += pr[i] * (log( this->p ) - log(sqrt(2*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
581 Q_new += (1.0-pr[i])* (log(1-this->p) - log(sqrt(2*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
584 //update variables of the distributions (interwoven with pi loop to save cpu)
585 this->p = p_sum / this->N;
586 this->u1 = u1_sum / p_sum;
587 this->u2 = u2_sum / q_sum;
589 double s1_sum = 0.0, s2_sum = 0.0;
590 for(unsigned int i = 0; i < N; ++i) {
591 s1_sum += pr[i] * (RE[i] - this->u1)*(RE[i] - this->u1);
592 s2_sum += (1.0-pr[i]) * (RE[i] - this->u2)*(RE[i] - this->u2);
595 this->s1 = sqrt(s1_sum/p_sum);
596 this->s2 = sqrt(s2_sum/q_sum);
598 if(fabs(this->Q - Q_new) < 1e-5) { break; }
601 cerr << "Q: " << Q << endl;
605 cout << "Q: " << Q << " p: " << p << " norm(" << u1 << "," << s1 << ");norm(" << u2 << "," << s2 << ")" << endl;
610 ostream &operator<<( ostream &out, const SNP &h ) {
611 out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total;
617 void read_snps(const char* filename, SNPs& snps) {
620 ifstream feat(filename);
622 while(feat.peek() != EOF) {
624 feat.getline(line,1024,'\n');
626 string line_str(line);
627 vector<string> fields;
628 split(line_str, delim, fields);
629 if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
631 string name = fields[0];
632 string chr = fields[1];
633 unsigned int pos = atoi(fields[2].c_str());
634 char base = (fields[3])[0];
636 SNP snp(name,chr,pos,base);
640 //sort the features so we can run through it once
641 std::stable_sort(snps.begin(),snps.end());
644 cerr << "Found and sorted " << snps.size() << " snps." << endl;
647 void read_align_file(char* filename, Reads& features) {
649 string location_delim(":");
650 char strand_str[2]; strand_str[1] = '\0';
651 ifstream seqs(filename);
653 while(seqs.peek() != EOF) {
655 seqs.getline(line,2048,'\n');
657 string line_str(line);
658 vector<string> fields;
659 split(line_str, delim, fields);
660 if(fields.size() != 7) { continue; }
662 vector<string> location; split(fields[3], location_delim, location);
663 string chr = location[0];
664 if(chr == "newcontam") { continue; }
665 if(chr == "NA") { continue; }
667 int pos = atoi(location[1].c_str());
668 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
671 if(strand == 0) { seq = fields[0]; } else { strrevcomp(seq,fields[0]); }
672 Read read(chr,pos,seq);
673 features.push_back(read);
677 //sort the data so we can run through it once
678 std::sort(features.begin(),features.end());
679 cerr << "Found and sorted " << features.size() << " reads." << endl;
682 void read_window_file(const char* filename, Windows& ws) {
685 ifstream win_file(filename);
688 while(win_file.peek() != EOF) {
690 win_file.getline(line,1024,'\n');
692 string line_str(line);
693 vector<string> fields;
694 split(line_str, delim, fields);
695 if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
697 string name = fields[0];
698 string chr = fields[1];
699 if(chr == "NA") { continue; }
700 if(chr == "contam") { continue; }
701 int start = atoi(fields[2].c_str());
702 int stop = atoi(fields[3].c_str());
704 Window w(name,chr,start,stop-start+1);
708 //sort the features so we can run through it once
709 std::stable_sort(ws.begin(),ws.end());
712 cerr << "Found and sorted " << ws.size() << " windows." << endl;
715 void count_read_in_features(Windows& windows, Reads& data) {
716 Windows::iterator wind_it = windows.begin();
718 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
719 //skip to first feature after read
720 string start_chr = wind_it->chr;
721 while(wind_it != windows.end() && (wind_it->chr < i->chr || (wind_it->chr == i->chr && wind_it->pos + wind_it->length < i->pos) )) {
725 //stop if we have run out of features.
726 if(wind_it == windows.end()) { break; }
728 if(i->pos + i->length() > wind_it->pos && i->pos < (wind_it->pos + wind_it->length)) {
729 wind_it->add_read(*i);
734 void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) {
737 string chrom = peaks[0].chr;
738 string chrom_filename = chrom_filenames[chrom];
739 ifstream chrom_file(chrom_filename.c_str());
740 chrom_file.getline(temp, 1024);
741 size_t offset = chrom_file.gcount();
742 for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
743 if(i->chr != chrom) {
745 chrom_filename = chrom_filenames[chrom];
746 chrom_file.close(); chrom_file.open(chrom_filename.c_str());
747 chrom_file.getline(temp, 1024);
748 offset = chrom_file.gcount();
750 unsigned int begin = i->pos - 1;
751 unsigned int end = i->pos+i->length;
753 unsigned int begin_pos = offset + (int)begin/50 + begin;
754 unsigned int end_pos = offset + (int)end/50 + end;
756 unsigned int read_len = end_pos - begin_pos;
757 char buffer[read_len+1];
758 chrom_file.seekg(begin_pos, ios_base::beg);
759 chrom_file.read(buffer, read_len);
760 buffer[read_len] = '\0';
761 i->set_sequence(buffer);
767 int main(int argc, char** argv) {
768 if(argc != 4) { cerr << "Usage: " << argv[0] << " read_file window_file chromosome_file\n"; exit(1); }
770 char read_filename[1024]; strcpy(read_filename,argv[1]);
771 char window_filename[1024]; strcpy(window_filename,argv[2]);
772 char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
774 Windows windows; read_window_file(window_filename, windows);
775 ChromList reference_seq(chromosome_filename);
777 retrieveSequenceData(reference_seq, windows);
779 cerr << "Established reference sequences\n";
781 Reads reads; read_align_file(read_filename, reads);
783 count_read_in_features(windows, reads);
785 for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) {
786 //w->print_consensus(cout);
787 //w->print_logo(cout);
789 w->print_fasta(cout);
793 void strrevcomp(string& output, const string& input)
798 for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; }
800 for (unsigned int p1 = 0; p1 < output.length(); ++p1) {
801 if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; }
802 else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; }
803 else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; }
804 else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; }