3 * This program takes a set of snps in a custom tab format, and a set of short mapped reads, and evaluates
4 * the sequencing overlap over those snps. Additionally, a miaxture model is fit and used to classify the
5 * snps as homozygous or heterozygous.
7 * In the final report, the output is:
8 * <snp id> <chromosome> <position> <reference base> <a count> <c count> <g count> <t count> <total count> <snp call>
9 * where snp call is one of:
10 * -1: no call was made (not enough examples to make a call)
11 * 0: the snp is homozygous
12 * 1: the snp is heterozygous
15 #include <sys/types.h>
25 #include <gsl/gsl_statistics.h>
27 #include "../SRLib/chrom_list.h"
28 #include "../SRLib/util.h"
29 #include "../SRLib/loci.h"
30 #include "../SRLib/read.h"
31 #include "../SRLib/nuc.h"
34 #define PI 3.14159265358979323846
44 class Window : public Loci {
46 //optional name for the window
49 //the consensus sequence
56 Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) {
58 this->length = length;
59 if(length > 10000) { cerr << "ERROR: window of size " << length << endl; exit(1); }
61 this->sequence = string(length,'\0');
62 seq.resize(this->sequence.length());
70 Window(const Window& r) : Loci(r) {
72 this->length = r.length;
74 this->sequence = r.sequence;
75 this->reads = r.reads;
78 Window& operator=(const Window& r) {
79 if (this == &r) return *this;
82 this->length = r.length;
83 this->sequence = r.sequence;
85 this->reads = r.reads;
89 void set_sequence(string s) {
93 while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); }
94 seq.clear(); seq.resize(sequence.length());
97 string get_sequence() {
98 return this->sequence;
101 void add_read(const Read& r) {
102 if(this->chr != r.chr) return;
106 int overlap_len = r.length();
108 //if the read begins before the window starts:
109 // ---(====[====)------]------
110 if(r.pos < this->pos) {
111 r_offset = this->pos - r.pos;
114 w_offset = r.pos - this->pos;
118 //if the read ends after the window ends
119 // ----[----(====]======)-----
120 if(r.pos + r.length() >= this->pos + this->length) {
121 overlap_len = (this->pos + this->length) - r.pos;
123 overlap_len = r.length();
126 for(; overlap_len > 0; overlap_len--) {
127 seq[w_offset++].add_nuc(r[r_offset++]);
133 void print_consensus(ostream& o) {
134 unsigned int line_len = 100;
135 o << ">Consensus for: " << name << " (" << this->chr << ":" << this->pos << "-" << this->pos+this->length << ")" << endl;
137 for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
138 unsigned int max_len = sequence.length() - offset;
139 unsigned int len = (line_len > max_len)?max_len:line_len;
140 o << sequence.substr(offset,len) << endl;
141 for(unsigned int i = offset; i < offset+len; i++) {
142 char ref = toupper(sequence[i]);
143 char con = toupper(seq[i].consensus());
146 } else if(con == ref) {
153 for(unsigned int i = offset; i < offset+len; i++) { o << seq[i].consensus(); }
158 void print_fasta(ostream& o) {
159 unsigned int line_len = 100;
161 if(sequence.length() != seq.size()) {
162 cerr << "Size mismatch: sequence is " << sequence.length() << " but data has " << seq.size() << endl;
166 vector<string> variants;
168 for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) {
169 unsigned int max_len = sequence.length() - offset;
170 unsigned int len = (line_len > max_len)?max_len:line_len;
171 for(unsigned int i = offset; i < offset+len; i++) {
172 if(i >= seq.size()) {
173 cerr << "Error: offset of " << i << " exceeds read data: " << seq.size() << " in string of " << sequence.length() << endl;
176 char con = toupper(seq[i].consensus());
177 // weak consensus if lowercase.
178 bool weak_con = seq[i].consensus() != con;
179 if(con == ' ' || weak_con || toupper(con) == toupper(sequence[i])) {
180 output.append(1,sequence[i]);
182 output.append(1,con);
184 sprintf(buff,"%d:%c>%c",i,sequence[i],con);
186 variants.push_back(var);
191 o << ">" << this->chr << ":" << this->pos << "-" << this->pos+this->length << "|";
192 for(vector<string>::iterator i = variants.begin(); i != variants.end(); ++i) {
194 if(i+1 != variants.end()) o << "|";
196 o << endl << output << endl;
199 void print_RE(ostream& o) {
200 for(unsigned int i = 0; i < sequence.length(); i++) {
201 char ref = toupper(sequence[i]);
202 char con = toupper(seq[i].consensus());
203 if(con != ' ' && con != ref) {
204 o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i].A() << "," << seq[i].C() << "," << seq[i].G() << "," << seq[i].T() << "]" << endl;
209 void print_logo(ostream& o) {
210 unsigned int max = 0;
211 for(unsigned int i = 0; i < sequence.length(); i++) {
212 if(seq[i].N() > max) { max = seq[i].N(); }
215 for(unsigned int i = 0; i < max; i++) {
216 for(unsigned int j = 0; j < sequence.length(); j++) {
217 o << seq[j].nth_nuc(i);
224 typedef vector<Window> Windows;
226 class SNP : public Loci {
231 char consensus[4]; // represent the consensus sequence in order. Most often, only the first 1 or 2 will matter.
239 SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) {
247 this->reference_base = reference_base;
250 SNP(const SNP& h) : Loci(h) {
252 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
253 this->reference_base = h.reference_base;
256 SNP& operator=(const SNP& h) {
260 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
261 this->reference_base = h.reference_base;
265 void eval_consensus() {
267 if(A >= C & A >= G & A >= T) { consensus[0] = 'A';
268 if(C >= G & C >= T) { consensus[1] = 'C';
269 if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
270 else { consensus[2] = 'T'; consensus[3] = 'G'; }
271 } else if(G >= C & G >= T) { consensus[1] = 'G';
272 if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
273 else { consensus[2] = 'T'; consensus[3] = 'C'; }
274 } else { consensus[1] = 'T';
275 if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
276 else { consensus[2] = 'G'; consensus[3] = 'C'; }
281 } else if(C >= A & C >= G & C >= T) { consensus[0] = 'C';
282 if(A >= G & A >= T) { consensus[1] = 'A';
283 if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; }
284 else { consensus[2] = 'T'; consensus[3] = 'G'; }
285 } else if(G >= A & G >= T) { consensus[1] = 'G';
286 if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
287 else { consensus[2] = 'T'; consensus[3] = 'A'; }
288 } else { consensus[1] = 'T';
289 if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
290 else { consensus[2] = 'G'; consensus[3] = 'A'; }
292 } else if(G >= A & G >= C & G >= T) { consensus[0] = 'G';
293 if(A >= C & A >= T) { consensus[1] = 'A';
294 if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; }
295 else { consensus[2] = 'T'; consensus[3] = 'C'; }
296 } else if(C >= A & C >= T) { consensus[1] = 'C';
297 if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; }
298 else { consensus[2] = 'T'; consensus[3] = 'A'; }
299 } else { consensus[1] = 'T';
300 if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
301 else { consensus[2] = 'C'; consensus[3] = 'A'; }
303 } else { consensus[0] = 'T';
304 if(A >= C & A >= G) { consensus[1] = 'A';
305 if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; }
306 else { consensus[2] = 'G'; consensus[3] = 'C'; }
307 } else if(C >= A & C >= G) { consensus[1] = 'C';
308 if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; }
309 else { consensus[2] = 'G'; consensus[3] = 'A'; }
310 } else { consensus[1] = 'G';
311 if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; }
312 else { consensus[2] = 'C'; consensus[3] = 'A'; }
317 void add_read(char nuc) {
337 void clean(unsigned int threshold) {
338 if(A <= threshold) { A = 0; }
339 if(C <= threshold) { C = 0; }
340 if(G <= threshold) { G = 0; }
341 if(T <= threshold) { T = 0; }
342 total = A + C + G + T;
346 double RE(unsigned int th = 2) {
347 if(total == 0) { return 0.0; }
349 double pA = (double)( ((A<th)?A:0)+1e-10)/(double)total;
350 double pC = (double)( ((C<th)?C:0)+1e-10)/(double)total;
351 double pG = (double)( ((G<th)?G:0)+1e-10)/(double)total;
352 double pT = (double)( ((T<th)?T:0)+1e-10)/(double)total;
354 //assume equal distribution of A,C,G,T
356 return pA*log(pA/0.25)/l2 + pC*log(pC/0.25)/l2 + pG*log(pG/0.25)/l2 + pT*log(pT/0.25)/l2;
360 typedef vector<SNP> SNPs;
362 //Class to calulate mixture model. Very not general right now, but should be easy enough to make more general
364 class GaussianMixture {
378 GaussianMixture(SNPs& snps, double delta = 1e-10) {
381 //model 1: heterozygous
385 //model 2: homozygous
392 bool classify(double x) {
393 return(norm_prob(x,u1,s1) >= norm_prob(x,u2,s2)) ;
396 // Use EM to fit gaussian mixture model to discern heterozygous from homozygous snps
397 void fit(SNPs& snps, unsigned int count_th) {
398 //initialize relative entropy and probabilities
401 for(unsigned int i = 0; i < snps.size(); ++i) {
402 if(snps[i].total >= 8) {
403 RE.push_back(snps[i].RE(count_th));
410 cerr << this->N << " snps checked\n";
412 //calculate initial expectation
414 for(unsigned int i = 0; i < N; ++i) {
415 Q += pr[i] * (log( this->p ) - log(sqrt(2.0*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
416 Q += (1.0-pr[i]) * (log(1-this->p) - log(sqrt(2.0*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
419 cerr << "Q: " << this->Q << endl;
422 //expectation maximization to iteratively update pi's and parameters until Q settles down.
424 cerr << "loop Q: " << Q << endl;
427 double p_sum = 0.0, q_sum = 0.0, u1_sum = 0.0, u2_sum = 0.0;
428 for(unsigned int i = 0; i < N; ++i) {
429 pr[i] = pr[i]*norm_prob(RE[i],this->u1,this->s1) /
430 (pr[i]*norm_prob(RE[i],this->u1,this->s1) + (1.0 - pr[i])*(norm_prob(RE[i],this->u2,this->s2)));
433 q_sum += (1.0 - pr[i]);
435 u1_sum += pr[i]*RE[i];
436 u2_sum += (1.0 - pr[i])*RE[i];
438 Q_new += pr[i] * (log( this->p ) - log(sqrt(2*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1));
439 Q_new += (1.0-pr[i])* (log(1-this->p) - log(sqrt(2*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2));
442 //update variables of the distributions (interwoven with pi loop to save cpu)
443 this->p = p_sum / this->N;
444 this->u1 = u1_sum / p_sum;
445 this->u2 = u2_sum / q_sum;
447 double s1_sum = 0.0, s2_sum = 0.0;
448 for(unsigned int i = 0; i < N; ++i) {
449 s1_sum += pr[i] * (RE[i] - this->u1)*(RE[i] - this->u1);
450 s2_sum += (1.0-pr[i]) * (RE[i] - this->u2)*(RE[i] - this->u2);
453 this->s1 = sqrt(s1_sum/p_sum);
454 this->s2 = sqrt(s2_sum/q_sum);
456 if(fabs(this->Q - Q_new) < 1e-5) { break; }
459 cerr << "Q: " << Q << endl;
463 cout << "Q: " << Q << " p: " << p << " norm(" << u1 << "," << s1 << ");norm(" << u2 << "," << s2 << ")" << endl;
468 ostream &operator<<( ostream &out, const SNP &h ) {
469 out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total;
475 void read_snps(const char* filename, SNPs& snps) {
478 ifstream feat(filename);
480 while(feat.peek() != EOF) {
482 feat.getline(line,1024,'\n');
484 string line_str(line);
485 vector<string> fields;
486 split(line_str, delim, fields);
487 if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
489 string name = fields[0];
490 string chr = fields[1];
491 unsigned int pos = atoi(fields[2].c_str());
492 char base = (fields[3])[0];
494 SNP snp(name,chr,pos,base);
498 //sort the features so we can run through it once
499 std::stable_sort(snps.begin(),snps.end());
502 cerr << "Found AND sorted " << snps.size() << " snps." << endl;
505 void read_align_file(char* filename, Reads& features) {
507 string location_delim(":");
508 char strand_str[2]; strand_str[1] = '\0';
509 ifstream seqs(filename);
511 while(seqs.peek() != EOF) {
513 seqs.getline(line,2048,'\n');
515 string line_str(line);
516 vector<string> fields;
517 split(line_str, delim, fields);
518 if(fields.size() != 7) { continue; }
520 vector<string> location; split(fields[3], location_delim, location);
521 string chr = location[0];
522 if(chr == "newcontam") { continue; }
523 if(chr == "NA") { continue; }
525 int pos = atoi(location[1].c_str());
526 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
529 if(strand == 0) { seq = fields[0]; } else { revcomp(seq,fields[0]); }
530 Read read(chr,pos,0,seq);
531 features.push_back(read);
535 //sort the data so we can run through it once
536 std::sort(features.begin(),features.end());
537 cerr << "Found and sorted " << features.size() << " reads." << endl;
540 void read_window_file(const char* filename, Windows& ws) {
543 ifstream win_file(filename);
546 while(win_file.peek() != EOF) {
548 win_file.getline(line,1024,'\n');
550 string line_str(line);
551 vector<string> fields;
552 split(line_str, delim, fields);
553 if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
555 string name = fields[0];
556 string chr = fields[1];
557 if(chr == "NA") { continue; }
558 if(chr == "contam") { continue; }
559 int start = atoi(fields[2].c_str());
560 int stop = atoi(fields[3].c_str());
562 Window w(name,chr,start,stop-start+1);
566 //sort the features so we can run through it once
567 std::stable_sort(ws.begin(),ws.end());
570 cerr << "Found and sorted " << ws.size() << " windows." << endl;
573 void count_read_in_features(Windows& windows, Reads& data) {
574 Windows::iterator wind_it = windows.begin();
576 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
577 //skip to first feature after read
578 string start_chr = wind_it->chr;
579 while(wind_it != windows.end() && (wind_it->chr < i->chr || (wind_it->chr == i->chr && wind_it->pos + wind_it->length < i->pos) )) {
583 //stop if we have run out of features.
584 if(wind_it == windows.end()) { break; }
586 if(i->pos + i->length() > wind_it->pos && i->pos < (wind_it->pos + wind_it->length)) {
587 wind_it->add_read(*i);
592 void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) {
595 string chrom = peaks[0].chr;
596 string chrom_filename = chrom_filenames[chrom];
597 ifstream chrom_file(chrom_filename.c_str());
598 chrom_file.getline(temp, 1024);
599 size_t offset = chrom_file.gcount();
600 for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
601 if(i->chr != chrom) {
603 cout << "XXX: " << (*(chrom_filenames.find(chrom))).first << endl;
604 if(chrom_filenames.find(chrom) == chrom_filenames.end()) {
605 cerr << "Chrom: " << chrom << " not found\n";
607 chrom_filename = chrom_filenames[chrom];
608 chrom_file.close(); chrom_file.open(chrom_filename.c_str());
609 chrom_file.getline(temp, 1024);
610 offset = chrom_file.gcount();
612 unsigned int begin = i->pos - 1;
613 unsigned int end = i->pos+i->length;
615 unsigned int begin_pos = offset + (int)begin/50 + begin;
616 unsigned int end_pos = offset + (int)end/50 + end;
618 unsigned int read_len = end_pos - begin_pos;
619 char buffer[read_len+1];
620 chrom_file.seekg(begin_pos, ios_base::beg);
621 chrom_file.read(buffer, read_len);
622 buffer[read_len] = '\0';
623 i->set_sequence(buffer);
629 int main(int argc, char** argv) {
630 if(argc != 4) { cerr << "Usage: " << argv[0] << " read_file window_file chromosome_file\n"; exit(1); }
632 char read_filename[1024]; strcpy(read_filename,argv[1]);
633 char window_filename[1024]; strcpy(window_filename,argv[2]);
634 char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
636 Windows windows; read_window_file(window_filename, windows);
638 if(windows.size() == 0) { cout << "No windows loaded." << endl; exit(0); }
639 ChromList reference_seq(chromosome_filename);
641 retrieveSequenceData(reference_seq, windows);
643 cerr << "Established reference sequences\n";
645 Reads reads; read_align_file(read_filename, reads);
647 if(reads.size() == 0) { cout << "No reads loaded." << endl; exit(0); }
649 count_read_in_features(windows, reads);
651 for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) {
652 //w->print_consensus(cout);
653 //w->print_logo(cout);
655 w->print_fasta(cout);