From a68e996a16a0030f1d3a3f3decc8d7c98d5af433 Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Tue, 23 Dec 2008 18:35:37 +0000 Subject: [PATCH] Added code to identify snps and update the gneome accordingly. This code has an insidious bug in which occasionally the first character of the output consensus sequences is modified, or the consensus sequence does not properly end, and has a few nonsense characters. So far, the bug has eluded my memory debugger. --- htswanalysis/src/GetReadsInSnps/Makefile | 10 +- .../src/GetReadsInSnps/chrom_list.cpp | 22 + htswanalysis/src/GetReadsInSnps/chrom_list.h | 27 + .../src/GetReadsInSnps/getReadsInSnps.cpp | 259 +++++- .../src/GetReadsInSnps/getRegionConsensus.cpp | 798 ++++++++++++++++++ .../src/GetReadsInSnps/patch_genome.cpp | 71 ++ htswanalysis/src/GetReadsInSnps/util.cpp | 21 + htswanalysis/src/GetReadsInSnps/util.h | 11 + 8 files changed, 1205 insertions(+), 14 deletions(-) create mode 100644 htswanalysis/src/GetReadsInSnps/chrom_list.cpp create mode 100644 htswanalysis/src/GetReadsInSnps/chrom_list.h create mode 100755 htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp create mode 100644 htswanalysis/src/GetReadsInSnps/patch_genome.cpp create mode 100644 htswanalysis/src/GetReadsInSnps/util.cpp create mode 100644 htswanalysis/src/GetReadsInSnps/util.h diff --git a/htswanalysis/src/GetReadsInSnps/Makefile b/htswanalysis/src/GetReadsInSnps/Makefile index 236a59d..f8c3488 100644 --- a/htswanalysis/src/GetReadsInSnps/Makefile +++ b/htswanalysis/src/GetReadsInSnps/Makefile @@ -1,4 +1,10 @@ -CPPFLAGS=-g -Wall -O3 -I/opt/local/include +CPPFLAGS=-g -Wall -I/opt/local/include +#LDFLAGS=-lgsl -lgslcblas -lm -lduma -L/opt/local/lib +#CPPFLAGS=-g -Wall -O3 -I/opt/local/include LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib -all: getReadsInSnps +all: patch_genome getReadsInSnps getRegionConsensus + +patch_genome: patch_genome.cpp chrom_list.cpp util.cpp + +getRegionConsensus: getRegionConsensus.cpp chrom_list.cpp util.cpp diff --git a/htswanalysis/src/GetReadsInSnps/chrom_list.cpp b/htswanalysis/src/GetReadsInSnps/chrom_list.cpp new file mode 100644 index 0000000..a3fd8a0 --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/chrom_list.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include + +#include "chrom_list.h" +#include "util.h" + +using namespace std; + +ChromList::ChromList(char* filename) { + char buffer[1024]; string delim("\t"); + ifstream chrom_file(filename); + while(chrom_file.getline(buffer, 1024)) { + string temp(buffer); + vector words; + split(temp, delim,words); + if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); } + (*this)[ words[0] ] = words[1]; + } + chrom_file.close(); +} diff --git a/htswanalysis/src/GetReadsInSnps/chrom_list.h b/htswanalysis/src/GetReadsInSnps/chrom_list.h new file mode 100644 index 0000000..6e5446b --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/chrom_list.h @@ -0,0 +1,27 @@ +#include + +#ifndef CHROMLIST_H +#define CHROMLIST_H + +using namespace std; +using namespace __gnu_cxx; + +struct eq_str { + bool operator() (const string& a, const string& b) const { + return(strcmp(a.c_str(), b.c_str()) == 0); + } +}; + +struct hash_str { + size_t operator() (const string& a) const { + hash h; + return(h(a.c_str())); + } +}; + +class ChromList : public hash_map { + public: + ChromList(char* filename); +}; + +#endif diff --git a/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp b/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp index 34d7d83..5fe1fbc 100755 --- a/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp +++ b/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp @@ -1,6 +1,18 @@ +/* + * getReadsInSnps: + * This program takes a set of snps in a custom tab format, and a set of short mapped reads, and evaluates + * the sequencing overlap over those snps. Additionally, a miaxture model is fit and used to classify the + * snps as homozygous or heterozygous. + * + * In the final report, the output is: + * + * where snp call is one of: + * -1: no call was made (not enough examples to make a call) + * 0: the snp is homozygous + * 1: the snp is heterozygous + * + */ #include -#include -#include #include #include #include @@ -12,12 +24,17 @@ #include #define WINDOW 25 +#define PI 3.14159265358979323846 + +//#define DEBUG using namespace std; void split (const string& text, const string& separators, vector& words); char *strrevcomp(const string& input); +double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); } + class Loci { public: string chr; @@ -31,6 +48,7 @@ class Loci { }; + class Read : public Loci { public: string seq; @@ -40,11 +58,14 @@ class Read : public Loci { Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;} }; +typedef vector Reads; + class SNP : public Loci { public: string name; char reference_base; + char consensus[4]; // represent the consensus sequence in order. Most often, only the first 1 or 2 will matter. unsigned int A; unsigned int C; unsigned int G; @@ -54,7 +75,12 @@ class SNP : public Loci { SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) { this->name = name; - this->A = 0; this->C = 0; this->G = 0; this->T = 0; this->total = 0; + this->A = 0; + this->C = 0; + this->G = 0; + this->T = 0; + this->N = 0; + this->reference_base = reference_base; } @@ -73,6 +99,58 @@ class SNP : public Loci { return *this; } + void eval_consensus() { + // if A is the max + if(A >= C & A >= G & A >= T) { consensus[0] = 'A'; + if(C >= G & C >= T) { consensus[1] = 'C'; + if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'G'; } + } else if(G >= C & G >= T) { consensus[1] = 'G'; + if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'C'; } + } else { consensus[1] = 'T'; + if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'C'; } + } + + + // if C is the max + } else if(C >= A & C >= G & C >= T) { consensus[0] = 'C'; + if(A >= G & A >= T) { consensus[1] = 'A'; + if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'G'; } + } else if(G >= A & G >= T) { consensus[1] = 'G'; + if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'A'; } + } else { consensus[1] = 'T'; + if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'A'; } + } + } else if(G >= A & G >= C & G >= T) { consensus[0] = 'G'; + if(A >= C & A >= T) { consensus[1] = 'A'; + if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'C'; } + } else if(C >= A & C >= T) { consensus[1] = 'C'; + if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'A'; } + } else { consensus[1] = 'T'; + if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; } + else { consensus[2] = 'C'; consensus[3] = 'A'; } + } + } else { consensus[0] = 'T'; + if(A >= C & A >= G) { consensus[1] = 'A'; + if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'C'; } + } else if(C >= A & C >= G) { consensus[1] = 'C'; + if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'A'; } + } else { consensus[1] = 'G'; + if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; } + else { consensus[2] = 'C'; consensus[3] = 'A'; } + } + } + } + void add_read(char nuc) { switch(nuc) { case 'a': @@ -92,15 +170,144 @@ class SNP : public Loci { } total++; } + + void clean(unsigned int threshold) { + if(A <= threshold) { A = 0; } + if(C <= threshold) { C = 0; } + if(G <= threshold) { G = 0; } + if(T <= threshold) { T = 0; } + total = A + C + G + T; + eval_consensus(); + } + + double RE(unsigned int th = 2) { + if(total == 0) { return 0.0; } + + double pA = (double)( ((A SNPs; + +//Class to calulate mixture model. Very not general right now, but should be easy enough to make more general +//if the need arises +class GaussianMixture { + +public: + double p; + double u1; + double s1; + double u2; + double s2; + double Q; + + unsigned int N; + + double delta; + + GaussianMixture(SNPs& snps, double delta = 1e-10) { + //initialize model + this->p = 0.5; + //model 1: heterozygous + this->u1 = 1.0; + this->s1 = 0.5; + + //model 2: homozygous + this->u2 = 2.0; + this->s2 = 0.5; + + this->delta = delta; + } + + bool classify(double x) { + return(norm_prob(x,u1,s1) >= norm_prob(x,u2,s2)) ; + } + + // Use EM to fit gaussian mixture model to discern heterozygous from homozygous snps + void fit(SNPs& snps, unsigned int count_th) { + //initialize relative entropy and probabilities + vector RE; + vector pr; + for(unsigned int i = 0; i < snps.size(); ++i) { + if(snps[i].total >= 8) { + RE.push_back(snps[i].RE(count_th)); + pr.push_back(0.5); + } + } + + this->N = RE.size(); + + cerr << this->N << " snps checked\n"; + + //calculate initial expectation + this->Q = 0.0; + for(unsigned int i = 0; i < N; ++i) { + Q += pr[i] * (log( this->p ) - log(sqrt(2.0*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1)); + Q += (1.0-pr[i]) * (log(1-this->p) - log(sqrt(2.0*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2)); + } + + cerr << "Q: " << this->Q << endl; + + double Q_new = 0; + //expectation maximization to iteratively update pi's and parameters until Q settles down. + while(1) { + cerr << "loop Q: " << Q << endl; + Q_new = 0.0; + + double p_sum = 0.0, q_sum = 0.0, u1_sum = 0.0, u2_sum = 0.0; + for(unsigned int i = 0; i < N; ++i) { + pr[i] = pr[i]*norm_prob(RE[i],this->u1,this->s1) / + (pr[i]*norm_prob(RE[i],this->u1,this->s1) + (1.0 - pr[i])*(norm_prob(RE[i],this->u2,this->s2))); + + p_sum += pr[i]; + q_sum += (1.0 - pr[i]); + + u1_sum += pr[i]*RE[i]; + u2_sum += (1.0 - pr[i])*RE[i]; + + Q_new += pr[i] * (log( this->p ) - log(sqrt(2*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1)); + Q_new += (1.0-pr[i])* (log(1-this->p) - log(sqrt(2*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2)); + } + + //update variables of the distributions (interwoven with pi loop to save cpu) + this->p = p_sum / this->N; + this->u1 = u1_sum / p_sum; + this->u2 = u2_sum / q_sum; + + double s1_sum = 0.0, s2_sum = 0.0; + for(unsigned int i = 0; i < N; ++i) { + s1_sum += pr[i] * (RE[i] - this->u1)*(RE[i] - this->u1); + s2_sum += (1.0-pr[i]) * (RE[i] - this->u2)*(RE[i] - this->u2); + } + + this->s1 = sqrt(s1_sum/p_sum); + this->s2 = sqrt(s2_sum/q_sum); + + if(fabs(this->Q - Q_new) < 1e-5) { break; } + this->Q = Q_new; + } + cerr << "Q: " << Q << endl; + } + + void print_model() { + cout << "Q: " << Q << " p: " << p << " norm(" << u1 << "," << s1 << ");norm(" << u2 << "," << s2 << ")" << endl; + } }; + ostream &operator<<( ostream &out, const SNP &h ) { out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total; + return out; } -typedef vector Reads; -typedef vector SNPs; void read_snps(const char* filename, SNPs& snps) { string delim("\t"); @@ -132,6 +339,8 @@ void read_snps(const char* filename, SNPs& snps) { cerr << "Found and sorted " << snps.size() << " snps." << endl; } + + void read_align_file(char* filename, Reads& features) { string delim(" \n"); string location_delim(":"); @@ -155,11 +364,7 @@ void read_align_file(char* filename, Reads& features) { bool strand = ((fields[4].c_str())[0] == 'F')?0:1; string seq; - if(strand == 0) { - seq = fields[0]; - } else { - seq = strrevcomp(fields[0]); - } + if(strand == 0) { seq = fields[0]; } else { seq = strrevcomp(fields[0]); } Read read(chr,pos,seq); features.push_back(read); } @@ -193,19 +398,49 @@ void count_read_at_snps(SNPs& snps, Reads& data) { } int main(int argc, char** argv) { - if(argc != 3) { cerr << "Usage: " << argv[0] << " snp_file read_file\n"; exit(1); } + if(argc != 4) { cerr << "Usage: " << argv[0] << " snp_file read_file non_reference_output_file\n"; exit(1); } char snp_filename[1024]; strcpy(snp_filename,argv[1]); char read_filename[1024]; strcpy(read_filename,argv[2]); + char nonref_filename[1024]; strcpy(nonref_filename,argv[3]); SNPs snps; read_snps(snp_filename, snps); Reads reads; read_align_file(read_filename, reads); count_read_at_snps(snps, reads); + //fix a guassian mixture model to the snps to classify + GaussianMixture g(snps); + g.fit(snps, 2); + +#ifdef DEBUG + g.print_model(); +#endif + + ofstream nonref(nonref_filename); + int group; for(SNPs::iterator i = snps.begin(); i != snps.end(); ++i) { - cout << (*i) << endl; + group = -1; + if(i->total >= 10) { i->eval_consensus(); group = g.classify(i->RE()); } + cout << (*i) << "\t" << group << "\t"; + if(group == 0) cout << i->consensus[0]; + else if(group == 1) cout << i->consensus[0] << "," << i->consensus[1]; + + if( ( group == 0 && i->consensus[0] != toupper(i->reference_base) ) || group == 1) { + //detected difference from consensus sequence + nonref <chr << "\t" << i->pos << "\t"; + if(group == 0) { nonref << i->consensus[0] << endl; } + if(group == 1) { + if(i->consensus[0] != toupper(i->reference_base)) { + nonref << i->consensus[0] << endl; + } else { + nonref << i->consensus[1] << endl; + } + } + } + cout << endl; } + nonref.close(); } void split (const string& text, const string& separators, vector& words) { diff --git a/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp b/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp new file mode 100755 index 0000000..1f0ae1b --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp @@ -0,0 +1,798 @@ +/* + * getReadsInSnps: + * This program takes a set of snps in a custom tab format, and a set of short mapped reads, and evaluates + * the sequencing overlap over those snps. Additionally, a miaxture model is fit and used to classify the + * snps as homozygous or heterozygous. + * + * In the final report, the output is: + * + * where snp call is one of: + * -1: no call was made (not enough examples to make a call) + * 0: the snp is homozygous + * 1: the snp is heterozygous + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "chrom_list.h" +#include "util.h" + +#define WINDOW 25 +#define PI 3.14159265358979323846 + +#define DEBUG + +#ifdef DEBUG +//#include "duma.h" +#endif + +using namespace std; + +void strrevcomp(string& output, const string& input); + +double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); } + +class Loci { + public: + string chr; + unsigned int pos; + + Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; } + Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; } + Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; } + + bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } } + bool operator<=(const Loci& a) const { if(this->chr == a.chr) { return this->pos <= a.pos; } else { return this->chr < a.chr; } } + + bool operator>=(const Loci& a) const { if(this->chr == a.chr) { return this->pos >= a.pos; } else { return this->chr > a.chr; } } + bool operator>(const Loci& a) const { if(this->chr == a.chr) { return this->pos > a.pos; } else { return this->chr > a.chr; } } + + int operator-(const Loci& a) const { if(this->chr == a.chr) { return this->pos - a.pos; } else { return INT_MAX; } } + +}; + + +class Read : public Loci { + public: + string seq; + + unsigned int length() const { return seq.length(); } + + Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; } + Read(const Read& r) : Loci(r) { this->seq = r.seq; } + Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;} + + char operator[](size_t off) const { + if(off < seq.length()) { return seq[off]; } else { return -1; } + } +}; + +typedef vector Reads; + +class Nuc { + protected: + unsigned int n[4]; + + //background nucleotide probabilities + // + //can change this to a background model class later if needed + static double qA; + static double qC; + static double qG; + static double qT; + + // pseudocount to avoid divide-by-zero errors + static double pseudocount; + + public: + + Nuc() { + n[0] = 0; + n[1] = 0; + n[2] = 0; + n[3] = 0; + } + + Nuc(const Nuc& n) { + this->n[0] = n.at(0); + this->n[1] = n.at(1); + this->n[2] = n.at(2); + this->n[3] = n.at(3); + } + + Nuc& operator=(const Nuc& n) { + if (this != &n) { + this->n[0] = n.at(0); + this->n[1] = n.at(1); + this->n[2] = n.at(2); + this->n[3] = n.at(3); + } + return *this; + } + + void add_nuc(char b) { + switch(b) { + case 'a': case 'A': n[0]++; break; + case 'c': case 'C': n[1]++; break; + case 'g': case 'G': n[2]++; break; + case 't': case 'T': n[3]++; break; + }; + } + + char nth_nuc(unsigned int i) { + if(i >= size()) { return 'N'; } + else if(i < n[0]) { return 'A'; } + else if(i < n[0] + n[1]) { return 'C'; } + else if(i < n[0] + n[1] + n[2]) { return 'G'; } + else { return 'T'; } + } + + unsigned int size() { return n[0] + n[1] + n[2] + n[3]; } + + unsigned int& operator[](size_t b) { return n[b]; } + unsigned int at(size_t b) const { return n[b]; } + + double RE() { + + /* + double total = n[0] + n[1] + n[2] + n[3] + 4*Nuc::pseudocount; + double pA = (Nuc::pseudocount + n[0]) / total; + double pC = (Nuc::pseudocount + n[1]) / total; + double pG = (Nuc::pseudocount + n[2]) / total; + double pT = (Nuc::pseudocount + n[3]) / total; + + return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT); + */ + + unsigned int max = 0; unsigned int max_idx = 0; + for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } + unsigned int max2 = 0; unsigned int max_idx2 = 0; + for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } } + + if(max_idx == max_idx2) { max_idx2++; } + + double total = n[max_idx] + n[max_idx2]; + double p1 = (Nuc::pseudocount + n[max_idx]) / total; + double p2 = (Nuc::pseudocount + n[max_idx2]) / total; + + return p1*log2(p1/Nuc::qA) + p2*log2(p2/Nuc::qC); + } + + char consensus() { + unsigned int max = 0; unsigned int max_idx = 0; + for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } + + unsigned int max2 = 0; unsigned int max_idx2 = 0; + for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } } + + //For now pick arbitrary zygosity thresholds. Later, update to use mixture model. + char c = '\0'; + if(RE() >= 1.25) { + //homozygous + switch(max_idx) { + case 0: c = 'A'; break; + case 1: c = 'C'; break; + case 2: c = 'G'; break; + case 3: c = 'T'; break; + } + } else { + switch(max_idx | max_idx2) { + case 1: c = 'M'; break; //A,C + case 2: c = 'R'; break; //A,G + case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G + case 4: c = 'Y'; break; //C,T + case 5: c = 'K'; break; //G,T + } + } + + unsigned int N = size(); + if(N == 0) { return ' '; } else if(N < 10) { return tolower(c); } else { return c; } + } +}; + +double Nuc::pseudocount = 1e-10; +double Nuc::qA = 0.25; +double Nuc::qC = 0.25; +double Nuc::qG = 0.25; +double Nuc::qT = 0.25; + +class Window : public Loci { + public: + //optional name for the window + string name; + + //the consensus sequence + string sequence; + unsigned int length; + vector seq; + + unsigned int reads; + + Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) { + this->name = name; + this->length = length; + this->sequence = ""; + seq.resize(length); + + this->reads = 0; + } + + ~Window() { + seq.clear(); + } + + Window(const Window& r) : Loci(r) { + this->name = r.name; + this->length = r.length; + this->seq = r.seq; + this->sequence = r.sequence; + this->reads = r.reads; + } + + Window& operator=(const Window& r) { + Loci::operator=(r); + this->name = r.name; + this->length = r.length; + this->sequence = r.sequence; + this->seq = r.seq; + this->reads = r.reads; + return *this; + } + + void set_sequence(string s) { + this->sequence = s; + unsigned int a; + //clear out endlines + while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); } + } + + string get_sequence() { + return this->sequence; + } + + void add_read(const Read& r) { + if(this->chr != r.chr) return; + int offset = r - (*this); + this->reads++; + for(unsigned int i = 0; i < r.length(); i++) { + int seq_idx = offset + i; + if(seq_idx < 0 || (seq_idx >= 0 && (unsigned)seq_idx > this->length) ) { continue; } + seq[offset + i].add_nuc(r[i]); + } + } + + void print_consensus(ostream& o) { + unsigned int line_len = 100; + o << ">Consensus for: " << name << " (" << this->chr << ":" << this->pos << "-" << this->pos+this->length << ")" << endl; + + for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) { + unsigned int max_len = sequence.length() - offset; + unsigned int len = (line_len > max_len)?max_len:line_len; + o << sequence.substr(offset,len) << endl; + for(unsigned int i = offset; i < offset+len; i++) { + char ref = toupper(sequence[i]); + char con = toupper(seq[i].consensus()); + if(con == ' ') { + o << ' '; + } else if(con == ref) { + o << '|'; + } else { + o << '*'; + } + } + o << endl; + for(unsigned int i = offset; i < offset+len; i++) { o << seq[i].consensus(); } + o << endl << endl; + } + } + + void print_fasta(ostream& o) { + unsigned int line_len = 100; + + string output = ""; + vector variants; + + for(unsigned int offset = 0; offset < sequence.length(); offset += line_len) { + unsigned int max_len = sequence.length() - offset; + unsigned int len = (line_len > max_len)?max_len:line_len; + for(unsigned int i = offset; i < offset+len; i++) { + char con = toupper(seq[i].consensus()); + // weak consensus if lowercase. + bool weak_con = seq[i].consensus() != con; + if(con == ' ' || weak_con || toupper(con) == toupper(sequence[i])) { + output += sequence[i]; + } else { + output += con; + char buff[128]; + sprintf(buff,"%d:%c>%c",i,sequence[i],con); + string var = buff; + variants.push_back(var); + } + } + //output += '\n'; + } + o << ">" << this->chr << ":" << this->pos << "-" << this->pos+this->length << "|"; + for(vector::iterator i = variants.begin(); i != variants.end(); ++i) { + o << (*i); + if(i+1 != variants.end()) o << "|"; + } + o << endl << output << endl; + } + + void print_RE(ostream& o) { + for(unsigned int i = 0; i < sequence.length(); i++) { + char ref = toupper(sequence[i]); + char con = toupper(seq[i].consensus()); + if(con != ' ' && con != ref) { + o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i][0] << "," << seq[i][1] << "," << seq[i][2] << "," << seq[i][3] << "]" << endl; + } + } + } + + void print_logo(ostream& o) { + unsigned int max = 0; + for(unsigned int i = 0; i < sequence.length(); i++) { + if(seq[i].size() > max) { max = seq[i].size(); } + } + + for(unsigned int i = 0; i < max; i++) { + for(unsigned int j = 0; j < sequence.length(); j++) { + o << seq[j].nth_nuc(i); + } + o << endl; + } + } +}; + +typedef vector Windows; + +class SNP : public Loci { + public: + + string name; + char reference_base; + char consensus[4]; // represent the consensus sequence in order. Most often, only the first 1 or 2 will matter. + unsigned int A; + unsigned int C; + unsigned int G; + unsigned int T; + unsigned int N; + unsigned int total; + + SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) { + this->name = name; + this->A = 0; + this->C = 0; + this->G = 0; + this->T = 0; + this->N = 0; + + this->reference_base = reference_base; + } + + SNP(const SNP& h) : Loci(h) { + this->name = h.name; + this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total; + this->reference_base = h.reference_base; + } + + SNP& operator=(const SNP& h) { + this->name = h.name; + this->chr = h.chr; + this->pos = h.pos; + this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total; + this->reference_base = h.reference_base; + return *this; + } + + void eval_consensus() { + // if A is the max + if(A >= C & A >= G & A >= T) { consensus[0] = 'A'; + if(C >= G & C >= T) { consensus[1] = 'C'; + if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'G'; } + } else if(G >= C & G >= T) { consensus[1] = 'G'; + if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'C'; } + } else { consensus[1] = 'T'; + if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'C'; } + } + + + // if C is the max + } else if(C >= A & C >= G & C >= T) { consensus[0] = 'C'; + if(A >= G & A >= T) { consensus[1] = 'A'; + if(G >= T) { consensus[2] = 'G'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'G'; } + } else if(G >= A & G >= T) { consensus[1] = 'G'; + if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'A'; } + } else { consensus[1] = 'T'; + if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'A'; } + } + } else if(G >= A & G >= C & G >= T) { consensus[0] = 'G'; + if(A >= C & A >= T) { consensus[1] = 'A'; + if(C >= T) { consensus[2] = 'C'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'C'; } + } else if(C >= A & C >= T) { consensus[1] = 'C'; + if(A >= T) { consensus[2] = 'A'; consensus[3] = 'T'; } + else { consensus[2] = 'T'; consensus[3] = 'A'; } + } else { consensus[1] = 'T'; + if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; } + else { consensus[2] = 'C'; consensus[3] = 'A'; } + } + } else { consensus[0] = 'T'; + if(A >= C & A >= G) { consensus[1] = 'A'; + if(C >= G) { consensus[2] = 'C'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'C'; } + } else if(C >= A & C >= G) { consensus[1] = 'C'; + if(A >= G) { consensus[2] = 'A'; consensus[3] = 'G'; } + else { consensus[2] = 'G'; consensus[3] = 'A'; } + } else { consensus[1] = 'G'; + if(A >= C) { consensus[2] = 'A'; consensus[3] = 'C'; } + else { consensus[2] = 'C'; consensus[3] = 'A'; } + } + } + } + + void add_read(char nuc) { + switch(nuc) { + case 'a': + case 'A': + A++; break; + case 'c': + case 'C': + C++; break; + case 'g': + case 'G': + G++; break; + case 't': + case 'T': + T++; break; + default: + N++; break; + } + total++; + } + + void clean(unsigned int threshold) { + if(A <= threshold) { A = 0; } + if(C <= threshold) { C = 0; } + if(G <= threshold) { G = 0; } + if(T <= threshold) { T = 0; } + total = A + C + G + T; + eval_consensus(); + } + + double RE(unsigned int th = 2) { + if(total == 0) { return 0.0; } + + double pA = (double)( ((A SNPs; + +//Class to calulate mixture model. Very not general right now, but should be easy enough to make more general +//if the need arises +class GaussianMixture { + +public: + double p; + double u1; + double s1; + double u2; + double s2; + double Q; + + unsigned int N; + + double delta; + + GaussianMixture(SNPs& snps, double delta = 1e-10) { + //initialize model + this->p = 0.5; + //model 1: heterozygous + this->u1 = 1.0; + this->s1 = 0.5; + + //model 2: homozygous + this->u2 = 2.0; + this->s2 = 0.5; + + this->delta = delta; + } + + bool classify(double x) { + return(norm_prob(x,u1,s1) >= norm_prob(x,u2,s2)) ; + } + + // Use EM to fit gaussian mixture model to discern heterozygous from homozygous snps + void fit(SNPs& snps, unsigned int count_th) { + //initialize relative entropy and probabilities + vector RE; + vector pr; + for(unsigned int i = 0; i < snps.size(); ++i) { + if(snps[i].total >= 8) { + RE.push_back(snps[i].RE(count_th)); + pr.push_back(0.5); + } + } + + this->N = RE.size(); + + cerr << this->N << " snps checked\n"; + + //calculate initial expectation + this->Q = 0.0; + for(unsigned int i = 0; i < N; ++i) { + Q += pr[i] * (log( this->p ) - log(sqrt(2.0*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1)); + Q += (1.0-pr[i]) * (log(1-this->p) - log(sqrt(2.0*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2)); + } + + cerr << "Q: " << this->Q << endl; + + double Q_new = 0; + //expectation maximization to iteratively update pi's and parameters until Q settles down. + while(1) { + cerr << "loop Q: " << Q << endl; + Q_new = 0.0; + + double p_sum = 0.0, q_sum = 0.0, u1_sum = 0.0, u2_sum = 0.0; + for(unsigned int i = 0; i < N; ++i) { + pr[i] = pr[i]*norm_prob(RE[i],this->u1,this->s1) / + (pr[i]*norm_prob(RE[i],this->u1,this->s1) + (1.0 - pr[i])*(norm_prob(RE[i],this->u2,this->s2))); + + p_sum += pr[i]; + q_sum += (1.0 - pr[i]); + + u1_sum += pr[i]*RE[i]; + u2_sum += (1.0 - pr[i])*RE[i]; + + Q_new += pr[i] * (log( this->p ) - log(sqrt(2*PI)) - log(this->s1) - (RE[i] - this->u1)*(RE[i] - this->u1)/(2.0*this->s1*this->s1)); + Q_new += (1.0-pr[i])* (log(1-this->p) - log(sqrt(2*PI)) - log(this->s2) - (RE[i] - this->u2)*(RE[i] - this->u2)/(2.0*this->s2*this->s2)); + } + + //update variables of the distributions (interwoven with pi loop to save cpu) + this->p = p_sum / this->N; + this->u1 = u1_sum / p_sum; + this->u2 = u2_sum / q_sum; + + double s1_sum = 0.0, s2_sum = 0.0; + for(unsigned int i = 0; i < N; ++i) { + s1_sum += pr[i] * (RE[i] - this->u1)*(RE[i] - this->u1); + s2_sum += (1.0-pr[i]) * (RE[i] - this->u2)*(RE[i] - this->u2); + } + + this->s1 = sqrt(s1_sum/p_sum); + this->s2 = sqrt(s2_sum/q_sum); + + if(fabs(this->Q - Q_new) < 1e-5) { break; } + this->Q = Q_new; + } + cerr << "Q: " << Q << endl; + } + + void print_model() { + cout << "Q: " << Q << " p: " << p << " norm(" << u1 << "," << s1 << ");norm(" << u2 << "," << s2 << ")" << endl; + } +}; + + +ostream &operator<<( ostream &out, const SNP &h ) { + out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total; + + return out; +} + + +void read_snps(const char* filename, SNPs& snps) { + string delim("\t"); + + ifstream feat(filename); + size_t N = 0; + while(feat.peek() != EOF) { + char line[1024]; + feat.getline(line,1024,'\n'); + N++; + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; } + + string name = fields[0]; + string chr = fields[1]; + unsigned int pos = atoi(fields[2].c_str()); + char base = (fields[3])[0]; + + SNP snp(name,chr,pos,base); + snps.push_back(snp); + } + + //sort the features so we can run through it once + std::stable_sort(snps.begin(),snps.end()); + feat.close(); + + cerr << "Found and sorted " << snps.size() << " snps." << endl; +} + +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "newcontam") { continue; } + if(chr == "NA") { continue; } + + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + string seq; + if(strand == 0) { seq = fields[0]; } else { strrevcomp(seq,fields[0]); } + Read read(chr,pos,seq); + features.push_back(read); + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end()); + cerr << "Found and sorted " << features.size() << " reads." << endl; +} + +void read_window_file(const char* filename, Windows& ws) { + string delim("\t"); + + ifstream win_file(filename); + + unsigned int N = 0; + while(win_file.peek() != EOF) { + char line[1024]; + win_file.getline(line,1024,'\n'); + N++; + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; } + + string name = fields[0]; + string chr = fields[1]; + if(chr == "NA") { continue; } + if(chr == "contam") { continue; } + int start = atoi(fields[2].c_str()); + int stop = atoi(fields[3].c_str()); + + Window w(name,chr,start,stop-start+1); + ws.push_back(w); + } + + //sort the features so we can run through it once + std::stable_sort(ws.begin(),ws.end()); + win_file.close(); + + cerr << "Found and sorted " << ws.size() << " windows." << endl; +} + +void count_read_in_features(Windows& windows, Reads& data) { + Windows::iterator wind_it = windows.begin(); + + for(Reads::iterator i = data.begin(); i != data.end(); ++i) { + //skip to first feature after read + string start_chr = wind_it->chr; + while(wind_it != windows.end() && (wind_it->chr < i->chr || (wind_it->chr == i->chr && wind_it->pos + wind_it->length < i->pos) )) { + wind_it++; + } + + //stop if we have run out of features. + if(wind_it == windows.end()) { break; } + + if(i->pos + i->length > wind_it->pos && i->pos < (wind_it->pos + wind_it->length)) { + wind_it->add_read(*i); + } + } +} + +void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) { + char temp[1024]; + + string chrom = peaks[0].chr; + string chrom_filename = chrom_filenames[chrom]; + ifstream chrom_file(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + size_t offset = chrom_file.gcount(); + for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) { + if(i->chr != chrom) { + chrom = i->chr; + chrom_filename = chrom_filenames[chrom]; + chrom_file.close(); chrom_file.open(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + offset = chrom_file.gcount(); + } + unsigned int begin = i->pos - 1; + unsigned int end = i->pos+i->length-2; + + unsigned int begin_pos = offset + (int)begin/50 + begin; + unsigned int end_pos = offset + (int)end/50 + end; + + unsigned int read_len = end_pos - begin_pos; + char buffer[read_len+1]; + chrom_file.seekg(begin_pos, ios_base::beg); + chrom_file.read(buffer, read_len); + buffer[read_len] = '\0'; + i->set_sequence(buffer); + } + chrom_file.close(); +} + + +int main(int argc, char** argv) { + if(argc != 4) { cerr << "Usage: " << argv[0] << " read_file window_file chromosome_file\n"; exit(1); } + + char read_filename[1024]; strcpy(read_filename,argv[1]); + char window_filename[1024]; strcpy(window_filename,argv[2]); + char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]); + + Windows windows; read_window_file(window_filename, windows); + ChromList reference_seq(chromosome_filename); + + retrieveSequenceData(reference_seq, windows); + + cerr << "Established reference sequences\n"; + + Reads reads; read_align_file(read_filename, reads); + + count_read_in_features(windows, reads); + + for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) { + //w->print_consensus(cout); + //w->print_logo(cout); + w->print_RE(cerr); + w->print_fasta(cout); + } +} + +void strrevcomp(string& output, const string& input) +{ + output = input; + unsigned int i; + + for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; } + + for (unsigned int p1 = 0; p1 < output.length(); ++p1) { + if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; } + else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; } + else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; } + else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; } + } +} + diff --git a/htswanalysis/src/GetReadsInSnps/patch_genome.cpp b/htswanalysis/src/GetReadsInSnps/patch_genome.cpp new file mode 100644 index 0000000..7b97a6d --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/patch_genome.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include + +#include "defs.h" +#include "util.h" +#include "chrom_list.h" + +using namespace std; + +string retrieveSequenceData(ChromList chrom_filenames, string chr, unsigned int pos, unsigned int len, string& seq); + +int main(int argc, char** argv) { + if(argc != 3) { cerr << "Usage: " << argv[0] << " chromosomes_file variants_file" << endl; return(0); } + + char chromosome_filename[1024]; strcpy(chromosome_filename,argv[1]); + char variants_filename[1024]; strcpy(variants_filename,argv[2]); + ChromList chromosome_files(chromosome_filename); + ifstream var(variants_filename); + char buffer[1024]; + unsigned int count = 0; + while(var.getline(buffer, 1024)) { + string temp(buffer); string delim("\t"); + vector words; + split(temp, delim, words); + if(words.size() != 3) { cerr << "Error in variant list. Expected 3 columns, but found " << words.size() << endl; exit(1); } + + unsigned int pos = atoi(words[1].c_str()); + string seq; + retrieveSequenceData(chromosome_files, words[0], pos-50, 100, seq); + seq[49] = words[2][0]; + cout << ">" << words[0] << "|" << pos << "\n" << seq << endl; + count++; + } + var.close(); + cerr << "Patched genome in " << count << " places\n"; +} + +string retrieveSequenceData(ChromList chrom_filenames, string chr, unsigned int pos, unsigned int len, string& seq) { + + //pick out the correct chromosome file + string chrom_filename = chrom_filenames[chr]; + ifstream chrom_file(chrom_filename.c_str()); + + //knock off the header line, and account for its size contribution to the file + char temp[1024]; chrom_file.getline(temp, 1024); + size_t offset = chrom_file.gcount(); + + //calculate positions in fasta file to extract. + //assumes 50bp lines + unsigned int begin_pos = offset + (int)pos/50 + pos; + unsigned int end_pos = offset + (int)(pos+len)/50 + (pos+len); + + //pull out all sequence, including newlines + unsigned int read_len = end_pos - begin_pos; + char buffer[read_len+1]; buffer[read_len] = '\0'; + chrom_file.seekg(begin_pos, ios_base::beg); + chrom_file.read(buffer, read_len); + chrom_file.close(); + + //remove newlines, leaving just the dna sequence + unsigned int a; + seq = buffer; + while( (a= (seq.find("\n"))) != string::npos) { + seq.erase(a,1); + } + + return seq; +} diff --git a/htswanalysis/src/GetReadsInSnps/util.cpp b/htswanalysis/src/GetReadsInSnps/util.cpp new file mode 100644 index 0000000..dcaf73e --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/util.cpp @@ -0,0 +1,21 @@ +#ifndef UTIL_H +#define UTIL_H + +#include +#include + +using namespace std; + +void split (const string& text, const string& separators, vector& words) { + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} + +#endif diff --git a/htswanalysis/src/GetReadsInSnps/util.h b/htswanalysis/src/GetReadsInSnps/util.h new file mode 100644 index 0000000..8a73b63 --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/util.h @@ -0,0 +1,11 @@ +#ifndef UTIL_H +#define UTIL_H + +#include +#include + +using namespace std; + +void split (const string& text, const string& separators, vector& words); + +#endif -- 2.30.2