From 8d65240e9f54c38c1aa8f287e8849edacc52ed4d Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Wed, 24 Dec 2008 21:07:23 +0000 Subject: [PATCH] Bugfixes --- htswanalysis/src/GetReadsInSnps/Makefile | 10 +- .../src/GetReadsInSnps/getRegionConsensus.cpp | 254 ++++-------------- 2 files changed, 60 insertions(+), 204 deletions(-) diff --git a/htswanalysis/src/GetReadsInSnps/Makefile b/htswanalysis/src/GetReadsInSnps/Makefile index f8c3488..ae11c5c 100644 --- a/htswanalysis/src/GetReadsInSnps/Makefile +++ b/htswanalysis/src/GetReadsInSnps/Makefile @@ -1,10 +1,16 @@ CPPFLAGS=-g -Wall -I/opt/local/include #LDFLAGS=-lgsl -lgslcblas -lm -lduma -L/opt/local/lib #CPPFLAGS=-g -Wall -O3 -I/opt/local/include -LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib +LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib -L../SRLib + +all: getReadsInSnps + +getReadsInSnps: getReadsInSnps.o ../SRLib/Loci.o ../SRLib/Read.o ../SRLib/SNP.o ../SRLib/Util.o + g++ $(LDFLAGS) -L../SRLib $^ -o $@ all: patch_genome getReadsInSnps getRegionConsensus patch_genome: patch_genome.cpp chrom_list.cpp util.cpp -getRegionConsensus: getRegionConsensus.cpp chrom_list.cpp util.cpp +getRegionConsensus: ../SRLib/chrom_list.o ../SRLib/util.o ../SRLib/read.o ../SRLib/loci.o ../SRLib/nuc.o + g++ getRegionConsensus.cpp $(CPPFLAGS) $(LDFLAGS) $^ -o $@ diff --git a/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp b/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp index 4cbd878..abc70f7 100755 --- a/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp +++ b/htswanalysis/src/GetReadsInSnps/getRegionConsensus.cpp @@ -24,8 +24,11 @@ #include -#include "chrom_list.h" -#include "util.h" +#include "../SRLib/chrom_list.h" +#include "../SRLib/util.h" +#include "../SRLib/loci.h" +#include "../SRLib/read.h" +#include "../SRLib/nuc.h" #define WINDOW 25 #define PI 3.14159265358979323846 @@ -38,175 +41,6 @@ using namespace std; -void strrevcomp(string& output, const string& input); - -double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); } - -class Loci { - public: - string chr; - unsigned int pos; - - Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; } - Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; } - Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; } - - bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } } - bool operator<=(const Loci& a) const { if(this->chr == a.chr) { return this->pos <= a.pos; } else { return this->chr < a.chr; } } - - bool operator>=(const Loci& a) const { if(this->chr == a.chr) { return this->pos >= a.pos; } else { return this->chr > a.chr; } } - bool operator>(const Loci& a) const { if(this->chr == a.chr) { return this->pos > a.pos; } else { return this->chr > a.chr; } } - - int operator-(const Loci& a) const { if(this->chr == a.chr) { return this->pos - a.pos; } else { return INT_MAX; } } - -}; - - -class Read : public Loci { - public: - string seq; - - unsigned int length() const { return seq.length(); } - - Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; } - Read(const Read& r) : Loci(r) { this->seq = r.seq; } - Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;} - - char operator[](size_t off) const { - if(off < seq.length()) { return seq[off]; } else { return -1; } - } -}; - -typedef vector Reads; - -class Nuc { - protected: - unsigned int n[4]; - - //background nucleotide probabilities - // - //can change this to a background model class later if needed - static double qA; - static double qC; - static double qG; - static double qT; - - // pseudocount to avoid divide-by-zero errors - static double pseudocount; - - public: - - Nuc() { - n[0] = 0; - n[1] = 0; - n[2] = 0; - n[3] = 0; - } - - Nuc(const Nuc& n) { - this->n[0] = n.at(0); - this->n[1] = n.at(1); - this->n[2] = n.at(2); - this->n[3] = n.at(3); - } - - Nuc& operator=(const Nuc& n) { - if (this != &n) { - this->n[0] = n.at(0); - this->n[1] = n.at(1); - this->n[2] = n.at(2); - this->n[3] = n.at(3); - } - return *this; - } - - void add_nuc(char b) { - switch(b) { - case 'a': case 'A': n[0]++; break; - case 'c': case 'C': n[1]++; break; - case 'g': case 'G': n[2]++; break; - case 't': case 'T': n[3]++; break; - }; - } - - char nth_nuc(unsigned int i) { - if(i >= size()) { return 'N'; } - else if(i < n[0]) { return 'A'; } - else if(i < n[0] + n[1]) { return 'C'; } - else if(i < n[0] + n[1] + n[2]) { return 'G'; } - else { return 'T'; } - } - - unsigned int size() { return n[0] + n[1] + n[2] + n[3]; } - - unsigned int& operator[](size_t b) { return n[b]; } - unsigned int at(size_t b) const { return n[b]; } - - double RE() { - - /* - double total = n[0] + n[1] + n[2] + n[3] + 4*Nuc::pseudocount; - double pA = (Nuc::pseudocount + n[0]) / total; - double pC = (Nuc::pseudocount + n[1]) / total; - double pG = (Nuc::pseudocount + n[2]) / total; - double pT = (Nuc::pseudocount + n[3]) / total; - - return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT); - */ - - unsigned int max = 0; unsigned int max_idx = 0; - for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } - unsigned int max2 = 0; unsigned int max_idx2 = 0; - for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } } - - if(max_idx == max_idx2) { max_idx2++; } - - double total = n[max_idx] + n[max_idx2]; - double p1 = (Nuc::pseudocount + n[max_idx]) / total; - double p2 = (Nuc::pseudocount + n[max_idx2]) / total; - - return p1*log2(p1/Nuc::qA) + p2*log2(p2/Nuc::qC); - } - - char consensus() { - unsigned int N = size(); - if(N == 0) { return ' '; } - unsigned int max = 0; unsigned int max_idx = 0; - for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } - - unsigned int max2 = 0; unsigned int max_idx2 = 0; - for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } } - - //For now pick arbitrary zygosity thresholds. Later, update to use mixture model. - char c = '\0'; - if(RE() >= 1.25) { - //homozygous - switch(max_idx) { - case 0: c = 'A'; break; - case 1: c = 'C'; break; - case 2: c = 'G'; break; - case 3: c = 'T'; break; - } - } else { - switch(max_idx | max_idx2) { - case 1: c = 'M'; break; //A,C - case 2: c = 'R'; break; //A,G - case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G - case 4: c = 'Y'; break; //C,T - case 5: c = 'K'; break; //G,T - } - } - - if(N < 10) { return tolower(c); } else { return c; } - } -}; - -double Nuc::pseudocount = 1e-10; -double Nuc::qA = 0.25; -double Nuc::qC = 0.25; -double Nuc::qG = 0.25; -double Nuc::qT = 0.25; - class Window : public Loci { public: //optional name for the window @@ -222,11 +56,10 @@ class Window : public Loci { Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) { this->name = name; this->length = length; - this->sequence = ""; - if(length > 10000) { cerr << "ERROR: window of size " << length << endl; exit(1); } - seq.resize(length); + this->sequence = string(length,'\0'); + seq.resize(this->sequence.length()); this->reads = 0; } @@ -258,6 +91,7 @@ class Window : public Loci { unsigned int a; //clear out endlines while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); } + seq.clear(); seq.resize(sequence.length()); } string get_sequence() { @@ -266,14 +100,34 @@ class Window : public Loci { void add_read(const Read& r) { if(this->chr != r.chr) return; - int offset = r - (*this); - this->reads++; - for(unsigned int i = 0; i < r.length(); i++) { - int seq_idx = offset + i; - if(seq_idx > 0 && (unsigned)seq_idx >= seq.size()) { cerr << "Error: offset: " << offset << " seq_idx " << seq_idx << " size: " << seq.size() << " seqlen: " << sequence.length() << endl; } - if(seq_idx < 0 || (seq_idx >= 0 && (unsigned)seq_idx > this->length) ) { continue; } - seq[offset + i].add_nuc(r[i]); + + int w_offset = 0; + int r_offset = 0; + int overlap_len = r.length(); + + //if the read begins before the window starts: + // ---(====[====)------]------ + if(r.pos < this->pos) { + r_offset = this->pos - r.pos; + w_offset = 0; + } else { + w_offset = r.pos - this->pos; + r_offset = 0; + } + + //if the read ends after the window ends + // ----[----(====]======)----- + if(r.pos + r.length() >= this->pos + this->length) { + overlap_len = (this->pos + this->length) - r.pos; + } else { + overlap_len = r.length(); } + + for(; overlap_len > 0; overlap_len--) { + seq[w_offset++].add_nuc(r[r_offset++]); + } + + this->reads++; } void print_consensus(ostream& o) { @@ -303,6 +157,10 @@ class Window : public Loci { void print_fasta(ostream& o) { unsigned int line_len = 100; + + if(sequence.length() != seq.size()) { + cerr << "Size mismatch: sequence is " << sequence.length() << " but data has " << seq.size() << endl; + } string output = ""; vector variants; @@ -343,7 +201,7 @@ class Window : public Loci { char ref = toupper(sequence[i]); char con = toupper(seq[i].consensus()); if(con != ' ' && con != ref) { - o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i][0] << "," << seq[i][1] << "," << seq[i][2] << "," << seq[i][3] << "]" << endl; + o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i].A() << "," << seq[i].C() << "," << seq[i].G() << "," << seq[i].T() << "]" << endl; } } } @@ -351,7 +209,7 @@ class Window : public Loci { void print_logo(ostream& o) { unsigned int max = 0; for(unsigned int i = 0; i < sequence.length(); i++) { - if(seq[i].size() > max) { max = seq[i].size(); } + if(seq[i].N() > max) { max = seq[i].N(); } } for(unsigned int i = 0; i < max; i++) { @@ -641,7 +499,7 @@ void read_snps(const char* filename, SNPs& snps) { std::stable_sort(snps.begin(),snps.end()); feat.close(); - cerr << "Found and sorted " << snps.size() << " snps." << endl; + cerr << "Found AND sorted " << snps.size() << " snps." << endl; } void read_align_file(char* filename, Reads& features) { @@ -668,8 +526,8 @@ void read_align_file(char* filename, Reads& features) { bool strand = ((fields[4].c_str())[0] == 'F')?0:1; string seq; - if(strand == 0) { seq = fields[0]; } else { strrevcomp(seq,fields[0]); } - Read read(chr,pos,seq); + if(strand == 0) { seq = fields[0]; } else { revcomp(seq,fields[0]); } + Read read(chr,pos,0,seq); features.push_back(read); } seqs.close(); @@ -742,6 +600,10 @@ void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) { for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) { if(i->chr != chrom) { chrom = i->chr; + cout << "XXX: " << (*(chrom_filenames.find(chrom))).first << endl; + if(chrom_filenames.find(chrom) == chrom_filenames.end()) { + cerr << "Chrom: " << chrom << " not found\n"; + } chrom_filename = chrom_filenames[chrom]; chrom_file.close(); chrom_file.open(chrom_filename.c_str()); chrom_file.getline(temp, 1024); @@ -772,6 +634,8 @@ int main(int argc, char** argv) { char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]); Windows windows; read_window_file(window_filename, windows); + + if(windows.size() == 0) { cout << "No windows loaded." << endl; exit(0); } ChromList reference_seq(chromosome_filename); retrieveSequenceData(reference_seq, windows); @@ -780,6 +644,8 @@ int main(int argc, char** argv) { Reads reads; read_align_file(read_filename, reads); + if(reads.size() == 0) { cout << "No reads loaded." << endl; exit(0); } + count_read_in_features(windows, reads); for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) { @@ -789,19 +655,3 @@ int main(int argc, char** argv) { w->print_fasta(cout); } } - -void strrevcomp(string& output, const string& input) -{ - output = input; - unsigned int i; - - for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; } - - for (unsigned int p1 = 0; p1 < output.length(); ++p1) { - if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; } - else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; } - else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; } - else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; } - } -} - -- 2.30.2