From dae5566d25d2af65622b73e0e3a50350038004f0 Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Fri, 12 Dec 2008 00:34:23 +0000 Subject: [PATCH] Initial checkin of code to evaluate known snps from dbsnp in solexa sequencing --- htswanalysis/src/GetReadsInSnps/Makefile | 2 + .../src/GetReadsInSnps/getReadsInSnps.cpp | 254 ++++++++++++++++++ 2 files changed, 256 insertions(+) create mode 100644 htswanalysis/src/GetReadsInSnps/Makefile create mode 100755 htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp diff --git a/htswanalysis/src/GetReadsInSnps/Makefile b/htswanalysis/src/GetReadsInSnps/Makefile new file mode 100644 index 0000000..a5225be --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/Makefile @@ -0,0 +1,2 @@ +CPPFLAGS=-g -Wall -O3 -I/opt/local/include +LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib diff --git a/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp b/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp new file mode 100755 index 0000000..eb2b5dc --- /dev/null +++ b/htswanalysis/src/GetReadsInSnps/getReadsInSnps.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define WINDOW 25 + +using namespace std; + +void split (const string& text, const string& separators, vector& words); +char *strrevcomp(const string& input); + +class Read { + public: + string chr; + unsigned int pos; + string seq; + + Read(string chr, unsigned int pos, string seq) { this->chr = chr; this->pos = pos; this->seq = seq; } + Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; } + Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;} +}; + +class SNP { + public: + + string name; + string chr; + unsigned int pos; + char reference_base; + unsigned int A; + unsigned int C; + unsigned int G; + unsigned int T; + unsigned int N; + unsigned int total; + + SNP(string name, string chr, unsigned int pos, char reference_base) { + this->name = name; + this->chr = chr; + this->pos = pos; + this->A = 0; this->C = 0; this->G = 0; this->T = 0; this->total = 0; + this->reference_base = reference_base; + } + + SNP(const SNP& h) { + this->name = h.name; + this->chr = h.chr; + this->pos = h.pos; + this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total; + this->reference_base = h.reference_base; + } + + SNP& operator=(const SNP& h) { + this->name = h.name; + this->chr = h.chr; + this->pos = h.pos; + this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total; + this->reference_base = h.reference_base; + return *this; + } + + void add_read(char nuc) { + switch(nuc) { + case 'a': + case 'A': + A++; break; + case 'c': + case 'C': + C++; break; + case 'g': + case 'G': + G++; break; + case 't': + case 'T': + T++; break; + default: + N++; break; + } + total++; + } +}; + +bool SNP_lt_Read(const SNP& feat, const Read& read) { + //First compare based on chromosomes + if(read.chr == feat.chr) { return feat.pos < read.pos; } + else { return feat.chr < read.chr; } +} + +ostream &operator<<( ostream &out, const SNP &h ) { + out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total; + return out; +} + +typedef vector Reads; +typedef vector SNPs; + +bool compare_snps(const SNP &a, const SNP &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.pos < b.pos; } } +bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } + +void read_snps(const char* filename, SNPs& snps) { + string delim("\t"); + + ifstream feat(filename); + size_t N = 0; + while(feat.peek() != EOF) { + char line[1024]; + feat.getline(line,1024,'\n'); + N++; + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; } + + string name = fields[0]; + string chr = fields[1]; + unsigned int pos = atoi(fields[2].c_str()); + char base = (fields[3])[0]; + + SNP snp(name,chr,pos,base); + snps.push_back(snp); + } + + //sort the features so we can run through it once + std::stable_sort(snps.begin(),snps.end(),compare_snps); + feat.close(); + + cerr << "Found and sorted " << snps.size() << " snps." << endl; +} + +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "NA") { continue; } + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + string seq; + if(strand == 0) { + seq = fields[0]; + } else { + seq = strrevcomp(fields[0]); + } + Read read(chr,pos,seq); + features.push_back(read); + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end(),compare_reads); + cerr << "Found and sorted " << features.size() << " reads." << endl; +} + +void count_read_at_snps(SNPs& snps, Reads& data) { + SNPs::iterator snp_it = snps.begin(); + + //assume, for now, that all reads have the same length + unsigned int read_len = data[0].seq.length(); + + for(Reads::iterator i = data.begin(); i != data.end(); ++i) { + //skip to first feature after read + string start_chr = snp_it->chr; + while(snp_it != snps.end() && SNP_lt_Read(*snp_it, *i)) { + snp_it++; + } + + //stop if we have run out of features. + if(snp_it == snps.end()) { break; } + + if(i->pos + read_len > snp_it->pos && i->pos <= snp_it->pos) { + cout << i->chr.c_str() << "\t" << i->pos << "\t" << i->seq.c_str() << endl; + snp_it->add_read(i->seq[snp_it->pos - i->pos]); + } + } +} + +int main(int argc, char** argv) { + if(argc != 3) { cerr << "Usage: " << argv[0] << " snp_file read_file\n"; exit(1); } + + char snp_filename[1024]; strcpy(snp_filename,argv[1]); + char read_filename[1024]; strcpy(read_filename,argv[2]); + + SNPs snps; read_snps(snp_filename, snps); + Reads reads; read_align_file(read_filename, reads); + + count_read_at_snps(snps, reads); + + for(SNPs::iterator i = snps.begin(); i != snps.end(); ++i) { + cout << (*i) << endl; + } +} + +void split (const string& text, const string& separators, vector& words) { + + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} + +char *strrevcomp(const string& input) +{ + char* str = new char[input.length()]; + strcpy(str,input.c_str()); + + char *p1, *p2; + + if (! str || ! *str) + return str; + + for (p1 = str, p2 = str + strlen(str) - 1; p2 > p1; ++p1, --p2) { + *p1 ^= *p2; + *p2 ^= *p1; + *p1 ^= *p2; + } + + for (p1 = str; p1 < str + strlen(str); ++p1) { + if(*p1 == 'a' || *p1 == 'A') { *p1 = 'T'; } + else if(*p1 == 'c' || *p1 == 'C') { *p1 = 'G'; } + else if(*p1 == 'g' || *p1 == 'G') { *p1 = 'C'; } + else if(*p1 == 't' || *p1 == 'T') { *p1 = 'A'; } + } + + return str; +} + -- 2.30.2