12 #include <gsl/gsl_statistics.h>
18 void split (const string& text, const string& separators, vector<string>& words);
19 char *strrevcomp(const string& input);
26 Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; }
27 Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; }
28 Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; }
30 bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } }
34 class Read : public Loci {
38 Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; }
39 Read(const Read& r) : Loci(r) { this->seq = r.seq; }
40 Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;}
43 class SNP : public Loci {
55 SNP(string name, string chr, unsigned int pos, char reference_base) : Loci(chr,pos) {
57 this->A = 0; this->C = 0; this->G = 0; this->T = 0; this->total = 0;
58 this->reference_base = reference_base;
61 SNP(const SNP& h) : Loci(h) {
63 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
64 this->reference_base = h.reference_base;
67 SNP& operator=(const SNP& h) {
71 this->A = h.A; this->C = h.C; this->G = h.G; this->T = h.T; this->total = h.total;
72 this->reference_base = h.reference_base;
76 void add_read(char nuc) {
97 ostream &operator<<( ostream &out, const SNP &h ) {
98 out << h.name.c_str() << "\t" << h.chr.c_str() << "\t" << h.pos << "\t" << h.reference_base << "\t" << h.A << "\t" << h.C << "\t" << h.G << "\t" << h.T << "\t" << h.total;
102 typedef vector<Read> Reads;
103 typedef vector<SNP> SNPs;
105 void read_snps(const char* filename, SNPs& snps) {
108 ifstream feat(filename);
110 while(feat.peek() != EOF) {
112 feat.getline(line,1024,'\n');
114 string line_str(line);
115 vector<string> fields;
116 split(line_str, delim, fields);
117 if(fields.size() != 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
119 string name = fields[0];
120 string chr = fields[1];
121 unsigned int pos = atoi(fields[2].c_str());
122 char base = (fields[3])[0];
124 SNP snp(name,chr,pos,base);
128 //sort the features so we can run through it once
129 std::stable_sort(snps.begin(),snps.end());
132 cerr << "Found and sorted " << snps.size() << " snps." << endl;
135 void read_align_file(char* filename, Reads& features) {
137 string location_delim(":");
138 char strand_str[2]; strand_str[1] = '\0';
139 ifstream seqs(filename);
141 while(seqs.peek() != EOF) {
143 seqs.getline(line,2048,'\n');
145 string line_str(line);
146 vector<string> fields;
147 split(line_str, delim, fields);
148 if(fields.size() != 7) { continue; }
151 vector<string> location; split(fields[3], location_delim, location);
152 string chr = location[0];
153 if(chr == "NA") { continue; }
154 int pos = atoi(location[1].c_str());
155 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
161 seq = strrevcomp(fields[0]);
163 Read read(chr,pos,seq);
164 features.push_back(read);
168 //sort the data so we can run through it once
169 std::sort(features.begin(),features.end());
170 cerr << "Found and sorted " << features.size() << " reads." << endl;
173 void count_read_at_snps(SNPs& snps, Reads& data) {
174 SNPs::iterator snp_it = snps.begin();
176 //assume, for now, that all reads have the same length
177 unsigned int read_len = data[0].seq.length();
179 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
180 //skip to first feature after read
181 string start_chr = snp_it->chr;
182 while(snp_it != snps.end() && *snp_it < *i) {
186 //stop if we have run out of features.
187 if(snp_it == snps.end()) { break; }
189 if(i->pos + read_len > snp_it->pos && i->pos <= snp_it->pos) {
190 snp_it->add_read(i->seq[snp_it->pos - i->pos]);
195 int main(int argc, char** argv) {
196 if(argc != 3) { cerr << "Usage: " << argv[0] << " snp_file read_file\n"; exit(1); }
198 char snp_filename[1024]; strcpy(snp_filename,argv[1]);
199 char read_filename[1024]; strcpy(read_filename,argv[2]);
201 SNPs snps; read_snps(snp_filename, snps);
202 Reads reads; read_align_file(read_filename, reads);
204 count_read_at_snps(snps, reads);
206 for(SNPs::iterator i = snps.begin(); i != snps.end(); ++i) {
207 cout << (*i) << endl;
211 void split (const string& text, const string& separators, vector<string>& words) {
213 size_t n = text.length ();
214 size_t start = text.find_first_not_of (separators);
217 size_t stop = text.find_first_of (separators, start);
218 if (stop > n) stop = n;
219 words.push_back (text.substr (start, stop-start));
220 start = text.find_first_not_of (separators, stop+1);
224 char *strrevcomp(const string& input)
226 char* str = new char[input.length()];
227 strcpy(str,input.c_str());
234 for (p1 = str, p2 = str + strlen(str) - 1; p2 > p1; ++p1, --p2) {
240 for (p1 = str; p1 < str + strlen(str); ++p1) {
241 if(*p1 == 'a' || *p1 == 'A') { *p1 = 'T'; }
242 else if(*p1 == 'c' || *p1 == 'C') { *p1 = 'G'; }
243 else if(*p1 == 'g' || *p1 == 'G') { *p1 = 'C'; }
244 else if(*p1 == 't' || *p1 == 'T') { *p1 = 'A'; }