12 #include <gsl/gsl_statistics.h>
18 void split (const string& text, const string& separators, vector<string>& words);
26 Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
27 Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
28 Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
31 typedef enum ReadType {MSP1,HPA2};
43 unsigned int msp1_count;
44 unsigned int hpa2_count;
47 Region(string name, string chr, int start, int end, bool strand) {
52 this->strand = strand;
57 Region(const Region& h) {
60 this->start = h.start;
62 this->strand = h.strand;
63 this->msp1_count = h.msp1_count;
64 this->hpa2_count = h.hpa2_count;
67 Region& operator=(const Region& h) {
70 this->start = h.start;
72 this->strand = h.strand;
73 this->msp1_count = h.msp1_count;
74 this->hpa2_count = h.hpa2_count;
79 bool Region_lt_Read(const Region& feat, const Read& read) {
80 //First compare based on chromosomes
81 if(read.chr == feat.chr) { return feat.end < read.pos; }
82 else { return feat.chr < read.chr; }
85 ostream &operator<<( ostream &out, const Region &h ) {
86 out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.msp1_count << "\t" << h.hpa2_count;
90 typedef vector<Read> Reads;
91 typedef vector<Region> Regions;
93 bool compare_regions(const Region &a, const Region &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
94 bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
96 void read_features(const char* filename, Regions& features) {
99 ifstream feat(filename);
101 while(feat.peek() != EOF) {
103 feat.getline(line,1024,'\n');
105 string line_str(line);
106 vector<string> fields;
107 split(line_str, delim, fields);
108 if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
110 string name = fields[0];
111 string chr = fields[1];
112 if(chr == "NA") { continue; }
113 int start = atoi(fields[2].c_str());
114 int stop = atoi(fields[3].c_str());
117 Region feat(name,chr,start,stop,strand);
118 features.push_back(feat);
121 //sort the features so we can run through it once
122 std::stable_sort(features.begin(),features.end(),compare_regions);
126 void read_align_file(char* filename, Reads& features) {
128 string location_delim(":");
129 char strand_str[2]; strand_str[1] = '\0';
130 ifstream seqs(filename);
132 while(seqs.peek() != EOF) {
134 seqs.getline(line,2048,'\n');
136 string line_str(line);
137 vector<string> fields;
138 split(line_str, delim, fields);
139 if(fields.size() != 7) { continue; }
141 vector<string> location; split(fields[3], location_delim, location);
142 string chr = location[0];
143 if(chr == "NA") { continue; }
144 int pos = atoi(location[1].c_str());
145 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
148 Read read(chr,pos,false);
149 features.push_back(read);
151 Read read(chr,pos+25,true);
152 features.push_back(read);
157 //sort the data so we can run through it once
158 std::sort(features.begin(),features.end(),compare_reads);
159 cerr << "Reads sorted\n";
162 unsigned int count_regions_assayed(Regions& r, unsigned int threshold = 1) {
163 unsigned int n_assayed = 0;
164 for(Regions::iterator i = r.begin(); i != r.end(); ++i) {
165 if(i->msp1_count >= threshold) { n_assayed++; }
170 unsigned int count_regions_methylated(Regions& r, unsigned int msp_threshold = 1, unsigned int hpa_threshold = 1) {
171 unsigned int n_methylated = 0;
172 for(Regions::iterator i = r.begin(); i != r.end(); ++i) {
173 if(i->msp1_count >= msp_threshold && i->hpa2_count >= hpa_threshold) { n_methylated++; }
178 unsigned int count_errors(Regions& r, unsigned int msp_threshold = 1, unsigned int hpa_threshold = 1) {
179 unsigned int n_error = 0;
180 for(Regions::iterator i = r.begin(); i != r.end(); ++i) {
181 if(i->msp1_count < msp_threshold && i->hpa2_count >= hpa_threshold) { n_error++; }
186 pair<unsigned int, unsigned int> count_region_tags(Regions& r) {
187 pair<unsigned int, unsigned int> counts; counts.first = 0; counts.second = 0;
188 for(Regions::iterator i = r.begin(); i != r.end(); ++i) {
189 counts.first += i->msp1_count;
190 counts.second += i->hpa2_count;
195 void count_read_in_features(Regions& features, Reads& data, ReadType type) {
196 Regions::iterator feat_it = features.begin();
198 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
199 //skip to first feature after read
200 string start_chr = feat_it->chr;
201 while(feat_it != features.end() && Region_lt_Read(*feat_it, *i)) {
205 //stop if we have run out of features.
206 if(feat_it == features.end()) { break; }
208 if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
209 if(type == MSP1) { feat_it->msp1_count++; }
210 if(type == HPA2) { feat_it->hpa2_count++; }
215 int compare_double(const void* a, const void* b) {
216 return *(double*)a - *(double*)b;
219 pair<unsigned int, unsigned int> count_region_tags(Regions& r);
220 unsigned int count_regions_assayed(Regions& r, unsigned int threshold);
221 unsigned int count_regions_methylated(Regions& r, unsigned int msp_threshold, unsigned int hpa_threshold);
222 unsigned int count_errors(Regions& r, unsigned int msp_threshold, unsigned int hpa_threshold);
224 int main(int argc, char** argv) {
225 if(argc < 4) { cerr << "Usage: " << argv[0] << " msp1_reads hpa2_reads regions\n"; exit(1); }
227 char msp1_filename[1024]; strcpy(msp1_filename,argv[1]);
228 char hpa2_filename[1024]; strcpy(hpa2_filename,argv[2]);
229 char region_filename[1024]; strcpy(region_filename,argv[3]);
231 Reads msp1; read_align_file(msp1_filename, msp1);
232 Reads hpa2; read_align_file(hpa2_filename, hpa2);
234 Regions regions; read_features(region_filename, regions);
236 count_read_in_features(regions, msp1, MSP1);
237 count_read_in_features(regions, hpa2, HPA2);
239 pair<unsigned int,unsigned int> total = count_region_tags(regions);
240 cout << total.first << " MSP1 tags, " << total.second << " HPA2 tags" << endl;
242 unsigned int n_assayed = count_regions_assayed(regions);
243 cout << n_assayed << " of " << regions.size() << " regions assayed (" << (double)n_assayed/(double)regions.size() << ")\n";
245 unsigned int n_methylated = count_regions_methylated(regions);
246 cout << n_methylated << " of " << n_assayed << " regions methylated (" << (double)n_methylated/(double)n_assayed << ")\n";
248 unsigned int n_error = count_errors(regions);
249 cout << n_error << " of " << regions.size() << " regions with error (" << (double)n_error/(double)regions.size() << ")\n";
252 ofstream assayed("Assayed.bed");
253 ofstream methylated("Methylated.bed");
254 ofstream methylcalls("MethylseqCalls.tab");
256 unsigned int msp_threshold = 1;
257 unsigned int hpa_threshold = 1;
259 for(unsigned int idx = 0; idx < regions.size(); idx++) {
261 bool is_methylated = 0;
262 if(regions[idx].msp1_count >= msp_threshold) {
263 assayed << regions[idx].chr << "\t" << regions[idx].start << "\t" << regions[idx].end << endl;
265 if(regions[idx].hpa2_count >= hpa_threshold) {
266 methylated << regions[idx].chr << "\t" << regions[idx].start << "\t" << regions[idx].end << endl;
270 methylcalls << regions[idx].name << "\t" << regions[idx].chr << "\t" << regions[idx].start << "\t" << regions[idx].end << "\t" << regions[idx].msp1_count << "\t" << regions[idx].hpa2_count << "\t" << is_assayed << "\t" << is_methylated << endl;
274 void split (const string& text, const string& separators, vector<string>& words) {
276 size_t n = text.length ();
277 size_t start = text.find_first_not_of (separators);
280 size_t stop = text.find_first_of (separators, start);
281 if (stop > n) stop = n;
282 words.push_back (text.substr (start, stop-start));
283 start = text.find_first_not_of (separators, stop+1);