12 #include <gsl/gsl_statistics.h>
18 void split (const string& text, const string& separators, vector<string>& words);
26 Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
27 Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
28 Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
40 unsigned int count[2];
42 Hit(string name, string chr, int start, int end, bool strand) {
47 this->strand = strand;
55 this->start = h.start;
57 this->strand = h.strand;
58 count[0] = h.count[0];
59 count[1] = h.count[1];
62 Hit& operator=(const Hit& h) {
65 this->start = h.start;
67 this->strand = h.strand;
68 count[0] = h.count[0];
69 count[1] = h.count[1];
74 bool Feat_lt_Read(const Hit& feat, const Read& read) {
75 //First compare based on chromosomes
76 if(read.chr == feat.chr) { return feat.end < read.pos; }
77 else { return feat.chr < read.chr; }
80 ostream &operator<<( ostream &out, const Hit &h ) {
81 out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count;
85 typedef vector<Hit> Hits;
86 typedef vector<Read> Reads;
88 bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
89 bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
91 void read_features(const char* filename, Hits& features) {
94 ifstream feat(filename);
96 while(feat.peek() != EOF) {
98 feat.getline(line,1024,'\n');
100 string line_str(line);
101 vector<string> fields;
102 split(line_str, delim, fields);
103 if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
105 string name = fields[0];
106 string chr = fields[1];
107 if(chr == "NA") { continue; }
108 int start = atoi(fields[2].c_str());
109 int stop = atoi(fields[3].c_str());
112 Hit feat(name,chr,start,stop,strand);
113 features.push_back(feat);
116 //sort the features so we can run through it once
117 std::stable_sort(features.begin(),features.end(),compare_hits);
121 void read_align_file(char* filename, Reads& features) {
123 string location_delim(":");
124 char strand_str[2]; strand_str[1] = '\0';
125 ifstream seqs(filename);
127 while(seqs.peek() != EOF) {
129 seqs.getline(line,2048,'\n');
131 string line_str(line);
132 vector<string> fields;
133 split(line_str, delim, fields);
134 if(fields.size() != 7) { continue; }
136 vector<string> location; split(fields[3], location_delim, location);
137 string chr = location[0];
138 if(chr == "NA") { continue; }
139 int pos = atoi(location[1].c_str());
140 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
143 Read read(chr,pos,false);
144 features.push_back(read);
146 Read read(chr,pos+25,true);
147 features.push_back(read);
152 //sort the data so we can run through it once
153 std::sort(features.begin(),features.end(),compare_reads);
156 void count_read_in_features(Hits& features, Reads& data, unsigned int idx) {
157 Hits::iterator feat_it = features.begin();
159 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
160 //skip to first feature after read
161 string start_chr = feat_it->chr;
162 while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) {
166 //stop if we have run out of features.
167 if(feat_it == features.end()) { break; }
169 if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
170 feat_it->count[idx]++;
175 int getdir (string dir, vector<string> &files)
179 if((dp = opendir(dir.c_str())) == NULL) {
180 cout << "Error(" << errno << ") opening " << dir << endl;
184 while ((dirp = readdir(dp)) != NULL) {
185 if(dirp->d_name[0] == '.') { continue; }
186 files.push_back(dir + "/" + string(dirp->d_name));
192 int compare_double(const void* a, const void* b) {
193 return *(double*)a - *(double*)b;
196 int main(int argc, char** argv) {
197 if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); }
199 char read_filename[1024]; strcpy(read_filename,argv[1]);
200 char bg_filename[1024]; strcpy(bg_filename,argv[2]);
201 string testdir(argv[3]);
202 Reads reads; read_align_file(read_filename, reads);
203 Reads bg; read_align_file(bg_filename, bg);
205 vector<string> tests;
206 getdir(testdir,tests);
208 for(unsigned int i = 0; i < tests.size(); i++) {
210 read_features(tests[i].c_str(), features);
211 count_read_in_features(features, reads,0);
212 count_read_in_features(features, bg,1);
213 double read_data[features.size()];
214 double bg_data[features.size()];
215 double readZ = reads.size()/1000000.0;
216 double bgZ = bg.size()/1000000.0;
217 for(unsigned int idx = 0; idx < features.size(); idx++) {
218 read_data[idx] = ((double)features[idx].count[0]/readZ);
219 bg_data[idx] = ((double)features[idx].count[1]/bgZ);
220 // cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl;
222 double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size());
223 //qsort(data, features.size(), sizeof(double), compare_double);
224 //double median = gsl_stats_median_from_sorted_data(data, 1, features.size());
225 //double var = gsl_stats_variance_m(data,1,features.size(),mean);
226 cout << tests[i].c_str() << "\t" << enrichment << endl;
230 void split (const string& text, const string& separators, vector<string>& words) {
232 size_t n = text.length ();
233 size_t start = text.find_first_not_of (separators);
236 size_t stop = text.find_first_of (separators, start);
237 if (stop > n) stop = n;
238 words.push_back (text.substr (start, stop-start));
239 start = text.find_first_not_of (separators, stop+1);