12 #include <gsl/gsl_statistics.h>
19 void int_to_chromstr(int chrom, char* chr);
20 int chromstr_to_int(string chrom);
22 void split (const string& text, const string& separators, vector<string>& words);
30 Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
31 Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
32 Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
44 unsigned int count[2];
46 Hit(string name, string chr, int start, int end, bool strand) {
51 this->strand = strand;
59 this->start = h.start;
61 this->strand = h.strand;
62 count[0] = h.count[0];
63 count[1] = h.count[1];
66 Hit& operator=(const Hit& h) {
69 this->start = h.start;
71 this->strand = h.strand;
72 count[0] = h.count[0];
73 count[1] = h.count[1];
78 bool Feat_lt_Read(const Hit& feat, const Read& read) {
79 //First compare based on chromosomes
80 if(read.chr == feat.chr) { return feat.end < read.pos; }
81 else { return feat.chr < read.chr; }
84 ostream &operator<<( ostream &out, const Hit &h ) {
85 out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count;
89 typedef vector<Hit> Hits;
90 typedef vector<Read> Reads;
92 bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
93 bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
95 void read_features(const char* filename, Hits& features) {
98 ifstream feat(filename);
100 while(feat.peek() != EOF) {
102 feat.getline(line,1024,'\n');
104 string line_str(line);
105 vector<string> fields;
106 split(line_str, delim, fields);
107 if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
109 string name = fields[0];
110 string chr = fields[1];
111 if(chr == "NA") { continue; }
112 int start = atoi(fields[2].c_str());
113 int stop = atoi(fields[3].c_str());
116 Hit feat(name,chr,start,stop,strand);
117 features.push_back(feat);
120 //sort the features so we can run through it once
121 std::stable_sort(features.begin(),features.end(),compare_hits);
125 void read_align_file(char* filename, Reads& features) {
127 string location_delim(":");
128 char strand_str[2]; strand_str[1] = '\0';
129 ifstream seqs(filename);
131 while(seqs.peek() != EOF) {
133 seqs.getline(line,2048,'\n');
135 string line_str(line);
136 vector<string> fields;
137 split(line_str, delim, fields);
138 if(fields.size() != 7) { continue; }
140 vector<string> location; split(fields[3], location_delim, location);
141 string chr = location[0];
142 if(chr == "NA") { continue; }
143 int pos = atoi(location[1].c_str());
144 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
147 Read read(chr,pos,false);
148 features.push_back(read);
150 Read read(chr,pos+25,true);
151 features.push_back(read);
156 //sort the data so we can run through it once
157 std::sort(features.begin(),features.end(),compare_reads);
160 void count_read_in_features(Hits& features, Reads& data, unsigned int idx) {
161 Hits::iterator feat_it = features.begin();
163 for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
164 //skip to first feature after read
165 string start_chr = feat_it->chr;
166 while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) {
170 //stop if we have run out of features.
171 if(feat_it == features.end()) { break; }
173 if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
174 feat_it->count[idx]++;
179 int getdir (string dir, vector<string> &files)
183 if((dp = opendir(dir.c_str())) == NULL) {
184 cout << "Error(" << errno << ") opening " << dir << endl;
188 while ((dirp = readdir(dp)) != NULL) {
189 if(dirp->d_name[0] == '.') { continue; }
190 files.push_back(dir + "/" + string(dirp->d_name));
196 int compare_double(const void* a, const void* b) {
197 return *(double*)a - *(double*)b;
200 int main(int argc, char** argv) {
201 if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); }
203 char read_filename[1024]; strcpy(read_filename,argv[1]);
204 char bg_filename[1024]; strcpy(bg_filename,argv[2]);
205 string testdir(argv[3]);
206 Reads reads; read_align_file(read_filename, reads);
207 Reads bg; read_align_file(bg_filename, bg);
209 vector<string> tests;
210 getdir(testdir,tests);
212 for(unsigned int i = 0; i < tests.size(); i++) {
214 read_features(tests[i].c_str(), features);
215 count_read_in_features(features, reads,0);
216 count_read_in_features(features, bg,1);
217 double read_data[features.size()];
218 double bg_data[features.size()];
219 double readZ = reads.size()/1000000.0;
220 double bgZ = bg.size()/1000000.0;
221 for(unsigned int idx = 0; idx < features.size(); idx++) {
222 read_data[idx] = ((double)features[idx].count[0]/readZ);
223 bg_data[idx] = ((double)features[idx].count[1]/bgZ);
224 // cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl;
226 double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size());
227 //qsort(data, features.size(), sizeof(double), compare_double);
228 //double median = gsl_stats_median_from_sorted_data(data, 1, features.size());
229 //double var = gsl_stats_variance_m(data,1,features.size(),mean);
230 cout << tests[i].c_str() << "\t" << enrichment << endl;
234 void split (const string& text, const string& separators, vector<string>& words) {
236 size_t n = text.length ();
237 size_t start = text.find_first_not_of (separators);
240 size_t stop = text.find_first_of (separators, start);
241 if (stop > n) stop = n;
242 words.push_back (text.substr (start, stop-start));
243 start = text.find_first_not_of (separators, stop+1);