--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <queue>
+#include <math.h>
+#include <string>
+
+#include <gsl/gsl_statistics.h>
+
+#define WINDOW 25
+
+using namespace std;
+
+
+void int_to_chromstr(int chrom, char* chr);
+int chromstr_to_int(string chrom);
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+class Hit {
+ public:
+ int chr;
+ int start;
+ int end;
+ bool strand;
+
+ string name;
+
+ unsigned int count[2];
+
+ Hit(string name, int chr, int start, int end, bool strand) { this->name = name; this->chr = chr; this->start = start; this->end = end; this->strand = strand; count[0] = 0; count[1] = 0; }
+};
+
+ostream &operator<<( ostream &out, const Hit &h ) {
+ out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0] << "\t" << h.count[1];
+ return out;
+}
+
+typedef vector<Hit> Hits;
+
+bool compare_hits(const Hit &a, const Hit &b) {
+ if(a.chr == b.chr) {
+ return a.end < b.start;
+ } else {
+ return a.chr < b.chr;
+ }
+}
+
+void read_features(char* filename, Hits& features) {
+ string delim("\t");
+
+ ifstream feat(filename);
+ size_t N = 0;
+ while(feat.peek() != EOF) {
+ char line[1024];
+ feat.getline(line,1024,'\n');
+ N++;
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
+
+ string name = fields[0];
+ int chr = chromstr_to_int(fields[1]);
+ if(chr == -1) { continue; }
+ int start = atoi(fields[2].c_str());
+ int stop = atoi(fields[3].c_str());
+ bool strand = 0;
+
+ Hit feat(name,chr,start,stop,strand);
+ features.push_back(feat);
+ }
+
+ //sort the features so we can run through it once
+ std::stable_sort(features.begin(),features.end(),compare_hits);
+ feat.close();
+}
+
+void read_align_file(char* filename, Hits& features) {
+ string delim = " \n";
+ string location_delim = ":";
+ char strand_str[2]; strand_str[1] = '\0';
+ ifstream seqs(filename);
+ string name("");
+ while(seqs.peek() != EOF) {
+ char line[2048];
+ seqs.getline(line,2048,'\n');
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() == 3) { continue; }
+
+ vector<string> location; split(fields[3], location_delim, location);
+ int chr = chromstr_to_int(location[0]);
+ if(chr == -1) { continue; }
+ int pos = atoi(location[1].c_str());
+ bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+ if(strand == 0) {
+ Hit hit(name,chr,pos,pos,strand);
+ features.push_back(hit);
+ } else {
+ Hit hit(name,chr,pos+32,pos+32,strand);
+ features.push_back(hit);
+ }
+ }
+ seqs.close();
+
+ //sort the data so we can run through it once
+ std::sort(features.begin(),features.end(),compare_hits);
+}
+
+void print_features(Hits& hits, string filename, unsigned int size_1, unsigned int size_2) {
+ ofstream out(filename.c_str());
+ for(Hits::iterator i = hits.begin(); i != hits.end(); ++i) {
+ Hit h = (*i);
+ out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0]/(double)(size_1/1000000.0) << "\t" << h.count[1]/(double)(size_2/1000000.0) << "\t" << (double)log(((double)h.count[0]/(double)size_1)/(double)(h.count[1]/(double)size_2))/log(2) << endl;
+ }
+ out.close();
+}
+
+void count_read_in_features(Hits& features, Hits& data, unsigned int idx) {
+ int chrom = -1;
+ unsigned int feat_idx = 0;
+ int num_hits = 0;
+ for(Hits::iterator i = data.begin(); i != data.end(); ++i) {
+ num_hits++;
+ if(chrom == -1 || i->chr != chrom) {
+ chrom = i->chr;
+ feat_idx = 0;
+ while(feat_idx < features.size() && features[feat_idx].chr != chrom) { feat_idx++; }
+ if(feat_idx == features.size()) { break; }
+ }
+
+ double hit_size = (double)features[feat_idx].end - features[feat_idx].start;
+ double dist_to_feature = (double)(i->start - features[feat_idx].start) / hit_size;
+ //if we have passed the last feature, fast forward to the next
+ while( feat_idx < features.size() && dist_to_feature > 1 ) {
+ feat_idx++;
+ if(features[feat_idx].chr != i->chr ) { goto end_loop; }
+ hit_size = (double)features[feat_idx].end - features[feat_idx].start;
+ dist_to_feature = (double)(i->start - features[feat_idx].start) / hit_size;
+ }
+
+ if(features[feat_idx].strand == 1) {
+ dist_to_feature = 1.0 - dist_to_feature;
+ }
+
+ if(dist_to_feature >= 0 && dist_to_feature <= 1) {
+ features[feat_idx].count[idx]++;
+ }
+ end_loop:
+ ;
+ }
+}
+
+int main(int argc, char** argv) {
+ if(argc < 8) { cerr << "Usage: " << argv[0] << " TF label1 fearure1 reads1 label2 feature2 reads2\n"; exit(1); }
+
+ char TF[128]; strcpy(TF,argv[1]);
+
+ char label1[128]; strcpy(label1,argv[2]);
+ Hits features1; read_features(argv[3], features1);
+ Hits align1; read_align_file(argv[4], align1);
+
+ char label2[128]; strcpy(label2,argv[5]);
+ Hits features2; read_features(argv[6], features2);
+ Hits align2; read_align_file(argv[7], align2);
+
+ Hits feature_union;
+ Hits feature_intersect;
+ Hits feature_diff;
+
+ set_union(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_union),compare_hits);
+ set_intersection(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_intersect),compare_hits);
+ set_symmetric_difference(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_diff),compare_hits);
+
+ size_t u = feature_union.size();
+ size_t i = feature_intersect.size();
+ size_t d = feature_diff.size();
+
+ string str_1(label1);
+ string str_2(label2);
+
+ string union_out = str_1 + "_" + str_2 + ".union";
+ string intersect_out = str_1 + "_" + str_2 + ".intersect";
+ string diff_out = str_1 + "_" + str_2 + ".diff";
+
+ count_read_in_features(feature_union, align1,0);
+ count_read_in_features(feature_union, align2,1);
+ count_read_in_features(feature_intersect, align1,0);
+ count_read_in_features(feature_intersect, align2,1);
+ count_read_in_features(feature_diff, align1,0);
+ count_read_in_features(feature_diff, align2,1);
+
+ print_features( feature_union, union_out, align1.size(), align2.size());
+ print_features( feature_intersect, intersect_out, align1.size(), align2.size());
+ print_features( feature_diff, diff_out, align1.size(), align2.size());
+
+ double data1[feature_union.size()];
+ double data2[feature_union.size()];
+
+ for(unsigned int idx = 0; idx < feature_union.size(); idx++) {
+ data1[idx] = (double)feature_union[idx].count[0]/(double)align1.size();
+ data2[idx] = (double)feature_union[idx].count[1]/(double)align2.size();
+ }
+ double corr = gsl_stats_correlation(data1,1,data2,1,feature_union.size());
+
+ cout << TF << "\t" << label1 << "(" << features1.size() <<")\t" << label2 << "(" << features2.size() << ")\t" << u << "\t" << i << "\t" << d << "\t" << (double)i/(double)u << "\t" << corr << endl;
+}
+
+int chromstr_to_int(string chromstr) {
+ const char* chrom = chromstr.c_str();
+ if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; }
+ if(chrom[3] == 'X') { return 23; } else
+ if(chrom[3] == 'Y') { return 24; } else
+ if(chrom[3] == 'M') { return 25; } else
+ if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else
+ { return -1; }
+}
+
+void int_to_chromstr(int chrom, char* chr) {
+ strcpy(chr,"chr");
+ switch(chrom) {
+ case 23: chr[3]='X'; break;
+ case 24: chr[3]='Y'; break;
+ case 25: chr[3]='M'; break;
+ default: if(chrom < 10) { chr[3]=(char)(48+chrom); chr[4] = '\0'; }
+ else { chr[3]=(char)(48+(int)(chrom/10));
+ chr[4]=(char)(48+(int)(chrom - (int)(chr[3]-48)));
+ chr[5]='\0';
+ }
+ }
+}
+
+
+void split (const string& text, const string& separators, vector<string>& words) {
+
+ size_t n = text.length ();
+ size_t start = text.find_first_not_of (separators);
+
+ while (start < n) {
+ size_t stop = text.find_first_of (separators, start);
+ if (stop > n) stop = n;
+ words.push_back (text.substr (start, stop-start));
+ start = text.find_first_not_of (separators, stop+1);
+ }
+}