Added a program to help compare libraries

author Tim Reddy Tim <treddy@hudsonalpha.org>

Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)

committer Tim Reddy Tim <treddy@hudsonalpha.org>

Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)
author Tim Reddy Tim <treddy@hudsonalpha.org>
Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)
committer Tim Reddy Tim <treddy@hudsonalpha.org>
Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)
diff --git a/htswanalysis/src/Makefile b/htswanalysis/src/Makefile

index b52b347759bdfb0b8498be50218c72f75ef4eaa9..73c099a0f1bffbf3151e4055db65a7b312a1c4a1 100644 (file)
--- a/htswanalysis/src/Makefile
+++ b/htswanalysis/src/Makefile
@@ -1,7 +1,7 @@
  CPPFLAGS=-g -Wall -O3 -I/opt/local/include
  LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib
  
-TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig
+TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks
  
  all: $(TARGETS)
  
diff --git a/htswanalysis/src/count_reads_in_peaks.cpp b/htswanalysis/src/count_reads_in_peaks.cpp

new file mode 100755 (executable)

index 0000000..67a78cf
--- /dev/null
+++ b/htswanalysis/src/count_reads_in_peaks.cpp
@@ -0,0 +1,249 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <queue>
+#include <math.h>
+#include <string>
+
+#include <gsl/gsl_statistics.h>
+
+#define WINDOW 25
+
+using namespace std;
+
+
+void int_to_chromstr(int chrom, char* chr);
+int chromstr_to_int(string chrom);
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+class Hit {
+  public:
+    int chr;
+    int start;
+    int end;
+    bool strand;
+
+    string name;
+
+    unsigned int count[2];
+
+    Hit(string name, int chr, int start, int end, bool strand) { this->name = name; this->chr = chr; this->start = start; this->end = end; this->strand = strand; count[0] = 0; count[1] = 0; }
+};
+
+ostream &operator<<( ostream &out, const Hit &h ) {
+  out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0] << "\t" << h.count[1];
+  return out;
+}
+
+typedef vector<Hit> Hits;
+
+bool compare_hits(const Hit &a, const Hit &b) { 
+  if(a.chr == b.chr) { 
+    return a.end < b.start;
+  } else {
+    return a.chr < b.chr; 
+  } 
+}
+
+void read_features(char* filename, Hits& features) {
+  string delim("\t");
+
+  ifstream feat(filename);
+  size_t N = 0;
+  while(feat.peek() != EOF) {
+    char line[1024];
+    feat.getline(line,1024,'\n');
+    N++;
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
+
+    string name = fields[0];
+    int chr = chromstr_to_int(fields[1]);
+    if(chr == -1) { continue; }
+    int start = atoi(fields[2].c_str());
+    int stop = atoi(fields[3].c_str());
+    bool strand = 0;
+
+    Hit feat(name,chr,start,stop,strand);
+    features.push_back(feat);
+  } 
+
+  //sort the features so we can run through it once
+  std::stable_sort(features.begin(),features.end(),compare_hits);
+  feat.close();
+}
+
+void read_align_file(char* filename, Hits& features) {
+  string delim = " \n";
+  string location_delim = ":";
+  char strand_str[2]; strand_str[1] = '\0';
+  ifstream seqs(filename);
+  string name("");
+  while(seqs.peek() != EOF) {
+    char line[2048];
+    seqs.getline(line,2048,'\n');
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() == 3) { continue; }
+ 
+    vector<string> location; split(fields[3], location_delim, location);
+    int chr = chromstr_to_int(location[0]);
+    if(chr == -1) { continue; }
+    int pos = atoi(location[1].c_str());
+    bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+    if(strand == 0) {
+      Hit hit(name,chr,pos,pos,strand); 
+      features.push_back(hit);
+    } else {
+      Hit hit(name,chr,pos+32,pos+32,strand); 
+      features.push_back(hit);
+    }
+  }
+  seqs.close(); 
+
+  //sort the data so we can run through it once
+  std::sort(features.begin(),features.end(),compare_hits);
+}
+
+void print_features(Hits& hits, string filename, unsigned int size_1, unsigned int size_2) {
+  ofstream out(filename.c_str());
+  for(Hits::iterator i = hits.begin(); i != hits.end(); ++i) {
+    Hit h = (*i);
+    out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0]/(double)(size_1/1000000.0) << "\t" << h.count[1]/(double)(size_2/1000000.0) << "\t" << (double)log(((double)h.count[0]/(double)size_1)/(double)(h.count[1]/(double)size_2))/log(2) << endl;
+  }
+  out.close();
+}
+
+void count_read_in_features(Hits& features, Hits& data, unsigned int idx) {
+  int chrom = -1; 
+  unsigned int feat_idx = 0;
+  int num_hits = 0;
+  for(Hits::iterator i = data.begin(); i != data.end(); ++i) {
+    num_hits++;
+    if(chrom == -1 || i->chr != chrom) {
+      chrom = i->chr;
+      feat_idx = 0;
+      while(feat_idx < features.size() && features[feat_idx].chr != chrom) { feat_idx++; }
+      if(feat_idx == features.size()) { break; }
+    }
+
+    double hit_size = (double)features[feat_idx].end - features[feat_idx].start;
+    double dist_to_feature = (double)(i->start - features[feat_idx].start) / hit_size; 
+    //if we have passed the last feature, fast forward to the next
+    while( feat_idx < features.size() && dist_to_feature > 1 ) {
+      feat_idx++; 
+      if(features[feat_idx].chr != i->chr ) { goto end_loop; }
+      hit_size = (double)features[feat_idx].end - features[feat_idx].start;
+      dist_to_feature = (double)(i->start - features[feat_idx].start) / hit_size; 
+    }
+
+    if(features[feat_idx].strand == 1) { 
+      dist_to_feature = 1.0 - dist_to_feature;
+    } 
+
+    if(dist_to_feature >= 0 && dist_to_feature <= 1) { 
+      features[feat_idx].count[idx]++;
+    }
+  end_loop:
+  ;
+  }
+}
+
+int main(int argc, char** argv) {
+  if(argc < 8) { cerr << "Usage: " << argv[0] << " TF label1 fearure1 reads1 label2 feature2 reads2\n"; exit(1); }
+
+  char TF[128]; strcpy(TF,argv[1]);
+
+  char label1[128]; strcpy(label1,argv[2]);
+  Hits features1; read_features(argv[3], features1); 
+  Hits align1; read_align_file(argv[4], align1); 
+
+  char label2[128]; strcpy(label2,argv[5]);
+  Hits features2; read_features(argv[6], features2);
+  Hits align2; read_align_file(argv[7], align2); 
+
+  Hits feature_union;
+  Hits feature_intersect;
+  Hits feature_diff;
+
+  set_union(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_union),compare_hits);
+  set_intersection(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_intersect),compare_hits);
+  set_symmetric_difference(features1.begin(), features1.end(), features2.begin(), features2.end(), back_inserter(feature_diff),compare_hits);
+
+  size_t u = feature_union.size();
+  size_t i = feature_intersect.size();
+  size_t d = feature_diff.size();
+
+  string str_1(label1);
+  string str_2(label2);
+ 
+  string union_out = str_1 + "_" + str_2 + ".union";
+  string intersect_out = str_1 + "_" + str_2 + ".intersect";
+  string diff_out = str_1 + "_" + str_2 + ".diff";
+
+  count_read_in_features(feature_union, align1,0);
+  count_read_in_features(feature_union, align2,1);
+  count_read_in_features(feature_intersect, align1,0);
+  count_read_in_features(feature_intersect, align2,1);
+  count_read_in_features(feature_diff, align1,0);
+  count_read_in_features(feature_diff, align2,1);
+
+  print_features( feature_union,     union_out,     align1.size(), align2.size());
+  print_features( feature_intersect, intersect_out, align1.size(), align2.size());
+  print_features( feature_diff,  diff_out,    align1.size(), align2.size());
+
+  double data1[feature_union.size()];
+  double data2[feature_union.size()];
+
+  for(unsigned int idx = 0; idx < feature_union.size(); idx++) {
+    data1[idx] = (double)feature_union[idx].count[0]/(double)align1.size();
+    data2[idx] = (double)feature_union[idx].count[1]/(double)align2.size();
+  }
+  double corr = gsl_stats_correlation(data1,1,data2,1,feature_union.size());
+
+  cout << TF << "\t" << label1  << "(" << features1.size() <<")\t" << label2 << "(" << features2.size() << ")\t" << u << "\t" << i << "\t" << d << "\t" << (double)i/(double)u << "\t" << corr << endl;
+}
+
+int chromstr_to_int(string chromstr) {
+  const char* chrom = chromstr.c_str();
+  if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; }
+  if(chrom[3] == 'X') { return 23; } else
+  if(chrom[3] == 'Y') { return 24; } else
+  if(chrom[3] == 'M') { return 25; } else
+  if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else 
+  { return -1; } 
+}
+
+void int_to_chromstr(int chrom, char* chr) {
+  strcpy(chr,"chr");
+  switch(chrom) {
+    case 23: chr[3]='X'; break;
+    case 24: chr[3]='Y'; break;
+    case 25: chr[3]='M'; break;
+    default: if(chrom < 10) { chr[3]=(char)(48+chrom); chr[4] = '\0'; }
+             else { chr[3]=(char)(48+(int)(chrom/10));
+                    chr[4]=(char)(48+(int)(chrom - (int)(chr[3]-48)));
+                    chr[5]='\0';
+                  }
+  }
+}
+  
+
+void split (const string& text, const string& separators, vector<string>& words) {
+
+    size_t n     = text.length ();
+    size_t start = text.find_first_not_of (separators);
+
+    while (start < n) {
+        size_t stop = text.find_first_of (separators, start);
+        if (stop > n) stop = n;
+        words.push_back (text.substr (start, stop-start));
+        start = text.find_first_not_of (separators, stop+1);
+    }
+}
author	Tim Reddy Tim <treddy@hudsonalpha.org>
	Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)
committer	Tim Reddy Tim <treddy@hudsonalpha.org>
	Thu, 28 Aug 2008 16:46:59 +0000 (16:46 +0000)
htswanalysis/src/Makefile		patch \| blob \| history
htswanalysis/src/count_reads_in_peaks.cpp	[new file with mode: 0755]	patch \| blob