htswanalysis/src/ValidationDesign/qPCR.cpp

   1 #include <sys/types.h>
   2 #include <dirent.h>
   3 #include <errno.h>
   4 #include <iostream>
   5 #include <fstream>
   6 #include <vector>
   7 #include <map>
   8 #include <queue>
   9 #include <math.h>
  10 #include <string>
  11
  12 #include <gsl/gsl_statistics.h>
  13
  14 #define WINDOW 25
  15
  16 using namespace std;
  17
  18
  19 void int_to_chromstr(int chrom, char* chr);
  20 int chromstr_to_int(string chrom);
  21
  22 void split (const string& text, const string& separators, vector<string>& words);
  23
  24 class Read {
  25   public:
  26     string chr;
  27     int pos;
  28     bool strand;
  29
  30     Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
  31     Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
  32     Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
  33 };
  34
  35 class Hit {
  36   public:
  37     int start;
  38     int end;
  39     bool strand;
  40
  41     string chr;
  42     string name;
  43
  44     unsigned int count[2];
  45
  46     Hit(string name, string chr, int start, int end, bool strand) {
  47       this->name = name;
  48       this->chr = chr;
  49       this->start = start;
  50       this->end = end;
  51       this->strand = strand;
  52       count[0] = 0;
  53       count[1] = 0;
  54     }
  55
  56     Hit(const Hit& h) {
  57       this->name = h.name;
  58       this->chr = h.chr;
  59       this->start = h.start;
  60       this->end = h.end;
  61       this->strand = h.strand;
  62       count[0] = h.count[0];
  63       count[1] = h.count[1];
  64     }
  65
  66     Hit& operator=(const Hit& h) {
  67       this->name = h.name;
  68       this->chr = h.chr;
  69       this->start = h.start;
  70       this->end = h.end;
  71       this->strand = h.strand;
  72       count[0] = h.count[0];
  73       count[1] = h.count[1];
  74       return *this;
  75     }
  76 };
  77
  78 bool Feat_lt_Read(const Hit& feat, const Read& read) {
  79   //First compare based on chromosomes
  80   if(read.chr == feat.chr) { return feat.end < read.pos; }
  81   else { return feat.chr < read.chr; }
  82 }
  83
  84 ostream &operator<<( ostream &out, const Hit &h ) {
  85   out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count;
  86   return out;
  87 }
  88
  89 typedef vector<Hit> Hits;
  90 typedef vector<Read> Reads;
  91
  92 bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
  93 bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
  94
  95 void read_features(const char* filename, Hits& features) {
  96   string delim("\t");
  97
  98   ifstream feat(filename);
  99   size_t N = 0;
 100   while(feat.peek() != EOF) {
 101     char line[1024];
 102     feat.getline(line,1024,'\n');
 103     N++;
 104     string line_str(line);
 105     vector<string> fields;
 106     split(line_str, delim, fields);
 107     if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
 108
 109     string name = fields[0];
 110     string chr = fields[1];
 111     if(chr == "NA") { continue; }
 112     int start = atoi(fields[2].c_str());
 113     int stop = atoi(fields[3].c_str());
 114     bool strand = 0;
 115
 116     Hit feat(name,chr,start,stop,strand);
 117     features.push_back(feat);
 118   }
 119
 120   //sort the features so we can run through it once
 121   std::stable_sort(features.begin(),features.end(),compare_hits);
 122   feat.close();
 123 }
 124
 125 void read_align_file(char* filename, Reads& features) {
 126   string delim(" \n");
 127   string location_delim(":");
 128   char strand_str[2]; strand_str[1] = '\0';
 129   ifstream seqs(filename);
 130   string name("");
 131   while(seqs.peek() != EOF) {
 132     char line[2048];
 133     seqs.getline(line,2048,'\n');
 134
 135     string line_str(line);
 136     vector<string> fields;
 137     split(line_str, delim, fields);
 138     if(fields.size() != 7) { continue; }
 139
 140     vector<string> location; split(fields[3], location_delim, location);
 141     string chr = location[0];
 142     if(chr == "NA") { continue; }
 143     int pos = atoi(location[1].c_str());
 144     bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
 145
 146     if(strand == 0) {
 147       Read read(chr,pos,false);
 148       features.push_back(read);
 149     } else {
 150       Read read(chr,pos+25,true);
 151       features.push_back(read);
 152     }
 153   }
 154   seqs.close();
 155
 156   //sort the data so we can run through it once
 157   std::sort(features.begin(),features.end(),compare_reads);
 158   cerr << "Reads sorted\n";
 159 }
 160
 161 void count_read_in_features(Hits& features, Reads& data, unsigned int idx) {
 162   Hits::iterator feat_it = features.begin();
 163
 164   for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
 165     //skip to first feature after read
 166     string start_chr = feat_it->chr;
 167     while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) {
 168       feat_it++;
 169     }
 170
 171     //stop if we have run out of features.
 172     if(feat_it == features.end()) { break; }
 173
 174     if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
 175       feat_it->count[idx]++;
 176     }
 177   }
 178 }
 179
 180 int getdir (string dir, vector<string> &files)
 181 {
 182     DIR *dp;
 183     struct dirent *dirp;
 184     if((dp  = opendir(dir.c_str())) == NULL) {
 185         cout << "Error(" << errno << ") opening " << dir << endl;
 186         return errno;
 187     }
 188
 189     while ((dirp = readdir(dp)) != NULL) {
 190         if(dirp->d_name[0] == '.') { continue; }
 191         files.push_back(dir + "/" + string(dirp->d_name));
 192     }
 193     closedir(dp);
 194     return 0;
 195 }
 196
 197 int compare_double(const void* a, const void* b) {
 198   return *(double*)a - *(double*)b;
 199 }
 200
 201 int main(int argc, char** argv) {
 202   if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); }
 203
 204   char read_filename[1024]; strcpy(read_filename,argv[1]);
 205   char bg_filename[1024]; strcpy(bg_filename,argv[2]);
 206   string testdir(argv[3]);
 207   Reads reads; read_align_file(read_filename, reads);
 208   Reads bg; read_align_file(bg_filename, bg);
 209
 210   vector<string> tests;
 211   getdir(testdir,tests);
 212
 213   for(unsigned int i = 0; i < tests.size(); i++) {
 214     Hits features;
 215     read_features(tests[i].c_str(), features);
 216     count_read_in_features(features, reads,0);
 217     count_read_in_features(features, bg,1);
 218     double read_data[features.size()];
 219     double bg_data[features.size()];
 220     //double readZ = reads.size()/1000000.0;
 221     //double bgZ = bg.size()/1000000.0;
 222
 223     // temporary hack to get raw read counts
 224     double readZ = 1.0; double bgZ = 1.0;
 225     for(unsigned int idx = 0; idx < features.size(); idx++) {
 226       read_data[idx] = ((double)features[idx].count[0]/readZ);
 227       bg_data[idx] = ((double)features[idx].count[1]/bgZ);
 228       cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl;
 229     }
 230     double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size());
 231     //qsort(data, features.size(), sizeof(double), compare_double);
 232     //double median = gsl_stats_median_from_sorted_data(data, 1, features.size());
 233     //double var = gsl_stats_variance_m(data,1,features.size(),mean);
 234     cout << tests[i].c_str() << "\t" << enrichment << endl;
 235   }
 236 }
 237
 238 void split (const string& text, const string& separators, vector<string>& words) {
 239
 240     size_t n     = text.length ();
 241     size_t start = text.find_first_not_of (separators);
 242
 243     while (start < n) {
 244         size_t stop = text.find_first_of (separators, start);
 245         if (stop > n) stop = n;
 246         words.push_back (text.substr (start, stop-start));
 247         start = text.find_first_not_of (separators, stop+1);
 248     }
 249 }