Added qPCR Validation design code
authorTim Reddy Tim <treddy@hudsonalpha.org>
Tue, 25 Nov 2008 01:23:20 +0000 (01:23 +0000)
committerTim Reddy Tim <treddy@hudsonalpha.org>
Tue, 25 Nov 2008 01:23:20 +0000 (01:23 +0000)
17 files changed:
htswanalysis/src/Makefile
htswanalysis/src/ValidationDesign/Makefile [new file with mode: 0644]
htswanalysis/src/ValidationDesign/Read.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/Read.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/Window.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/chrom_list.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/chrom_list.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/defs.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/main.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/peak.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/peak.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/qPCR.cpp [new file with mode: 0755]
htswanalysis/src/ValidationDesign/refine_peaks.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/util.cpp [new file with mode: 0644]
htswanalysis/src/ValidationDesign/util.h [new file with mode: 0644]
htswanalysis/src/ValidationDesign/window.cpp [new file with mode: 0644]
htswanalysis/src/profile_reads_against_features.cpp

index 9c8ca297aa48e10d4c1835e5d05f531650ee4cf4..35df627a90754f89088707496727c71b11321ebe 100644 (file)
@@ -1,10 +1,13 @@
 CPPFLAGS=-g -Wall -O3 -I/opt/local/include
 LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib
 
-TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks
+TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks ValidationDesign/refine_peaks
 
 all: $(TARGETS)
 
+ValidationDesign/refine_peaks:
+       $(MAKE) -C ValidationDesign
+
 install: $(TARGETS)
        cp $^ ../bin/
 
diff --git a/htswanalysis/src/ValidationDesign/Makefile b/htswanalysis/src/ValidationDesign/Makefile
new file mode 100644 (file)
index 0000000..f0a184c
--- /dev/null
@@ -0,0 +1,5 @@
+%.o: %.cpp
+       g++ -g -Wall -O3 $< -c -o $@
+
+refine_peaks: refine_peaks.cpp window.o util.o Read.o chrom_list.o
+       g++ -g -Wall -O3 $^ -o $@
diff --git a/htswanalysis/src/ValidationDesign/Read.cpp b/htswanalysis/src/ValidationDesign/Read.cpp
new file mode 100644 (file)
index 0000000..b76c5f8
--- /dev/null
@@ -0,0 +1,5 @@
+#include <string>
+
+#include "Read.h"
+
+bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
diff --git a/htswanalysis/src/ValidationDesign/Read.h b/htswanalysis/src/ValidationDesign/Read.h
new file mode 100644 (file)
index 0000000..5155872
--- /dev/null
@@ -0,0 +1,22 @@
+#include <vector>
+#include <string>
+
+using namespace std;
+
+#ifndef READ_H
+#define READ_H
+
+class Read {
+  public:
+    string chr;
+    unsigned int pos;
+    bool strand;
+
+    Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
+    Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
+    Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
+};
+
+typedef vector<Read> Reads;
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/Window.h b/htswanalysis/src/ValidationDesign/Window.h
new file mode 100644 (file)
index 0000000..9be5c61
--- /dev/null
@@ -0,0 +1,35 @@
+#include <iostream>
+#include <vector>
+
+#ifndef WINDOW_H
+#define WINDOW_H
+
+using namespace std;
+
+class Window {
+  public:
+    string chr;
+    string name;
+    unsigned int start;
+    unsigned int end;
+    double score;
+
+    string sequence;
+
+    vector<unsigned int> counts;
+    unsigned int total;
+
+    void count_hits();
+    void reset(unsigned int new_start, unsigned int new_end);
+
+    Window(string name, string chr, int start, int end, double score);
+    Window(string line);
+    Window(const Window& r);
+    Window& operator=(const Window& r);
+
+    bool operator<(const Window& b) const;
+};
+
+typedef vector<Window> Windows;
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/chrom_list.cpp b/htswanalysis/src/ValidationDesign/chrom_list.cpp
new file mode 100644 (file)
index 0000000..a3fd8a0
--- /dev/null
@@ -0,0 +1,22 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "chrom_list.h"
+#include "util.h"
+
+using namespace std;
+
+ChromList::ChromList(char* filename) {
+  char buffer[1024]; string delim("\t");
+  ifstream chrom_file(filename);
+  while(chrom_file.getline(buffer, 1024)) {
+    string temp(buffer); 
+    vector<string> words;
+    split(temp, delim,words);
+    if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); }
+    (*this)[ words[0] ] = words[1]; 
+  }
+  chrom_file.close();
+}
diff --git a/htswanalysis/src/ValidationDesign/chrom_list.h b/htswanalysis/src/ValidationDesign/chrom_list.h
new file mode 100644 (file)
index 0000000..6e5446b
--- /dev/null
@@ -0,0 +1,27 @@
+#include <ext/hash_map>
+
+#ifndef CHROMLIST_H
+#define CHROMLIST_H
+
+using namespace std;
+using namespace __gnu_cxx;
+
+struct eq_str {
+  bool operator() (const string& a, const string& b) const {
+    return(strcmp(a.c_str(), b.c_str()) == 0);
+  }
+};
+
+struct hash_str {
+  size_t operator() (const string& a) const {
+    hash<const char*> h; 
+    return(h(a.c_str())); 
+  }
+};
+
+class ChromList : public hash_map<string, string, hash_str, eq_str> {
+  public:
+    ChromList(char* filename);
+};
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/defs.h b/htswanalysis/src/ValidationDesign/defs.h
new file mode 100644 (file)
index 0000000..e2be395
--- /dev/null
@@ -0,0 +1 @@
+#define PRIMER_3_PATH "/Library/WebServer/CGI-Executables/ValidationDesign/primer3_core"
diff --git a/htswanalysis/src/ValidationDesign/main.cpp b/htswanalysis/src/ValidationDesign/main.cpp
new file mode 100644 (file)
index 0000000..5a6d1cb
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ *  main.cpp
+ *  WindowLocator
+ *
+ *  Created by Alan You on 8/11/08.
+ *  Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <cmath>
+
+#include <ext/hash_map>
+
+#include "window.h"
+#include "util.h"
+
+using namespace std;
+using namespace __gnu_cxx;
+
+struct eq_str {
+  bool operator() (const string& a, const string& b) const {
+    return(strcmp(a.c_str(), b.c_str()) == 0);
+  }
+};
+
+struct hash_str {
+  size_t operator() (const string& a) const {
+    hash<const char*> h; 
+    return(h(a.c_str())); 
+  }
+};
+
+typedef hash_map<string, string, hash_str, eq_str> ChromList;
+
+void importWindows(string& filename, string& peakDataTitle, vector<Window>& peaks);
+void retrieveSequenceData(ChromList chrom_files, vector<Window>& peaks);
+void outputSequenceData(string peakDataTitle, vector<Window>& peaks);
+void outputPrimer3Input(vector<Window>& peaks);
+
+int main (int argc, char * const argv[])
+{
+        if(argc < 3) { 
+          cout << "Usage: " << argv[0] << " bed_file_of_peaks chromosome_file" << endl;
+          exit(1);
+        }
+
+        string filename(argv[1]);
+        string chrom_list(argv[2]);
+
+       pid_t pid;
+       int rv;
+       int input_pipe[2];
+       setvbuf(stdout, (char*)NULL, _IONBF, 0);
+       if (pipe(input_pipe)) {
+         cout << "Pipe error!" << endl;
+          exit(1);
+       }
+       
+       if ((pid = fork()) == -1) {
+               cout << "Fork error!" << endl;
+               exit(1);
+       }
+       
+        //child 
+       if (pid == 0) {
+         dup2(input_pipe[0], 0); close(input_pipe[1]);
+          if (execlp("primer3_core",  "primer3_core", NULL) == -1){
+            cerr << "execl error!" << endl;
+            exit(1);
+          }
+          cerr << "Done with primer3\n";
+       } else {
+         ChromList chromosome_files;
+         char buffer[1024]; string delim("\t");
+         ifstream chrom_file(chrom_list.c_str());
+         while(chrom_file.getline(buffer, 1024)) {
+           string temp(buffer); 
+           vector<string> words;
+           split(temp, delim,words);
+           if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); }
+           chromosome_files[ words[0] ] = words[1]; 
+         }
+         chrom_file.close();
+       // ChIP-seq peak data (BED format required)
+       ifstream peakDataFile;
+       
+       // Title line from ChIP-seq peak data
+       string peakDataTitle;
+       
+       // All peak calls.  Each peak call contains chromomosme number, starting base pair, post-ending base
+       // pair, peak length, score, and corresponding sequence.
+       vector<Window> peaks;
+       
+       dup2(input_pipe[1], 1); close(input_pipe[0]);
+       importWindows(filename, peakDataTitle, peaks);
+       retrieveSequenceData(chromosome_files, peaks);
+       outputPrimer3Input(peaks);
+        close(input_pipe[1]);
+        close(1);
+       wait(&rv);
+      }
+
+       return 0;
+}
+
+void importWindows(string& filename, string& peakDataTitle, vector<Window>& peaks)
+{
+        ifstream peakDataFile;
+       peakDataFile.open(filename.c_str());
+
+        char temp[1024];
+
+       peakDataFile.getline(temp, 1024);
+        peakDataTitle = temp;
+
+       while(peakDataFile.getline(temp, 1024)) {
+          if(peakDataFile.eof()) { peakDataFile.close(); break; }
+          Window inputCall(temp);
+         peaks.push_back(inputCall);
+       }       
+       
+        sort(peaks.begin(), peaks.end());
+
+}
+
+void retrieveSequenceData(ChromList chrom_filenames, vector<Window>& peaks)
+{
+        char temp[1024];
+
+       string chrom = peaks[0].chr;
+        string chrom_filename = chrom_filenames[chrom];
+        ifstream chrom_file(chrom_filename.c_str());
+       chrom_file.getline(temp, 1024);
+        size_t offset = chrom_file.gcount();
+        for(vector<Window>::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+          if(i->chr != chrom) { 
+            chrom = i->chr; 
+            chrom_filename = chrom_filenames[chrom];
+            chrom_file.close(); chrom_file.open(chrom_filename.c_str());
+           chrom_file.getline(temp, 1024);
+            offset = chrom_file.gcount();
+          }
+          unsigned int begin = i->start;
+          unsigned int end   = i->end;
+          unsigned int begin_pos = offset + (int)begin/50 + begin;      
+          unsigned int end_pos = offset + (int)end/50 + end;      
+
+          unsigned int read_len = end_pos - begin_pos;
+          char buffer[read_len+1];
+         chrom_file.seekg(begin_pos, ios_base::beg);
+         chrom_file.read(buffer, read_len+1);
+          buffer[read_len] = '\0';
+         i->sequence = buffer;
+          unsigned int a;
+          while( (a= ((i->sequence).find("\n"))) != string::npos) {
+            i->sequence.erase(a,1);
+          }
+        } 
+        chrom_file.close(); 
+}
+
+void outputPrimer3Input(vector<Window>& peaks)
+{
+  cout <<   "PRIMER_NUM_RETURN=1" << endl;
+  cout <<   "PRIMER_LOWERCASE_MASKING=1" << endl;
+  //cout <<   "PRIMER_TM_SANTALUCIA=1" << endl;
+  cout <<   "PRIMER_SELF_END=2" << endl;
+  cout <<   "PRIMER_SELF_ANY=6" << endl;
+  cout <<   "PRIMER_OPT_SIZE=23" << endl;
+  cout <<   "PRIMER_MIN_SIZE=19" << endl;
+  cout <<   "PRIMER_MAX_SIZE=26" << endl;
+  cout <<   "PRIMER_MIN_TM=63" << endl;
+  cout <<   "PRIMER_OPT_TM=65" << endl;
+  cout <<   "PRIMER_MAX_TM=67" << endl;
+  cout <<   "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl;
+                 
+  unsigned int idx = 0;
+  for(vector<Window>::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+    //cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << idx++ << "_" << i->score << endl;
+    cout << "PRIMER_SEQUENCE_ID=" << i->name << i->score << endl;
+    cout << "SEQUENCE=" << i->sequence.c_str() << endl;
+    cout << "=" << endl;
+  }
+}
diff --git a/htswanalysis/src/ValidationDesign/peak.cpp b/htswanalysis/src/ValidationDesign/peak.cpp
new file mode 100644 (file)
index 0000000..239ce19
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ *  peak.cpp
+ *  PeakLocator
+ *
+ *  Created by Alan You on 8/11/08.
+ *  Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+
+#include <vector>
+#include <string>
+
+#include "peak.h"
+#include "util.h"
+
+using namespace std;
+
+Peak::Peak()
+{
+       this->startBP = 0;
+       this->endBP = 0;
+       this->length = 0;
+       this->chrom = "";
+       this->score = 0;        
+       this->sequence = "";
+}
+
+Peak::Peak(string line)
+{
+       istringstream outStream;
+       
+        vector<string> fields;
+        string delim("\t");
+        split(line,delim,fields);
+
+        this->chrom = fields[0];
+        this->startBP = atoi(fields[1].c_str());
+        this->endBP = atoi(fields[2].c_str());
+        this->score = strtod(fields[3].c_str(), NULL);
+       this->length = this->endBP - this->startBP;
+       this->sequence = "";
+}
+
+Peak::Peak(const Peak& p)
+{
+       this->startBP = p.startBP;
+       this->endBP = p.endBP;
+       this->length = p.length;
+       this->chrom = p.chrom;
+       this->score = p.score;
+       this->sequence = p.sequence;
+}
+
+Peak::~Peak()
+{
+;
+}
+
+void Peak::print()
+{
+       cout << "Chromosome:\t\t" << this->chrom.c_str() << endl;
+       cout << "Starting Base Pair:\t" << this->startBP << endl;
+       cout << "Ending Base Pair:\t" << this->endBP << endl;
+       cout << "Sequence Length:\t" << this->length << endl;
+       cout << "Sequence Score:\t\t" << this->score << endl;
+       cout << "Sequence:\t\t" << this->sequence << endl;
+}
+
+void Peak::printToFile(ofstream &outputFile)
+{
+       outputFile << "Chromosome:\t\t" << this->chrom.c_str() << endl;
+       outputFile << "Starting Base Pair:\t" << this->startBP << endl;
+       outputFile << "Ending Base Pair:\t" << this->endBP << endl;
+       outputFile << "Sequence Length:\t" << this->length << endl;
+       outputFile << "Sequence Score:\t\t" << this->score << endl;
+       outputFile << "Sequence:\t\t" << this->sequence << endl;
+       outputFile << endl;
+}
+
+bool Peak::operator<(const Peak& that) const
+{
+       if (this->chrom == that.chrom) { return(this->startBP < that.startBP);
+       } else { return(this->chrom < that.chrom); }
+}
+
+bool Peak::operator>(const Peak& that) const
+{
+       if (this->chrom == that.chrom) { return(this->startBP > that.startBP);
+       } else { return(this->chrom > that.chrom); }
+       
+}
+
+Peak& Peak:: operator=(const Peak& that)
+{
+       if (this != &that)
+       {
+               this->chrom = that.chrom;
+               this->startBP = that.startBP;
+               this->endBP = that.endBP;
+               this->length = that.length;
+               this->score = that.score;
+               this->sequence = that.sequence;
+       }
+       
+       return *this;
+}
diff --git a/htswanalysis/src/ValidationDesign/peak.h b/htswanalysis/src/ValidationDesign/peak.h
new file mode 100644 (file)
index 0000000..7238de7
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+ *  peak.h
+ *  PeakLocator
+ *
+ *  Created by Alan You on 8/11/08.
+ *  Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+#ifndef PEAK_H
+#define PEAK_H
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <memory.h>
+
+using namespace std;
+
+class Peak
+{
+public:
+       Peak();
+       Peak(string line);
+       Peak(const Peak& p);
+       ~Peak();
+       
+       bool operator<(const Peak& that) const;
+       bool operator>(const Peak& that) const;
+       Peak& operator=(const Peak& that);
+};
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/qPCR.cpp b/htswanalysis/src/ValidationDesign/qPCR.cpp
new file mode 100755 (executable)
index 0000000..00f12ae
--- /dev/null
@@ -0,0 +1,249 @@
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <queue>
+#include <math.h>
+#include <string>
+
+#include <gsl/gsl_statistics.h>
+
+#define WINDOW 25
+
+using namespace std;
+
+
+void int_to_chromstr(int chrom, char* chr);
+int chromstr_to_int(string chrom);
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+class Read {
+  public:
+    string chr;
+    int pos;
+    bool strand;
+
+    Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
+    Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
+    Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
+};
+
+class Hit {
+  public:
+    int start;
+    int end;
+    bool strand;
+
+    string chr;
+    string name;
+
+    unsigned int count[2];
+
+    Hit(string name, string chr, int start, int end, bool strand) { 
+      this->name = name; 
+      this->chr = chr; 
+      this->start = start; 
+      this->end = end; 
+      this->strand = strand; 
+      count[0] = 0; 
+      count[1] = 0; 
+    }
+
+    Hit(const Hit& h) {
+      this->name = h.name; 
+      this->chr = h.chr; 
+      this->start = h.start; 
+      this->end = h.end; 
+      this->strand = h.strand; 
+      count[0] = h.count[0];
+      count[1] = h.count[1]; 
+    }
+
+    Hit& operator=(const Hit& h) {
+      this->name = h.name; 
+      this->chr = h.chr; 
+      this->start = h.start; 
+      this->end = h.end; 
+      this->strand = h.strand; 
+      count[0] = h.count[0];
+      count[1] = h.count[1]; 
+      return *this;
+    }
+};
+
+bool Feat_lt_Read(const Hit& feat, const Read& read) {
+  //First compare based on chromosomes
+  if(read.chr == feat.chr) { return feat.end < read.pos; }
+  else { return feat.chr < read.chr; }
+}
+
+ostream &operator<<( ostream &out, const Hit &h ) {
+  out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count;
+  return out;
+}
+
+typedef vector<Hit> Hits;
+typedef vector<Read> Reads;
+
+bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
+bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
+
+void read_features(const char* filename, Hits& features) {
+  string delim("\t");
+
+  ifstream feat(filename);
+  size_t N = 0;
+  while(feat.peek() != EOF) {
+    char line[1024];
+    feat.getline(line,1024,'\n');
+    N++;
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
+
+    string name = fields[0];
+    string chr = fields[1];
+    if(chr == "NA") { continue; }
+    int start = atoi(fields[2].c_str());
+    int stop = atoi(fields[3].c_str());
+    bool strand = 0;
+
+    Hit feat(name,chr,start,stop,strand);
+    features.push_back(feat);
+  } 
+
+  //sort the features so we can run through it once
+  std::stable_sort(features.begin(),features.end(),compare_hits);
+  feat.close();
+}
+
+void read_align_file(char* filename, Reads& features) {
+  string delim(" \n");
+  string location_delim(":");
+  char strand_str[2]; strand_str[1] = '\0';
+  ifstream seqs(filename);
+  string name("");
+  while(seqs.peek() != EOF) {
+    char line[2048];
+    seqs.getline(line,2048,'\n');
+
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() != 7) { continue; }
+    vector<string> location; split(fields[3], location_delim, location);
+    string chr = location[0];
+    if(chr == "NA") { continue; }
+    int pos = atoi(location[1].c_str());
+    bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+    if(strand == 0) {
+      Read read(chr,pos,false); 
+      features.push_back(read);
+    } else {
+      Read read(chr,pos+25,true); 
+      features.push_back(read);
+    }
+  }
+  seqs.close(); 
+
+  //sort the data so we can run through it once
+  std::sort(features.begin(),features.end(),compare_reads);
+  cerr << "Reads sorted\n";
+}
+
+void count_read_in_features(Hits& features, Reads& data, unsigned int idx) {
+  Hits::iterator feat_it = features.begin();
+  
+  for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
+    //skip to first feature after read
+    string start_chr = feat_it->chr;
+    while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) {
+      feat_it++;
+    }
+    
+    //stop if we have run out of features.
+    if(feat_it == features.end()) { break; }
+
+    if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
+      feat_it->count[idx]++;
+    }
+  }
+}
+
+int getdir (string dir, vector<string> &files)
+{
+    DIR *dp;
+    struct dirent *dirp;
+    if((dp  = opendir(dir.c_str())) == NULL) {
+        cout << "Error(" << errno << ") opening " << dir << endl;
+        return errno;
+    }
+
+    while ((dirp = readdir(dp)) != NULL) {
+        if(dirp->d_name[0] == '.') { continue; }
+        files.push_back(dir + "/" + string(dirp->d_name));
+    }
+    closedir(dp);
+    return 0;
+}
+
+int compare_double(const void* a, const void* b) {
+  return *(double*)a - *(double*)b;
+}
+
+int main(int argc, char** argv) {
+  if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); }
+
+  char read_filename[1024]; strcpy(read_filename,argv[1]);
+  char bg_filename[1024]; strcpy(bg_filename,argv[2]);
+  string testdir(argv[3]);
+  Reads reads; read_align_file(read_filename, reads);
+  Reads bg; read_align_file(bg_filename, bg);
+
+  vector<string> tests;
+  getdir(testdir,tests);
+
+  for(unsigned int i = 0; i < tests.size(); i++) {
+    Hits features;
+    read_features(tests[i].c_str(), features);
+    count_read_in_features(features, reads,0);
+    count_read_in_features(features, bg,1);
+    double read_data[features.size()];
+    double bg_data[features.size()];
+    //double readZ = reads.size()/1000000.0;
+    //double bgZ = bg.size()/1000000.0;
+
+    // temporary hack to get raw read counts
+    double readZ = 1.0; double bgZ = 1.0;
+    for(unsigned int idx = 0; idx < features.size(); idx++) {
+      read_data[idx] = ((double)features[idx].count[0]/readZ);
+      bg_data[idx] = ((double)features[idx].count[1]/bgZ);
+      cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl;
+    }
+    double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size());
+    //qsort(data, features.size(), sizeof(double), compare_double);
+    //double median = gsl_stats_median_from_sorted_data(data, 1, features.size());
+    //double var = gsl_stats_variance_m(data,1,features.size(),mean);
+    cout << tests[i].c_str() << "\t" << enrichment << endl;
+  }
+}
+
+void split (const string& text, const string& separators, vector<string>& words) {
+
+    size_t n     = text.length ();
+    size_t start = text.find_first_not_of (separators);
+
+    while (start < n) {
+        size_t stop = text.find_first_of (separators, start);
+        if (stop > n) stop = n;
+        words.push_back (text.substr (start, stop-start));
+        start = text.find_first_not_of (separators, stop+1);
+    }
+}
diff --git a/htswanalysis/src/ValidationDesign/refine_peaks.cpp b/htswanalysis/src/ValidationDesign/refine_peaks.cpp
new file mode 100644 (file)
index 0000000..a100922
--- /dev/null
@@ -0,0 +1,278 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <queue>
+#include <math.h>
+
+#include "defs.h"
+#include "window.h"
+#include "read.h"
+#include "util.h"
+#include "chrom_list.h"
+
+using namespace std;
+
+bool Wind_lt_Read(const Window& feat, const Read& read) { if(read.chr == feat.chr) { return feat.end < read.pos; } else { return feat.chr < read.chr; } }
+
+extern bool compare_windows(const Window &a, const Window &b);
+extern bool compare_reads(const Read &a, const Read &b);
+
+void readFromPipe( int fd );
+void read_windows(const char* filename, Windows& windows);
+void read_align_file(char* filename, Reads& features);
+void count_reads_in_windows(Reads& data, Windows& windows);
+void recenter_windows(Windows& windows, unsigned int width);
+void resize_windows(Windows& windows, unsigned int width);
+
+void retrieveSequenceData(ChromList chrom_files, Windows& peaks);
+void outputPrimer3Input(Windows& peaks, Windows& calls);
+
+int main(int argc, char** argv) {
+  if(argc != 4) { cerr << "Usage: " << argv[0] << " peaks.bed aligned_reads chromosome_file" << endl; return(0); }
+  char window_filename[1024]; strcpy(window_filename,argv[1]);
+  char reads_filename[1024]; strcpy(reads_filename,argv[2]);
+  char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
+
+  pid_t pid;
+  int rv;
+  int pipe_to_primer[2];
+  //int pipe_from_primer[2];
+  setvbuf(stdout, (char*)NULL, _IONBF, 0);
+
+  if (pipe(pipe_to_primer)) { cout << "Pipe error!" << endl; exit(1); }
+  //if (pipe(pipe_from_primer)) { cout << "Pipe error!" << endl; exit(1); }
+  if ((pid = fork()) == -1) { cout << "Fork error!" << endl; exit(1); }
+       
+  //child 
+  if (pid == 0) {
+    dup2(pipe_to_primer[0], 0); close(pipe_to_primer[1]);
+    //dup2(pipe_from_primer[1], 1); close(pipe_from_primer[0]);
+    if (execl(PRIMER_3_PATH,  "primer3_core", NULL) == -1){ cerr << "execl error!" << endl; exit(1); }
+  } else {
+    dup2(pipe_to_primer[1], 1); close(pipe_to_primer[0]);
+    //dup2(pipe_from_primer[0], 0); close(pipe_from_primer[1]);
+
+    Reads reads;
+    ChromList chromosome_files(chromosome_filename);
+
+    // Title line from ChIP-seq peak data
+    string peakDataTitle;
+       
+    // All peak calls.  Each peak call contains chromomosme number, starting base pair, post-ending base
+    // pair, peak length, score, and corresponding sequence.
+    Windows windows;
+    read_windows(window_filename, windows);
+
+    read_align_file(reads_filename, reads);
+    count_reads_in_windows(reads, windows);
+    for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) { i->count_hits(); }
+
+    Windows targets(windows);
+    recenter_windows(targets,60);
+    count_reads_in_windows(reads, targets);
+  
+    unsigned int max_reads = 0;
+    unsigned int min_reads = 1e8;
+    Windows::iterator max, min;
+    for(Windows::iterator i = targets.begin(); i != targets.end(); ++i) {
+      i->count_hits(); 
+      if(i->total > max_reads) { max_reads = i->total; max = i; }
+      if(i->total < min_reads) { min_reads = i->total; min = i; }
+      cerr << i->total << " at " << i->chr << " " << i->start << " " << i->end << endl;
+    }
+
+    cerr << "Max: " << max_reads << " at " << max->chr << " " << max->start << " " << max->end << endl << "Min: " << min_reads << " at " << min->chr << " " << min->start << " " << min->end << endl;
+
+    retrieveSequenceData(chromosome_files, targets);
+    outputPrimer3Input(targets, windows);
+    close(pipe_to_primer[1]);
+    close(1);
+    //readFromPipe(pipe_from_primer[0]);
+    //close(pipe_from_primer[0]);
+    wait(&rv);
+  }
+  return 0;
+}
+
+void readFromPipe( int fd )
+{
+    FILE *stream;
+    int ch;
+    if ( (stream = fdopen( fd, "r" )) == NULL )
+    {
+        perror( "fdopen() r" );
+        exit(255);
+    }
+    while ( (ch = getc( stream )) != EOF )
+        putc( ch, stdout );
+    fflush( stdout );
+    fclose( stream );
+}
+
+void read_windows(const char* filename, Windows& windows) {
+  string delim("\t");
+
+  ifstream feat(filename);
+  unsigned int N = 0;
+  while(feat.peek() != EOF) {
+    char line[1024];
+    feat.getline(line,1024,'\n');
+    N++;
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() < 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n";
+      continue;
+     }
+
+    string chr = fields[0];
+    if(chr == "NA") { continue; }
+    int start = atoi(fields[1].c_str());
+    int stop = atoi(fields[2].c_str());
+    double score = strtod(fields[3].c_str(),NULL);
+
+    char name[100]; sprintf(name,"Window %d",N);
+    Window w(string(name),chr,start-300,stop+300,score);
+    windows.push_back(w);
+  } 
+
+  //sort the features so we can run through it once
+  std::stable_sort(windows.begin(),windows.end(),compare_windows);
+  feat.close();
+}
+
+void count_reads_in_windows(Reads& data, Windows& windows) {
+  Windows::iterator wind_it = windows.begin();
+  
+  for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
+    //skip to first window after read
+    while(wind_it != windows.end() && Wind_lt_Read(*wind_it, *i)) {
+      wind_it++;
+    }
+    
+    //stop if we have run out of windows.
+    if(wind_it == windows.end()) { break; }
+
+    if(i->pos >= wind_it->start && i->pos < wind_it->end) {
+      int pos = i->pos - wind_it->start;
+      wind_it->counts[pos]++;
+    }
+  }
+}
+
+void read_align_file(char* filename, Reads& features) {
+  string delim(" \n");
+  string location_delim(":");
+  char strand_str[2]; strand_str[1] = '\0';
+  ifstream seqs(filename);
+  string name("");
+  while(seqs.peek() != EOF) {
+    char line[2048];
+    seqs.getline(line,2048,'\n');
+
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() != 7) { continue; }
+    vector<string> location; split(fields[3], location_delim, location);
+    string chr = location[0];
+    if(chr == "NA") { continue; }
+    int pos = atoi(location[1].c_str());
+    bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+    if(strand == 0) {
+      Read read(chr,pos,strand); 
+      features.push_back(read);
+    } else {
+      Read read(chr,pos+25,strand); 
+      features.push_back(read);
+    }
+  }
+  seqs.close(); 
+
+  //sort the data so we can run through it once
+  std::sort(features.begin(),features.end(),compare_reads);
+  cerr << "Reads sorted\n";
+}
+
+//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region.
+void recenter_windows(Windows& windows, unsigned int width) {
+  for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) {
+    unsigned int max_count = 0; unsigned int max_offset = 0;
+    for(unsigned int count_i = 0; count_i < i->counts.size(); count_i++) {
+      if(i->counts[count_i] > max_count) { max_count = i->counts[count_i]; max_offset = count_i; }
+    }
+    unsigned int newstart = i->start + max_offset - width;
+    unsigned int newend   = i->start + max_offset + width;
+    i->reset(newstart, newend);
+  }
+}
+
+//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region.
+void resize_windows(Windows& windows, unsigned int width) {
+  for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) {
+    unsigned int middle = i->start + i->end / 2.0;
+    i->reset(middle - width, middle + width);
+  }
+}
+
+
+void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) {
+        char temp[1024];
+
+       string chrom = peaks[0].chr;
+        string chrom_filename = chrom_filenames[chrom];
+        ifstream chrom_file(chrom_filename.c_str());
+       chrom_file.getline(temp, 1024);
+        size_t offset = chrom_file.gcount();
+        for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+          if(i->chr != chrom) { 
+            chrom = i->chr; 
+            chrom_filename = chrom_filenames[chrom];
+            chrom_file.close(); chrom_file.open(chrom_filename.c_str());
+           chrom_file.getline(temp, 1024);
+            offset = chrom_file.gcount();
+          }
+          unsigned int begin = i->start;
+          unsigned int end   = i->end;
+          unsigned int begin_pos = offset + (int)begin/50 + begin;      
+          unsigned int end_pos = offset + (int)end/50 + end;      
+
+          unsigned int read_len = end_pos - begin_pos;
+          char buffer[read_len+1];
+         chrom_file.seekg(begin_pos, ios_base::beg);
+         chrom_file.read(buffer, read_len+1);
+          buffer[read_len] = '\0';
+         i->sequence = buffer;
+          unsigned int a;
+          while( (a= ((i->sequence).find("\n"))) != string::npos) {
+            i->sequence.erase(a,1);
+          }
+        } 
+        chrom_file.close(); 
+}
+
+void outputPrimer3Input(Windows& peaks, Windows& calls) {
+  cout <<   "PRIMER_NUM_RETURN=1" << endl;
+  cout <<   "PRIMER_LOWERCASE_MASKING=1" << endl;
+  cout <<   "PRIMER_SELF_END=2" << endl;
+  cout <<   "PRIMER_SELF_ANY=6" << endl;
+  cout <<   "PRIMER_OPT_SIZE=23" << endl;
+  cout <<   "PRIMER_MIN_SIZE=19" << endl;
+  cout <<   "PRIMER_MAX_SIZE=26" << endl;
+  cout <<   "PRIMER_MIN_TM=63" << endl;
+  cout <<   "PRIMER_OPT_TM=65" << endl;
+  cout <<   "PRIMER_MAX_TM=67" << endl;
+  cout <<   "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl;
+
+  Windows::iterator j = calls.begin();
+  for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+    cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << i->start << "_" << i->end << "_" << i->score << "_" << i->total << "_" << j->start << "_" << j->end << "_" << j->total << endl;
+    cout << "SEQUENCE=" << i->sequence.c_str() << endl;
+    cout << "=" << endl;
+    j++;
+  }
+}
diff --git a/htswanalysis/src/ValidationDesign/util.cpp b/htswanalysis/src/ValidationDesign/util.cpp
new file mode 100644 (file)
index 0000000..dcaf73e
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words) {
+    size_t n     = text.length ();
+    size_t start = text.find_first_not_of (separators);
+
+    while (start < n) {
+        size_t stop = text.find_first_of (separators, start);
+        if (stop > n) stop = n;
+        words.push_back (text.substr (start, stop-start));
+        start = text.find_first_not_of (separators, stop+1);
+    }
+}
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/util.h b/htswanalysis/src/ValidationDesign/util.h
new file mode 100644 (file)
index 0000000..8a73b63
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+#endif
diff --git a/htswanalysis/src/ValidationDesign/window.cpp b/htswanalysis/src/ValidationDesign/window.cpp
new file mode 100644 (file)
index 0000000..4459004
--- /dev/null
@@ -0,0 +1,79 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "window.h"
+#include "util.h"
+
+void Window::count_hits() {
+  total = 0;
+  for(vector<unsigned int>::iterator i = counts.begin(); i != counts.end(); ++i) {
+  total += *i;
+  }
+}
+
+Window::Window(string name, string chr, int start, int end, double score) { 
+  this->name = name;
+  this->chr = chr; 
+  this->start = start; 
+  this->end = end; 
+  this->score = score;
+  counts.clear();
+  counts.resize(end-start+1,0);
+  this->sequence = "";
+}
+
+Window::Window(const Window& r) { 
+  this->name = r.name;
+  this->score= r.score; 
+  this->chr = r.chr; 
+  this->sequence = r.sequence;
+  this->start = r.start; 
+  this->end = r.end; 
+  this->counts = r.counts; 
+}
+
+Window& Window::operator=(const Window& r) {
+  this->name = r.name;
+  this->score = r.score; 
+  this->chr = r.chr; 
+  this->sequence = r.sequence;
+  this->start = r.start; 
+  this->end = r.end; 
+  this->counts = r.counts; 
+  return *this;
+}
+
+ostream &operator<<( ostream &out, const Window &w ) {
+  out << w.chr.c_str() << "\t" << w.start << "\t" << w.end << "\t" << w.total;
+  return out;
+}
+
+bool Window::operator<(const Window& b) const { 
+  if(this->chr == b.chr) { return this->start < b.start; } else { return this->chr < b.chr; } }
+
+
+Window::Window(string line) {
+       istringstream outStream;
+       
+        vector<string> fields;
+        string delim("\t");
+        split(line,delim,fields);
+
+        this->name = fields[0];
+        this->chr = fields[1];
+        this->start = atoi(fields[2].c_str());
+        this->end = atoi(fields[3].c_str());
+        this->score = strtod(fields[4].c_str(), NULL);
+       this->sequence = "";
+}
+
+void Window::reset(unsigned int new_start, unsigned int new_end) {
+  this->start = new_start;
+  this->end = new_end;
+  this->counts.clear();
+  this->counts.resize(end-start+1,0);
+}
+
+bool compare_windows(const Window &a, const Window &b) { if(a.chr == b.chr) { return a.start < b.start; } else { return a.chr < b.chr; } }
index 8cc62ef920d9e9c404cb397a53649c69f5bdcdc0..7f32b931ef602d3b42b4570ebcbd6237701a4cf5 100755 (executable)
@@ -11,18 +11,15 @@ using namespace std;
 
 vector<int> profile(2000,0);
 
-void int_to_chromstr(int chrom, char* chr);
-int chromstr_to_int(string chrom);
-
 void split (const string& text, const string& separators, vector<string>& words);
 
 class Hit {
   public:
-    int chr;
+    string chr;
     unsigned int pos;
     bool strand;
 
-    Hit(int chr, unsigned int pos, bool strand) {
+    Hit(string chr, unsigned int pos, bool strand) {
       this->chr = chr; this->pos = pos; this->strand = strand;
     }
 };
@@ -60,8 +57,8 @@ int main(int argc, char** argv) {
     split(line_str, delim, fields);
     if(fields.size() != 3) { cerr << "Error: wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
 
-    int chr = chromstr_to_int(fields[0]);
-    if(chr == -1) { continue; }
+    string chr = fields[0];
+
     int pos = atoi(fields[1].c_str());
     bool strand = (bool)(atoi(fields[2].c_str()));
 
@@ -78,7 +75,9 @@ int main(int argc, char** argv) {
   location_delim = ":";
   char strand_str[2]; strand_str[1] = '\0';
   ifstream seqs(filename);
+  unsigned int linenum = 0;
   while(seqs.peek() != EOF) {
+    cerr << ++linenum << endl;
     char line[2048];
     seqs.getline(line,2048,'\n');
     string line_str(line);
@@ -87,8 +86,8 @@ int main(int argc, char** argv) {
     if(fields.size() == 3) { continue; }
  
     vector<string> location; split(fields[3], location_delim, location);
-    int chr = chromstr_to_int(location[0]);
-    if(chr == -1) { continue; }
+    string chr = location[0];
+
     int pos = atoi(location[1].c_str());
     bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
 
@@ -102,17 +101,17 @@ int main(int argc, char** argv) {
   //sort the data so we can run through it once
   std::sort(data.begin(),data.end(),compare_hits);
   
-  int chrom = -1
+  string chrom = ""
   unsigned int feat_idx = 0;
   int num_hits = 0;
   for(Hits::iterator i = data.begin(); i != data.end(); ++i) {
     num_hits++;
-    if(chrom == -1 || i->chr != chrom) {
+    if(chrom == "" || i->chr != chrom) {
       chrom = i->chr;
       feat_idx = 0;
       while(feat_idx < features.size() && features[feat_idx].chr != chrom) { feat_idx++; }
       if(feat_idx == features.size()) { break; }
-      cerr << chrom << " feat_idx: " << feat_idx << endl;
+      cerr << chrom.c_str() << " feat_idx: " << feat_idx << endl;
     }
 
     int dist_to_feature = i->pos - features[feat_idx].pos;
@@ -137,29 +136,6 @@ int main(int argc, char** argv) {
   }
 }
 
-int chromstr_to_int(string chromstr) {
-  const char* chrom = chromstr.c_str();
-  if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; }
-  if(chrom[3] == 'X') { return 23; } else
-  if(chrom[3] == 'Y') { return 24; } else
-  if(chrom[3] == 'M') { return 25; } else
-  if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else 
-  { return -1; } 
-}
-
-void int_to_chromstr(int chrom, char* chr) {
-  strcpy(chr,"chr\0\0\0");
-  switch(chrom) {
-    case 23: chr[3]='X'; break;
-    case 24: chr[3]='Y'; break;
-    case 25: chr[3]='M'; break;
-    default: chr[3]=(char)(48+(int)(chrom/10));
-             chr[4]=(char)(48+(int)(chrom - 10*(int)(chrom/10)));
-             if(chr[3]=='0') { chr[3]=chr[4]; chr[4]='\0'; }
-  }
-}
-  
-
 void split (const string& text, const string& separators, vector<string>& words) {
 
     size_t n     = text.length ();