CPPFLAGS=-g -Wall -O3 -I/opt/local/include
LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib
-TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks
+TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks ValidationDesign/refine_peaks
all: $(TARGETS)
+ValidationDesign/refine_peaks:
+ $(MAKE) -C ValidationDesign
+
install: $(TARGETS)
cp $^ ../bin/
--- /dev/null
+%.o: %.cpp
+ g++ -g -Wall -O3 $< -c -o $@
+
+refine_peaks: refine_peaks.cpp window.o util.o Read.o chrom_list.o
+ g++ -g -Wall -O3 $^ -o $@
--- /dev/null
+#include <string>
+
+#include "Read.h"
+
+bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
--- /dev/null
+#include <vector>
+#include <string>
+
+using namespace std;
+
+#ifndef READ_H
+#define READ_H
+
+class Read {
+ public:
+ string chr;
+ unsigned int pos;
+ bool strand;
+
+ Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
+ Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
+ Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
+};
+
+typedef vector<Read> Reads;
+
+#endif
--- /dev/null
+#include <iostream>
+#include <vector>
+
+#ifndef WINDOW_H
+#define WINDOW_H
+
+using namespace std;
+
+class Window {
+ public:
+ string chr;
+ string name;
+ unsigned int start;
+ unsigned int end;
+ double score;
+
+ string sequence;
+
+ vector<unsigned int> counts;
+ unsigned int total;
+
+ void count_hits();
+ void reset(unsigned int new_start, unsigned int new_end);
+
+ Window(string name, string chr, int start, int end, double score);
+ Window(string line);
+ Window(const Window& r);
+ Window& operator=(const Window& r);
+
+ bool operator<(const Window& b) const;
+};
+
+typedef vector<Window> Windows;
+
+#endif
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "chrom_list.h"
+#include "util.h"
+
+using namespace std;
+
+ChromList::ChromList(char* filename) {
+ char buffer[1024]; string delim("\t");
+ ifstream chrom_file(filename);
+ while(chrom_file.getline(buffer, 1024)) {
+ string temp(buffer);
+ vector<string> words;
+ split(temp, delim,words);
+ if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); }
+ (*this)[ words[0] ] = words[1];
+ }
+ chrom_file.close();
+}
--- /dev/null
+#include <ext/hash_map>
+
+#ifndef CHROMLIST_H
+#define CHROMLIST_H
+
+using namespace std;
+using namespace __gnu_cxx;
+
+struct eq_str {
+ bool operator() (const string& a, const string& b) const {
+ return(strcmp(a.c_str(), b.c_str()) == 0);
+ }
+};
+
+struct hash_str {
+ size_t operator() (const string& a) const {
+ hash<const char*> h;
+ return(h(a.c_str()));
+ }
+};
+
+class ChromList : public hash_map<string, string, hash_str, eq_str> {
+ public:
+ ChromList(char* filename);
+};
+
+#endif
--- /dev/null
+#define PRIMER_3_PATH "/Library/WebServer/CGI-Executables/ValidationDesign/primer3_core"
--- /dev/null
+/*
+ * main.cpp
+ * WindowLocator
+ *
+ * Created by Alan You on 8/11/08.
+ * Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <cmath>
+
+#include <ext/hash_map>
+
+#include "window.h"
+#include "util.h"
+
+using namespace std;
+using namespace __gnu_cxx;
+
+struct eq_str {
+ bool operator() (const string& a, const string& b) const {
+ return(strcmp(a.c_str(), b.c_str()) == 0);
+ }
+};
+
+struct hash_str {
+ size_t operator() (const string& a) const {
+ hash<const char*> h;
+ return(h(a.c_str()));
+ }
+};
+
+typedef hash_map<string, string, hash_str, eq_str> ChromList;
+
+void importWindows(string& filename, string& peakDataTitle, vector<Window>& peaks);
+void retrieveSequenceData(ChromList chrom_files, vector<Window>& peaks);
+void outputSequenceData(string peakDataTitle, vector<Window>& peaks);
+void outputPrimer3Input(vector<Window>& peaks);
+
+int main (int argc, char * const argv[])
+{
+ if(argc < 3) {
+ cout << "Usage: " << argv[0] << " bed_file_of_peaks chromosome_file" << endl;
+ exit(1);
+ }
+
+ string filename(argv[1]);
+ string chrom_list(argv[2]);
+
+ pid_t pid;
+ int rv;
+ int input_pipe[2];
+
+ setvbuf(stdout, (char*)NULL, _IONBF, 0);
+ if (pipe(input_pipe)) {
+ cout << "Pipe error!" << endl;
+ exit(1);
+ }
+
+ if ((pid = fork()) == -1) {
+ cout << "Fork error!" << endl;
+ exit(1);
+ }
+
+ //child
+ if (pid == 0) {
+ dup2(input_pipe[0], 0); close(input_pipe[1]);
+ if (execlp("primer3_core", "primer3_core", NULL) == -1){
+ cerr << "execl error!" << endl;
+ exit(1);
+ }
+ cerr << "Done with primer3\n";
+ } else {
+ ChromList chromosome_files;
+ char buffer[1024]; string delim("\t");
+ ifstream chrom_file(chrom_list.c_str());
+ while(chrom_file.getline(buffer, 1024)) {
+ string temp(buffer);
+ vector<string> words;
+ split(temp, delim,words);
+ if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); }
+ chromosome_files[ words[0] ] = words[1];
+ }
+ chrom_file.close();
+
+ // ChIP-seq peak data (BED format required)
+ ifstream peakDataFile;
+
+ // Title line from ChIP-seq peak data
+ string peakDataTitle;
+
+ // All peak calls. Each peak call contains chromomosme number, starting base pair, post-ending base
+ // pair, peak length, score, and corresponding sequence.
+ vector<Window> peaks;
+
+ dup2(input_pipe[1], 1); close(input_pipe[0]);
+ importWindows(filename, peakDataTitle, peaks);
+ retrieveSequenceData(chromosome_files, peaks);
+ outputPrimer3Input(peaks);
+ close(input_pipe[1]);
+ close(1);
+ wait(&rv);
+ }
+
+ return 0;
+}
+
+void importWindows(string& filename, string& peakDataTitle, vector<Window>& peaks)
+{
+ ifstream peakDataFile;
+ peakDataFile.open(filename.c_str());
+
+ char temp[1024];
+
+ peakDataFile.getline(temp, 1024);
+ peakDataTitle = temp;
+
+ while(peakDataFile.getline(temp, 1024)) {
+ if(peakDataFile.eof()) { peakDataFile.close(); break; }
+ Window inputCall(temp);
+ peaks.push_back(inputCall);
+ }
+
+ sort(peaks.begin(), peaks.end());
+
+}
+
+void retrieveSequenceData(ChromList chrom_filenames, vector<Window>& peaks)
+{
+ char temp[1024];
+
+ string chrom = peaks[0].chr;
+ string chrom_filename = chrom_filenames[chrom];
+ ifstream chrom_file(chrom_filename.c_str());
+ chrom_file.getline(temp, 1024);
+ size_t offset = chrom_file.gcount();
+ for(vector<Window>::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+ if(i->chr != chrom) {
+ chrom = i->chr;
+ chrom_filename = chrom_filenames[chrom];
+ chrom_file.close(); chrom_file.open(chrom_filename.c_str());
+ chrom_file.getline(temp, 1024);
+ offset = chrom_file.gcount();
+ }
+ unsigned int begin = i->start;
+ unsigned int end = i->end;
+
+ unsigned int begin_pos = offset + (int)begin/50 + begin;
+ unsigned int end_pos = offset + (int)end/50 + end;
+
+ unsigned int read_len = end_pos - begin_pos;
+ char buffer[read_len+1];
+ chrom_file.seekg(begin_pos, ios_base::beg);
+ chrom_file.read(buffer, read_len+1);
+ buffer[read_len] = '\0';
+ i->sequence = buffer;
+ unsigned int a;
+ while( (a= ((i->sequence).find("\n"))) != string::npos) {
+ i->sequence.erase(a,1);
+ }
+ }
+ chrom_file.close();
+}
+
+void outputPrimer3Input(vector<Window>& peaks)
+{
+ cout << "PRIMER_NUM_RETURN=1" << endl;
+ cout << "PRIMER_LOWERCASE_MASKING=1" << endl;
+ //cout << "PRIMER_TM_SANTALUCIA=1" << endl;
+ cout << "PRIMER_SELF_END=2" << endl;
+ cout << "PRIMER_SELF_ANY=6" << endl;
+ cout << "PRIMER_OPT_SIZE=23" << endl;
+ cout << "PRIMER_MIN_SIZE=19" << endl;
+ cout << "PRIMER_MAX_SIZE=26" << endl;
+ cout << "PRIMER_MIN_TM=63" << endl;
+ cout << "PRIMER_OPT_TM=65" << endl;
+ cout << "PRIMER_MAX_TM=67" << endl;
+ cout << "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl;
+
+ unsigned int idx = 0;
+ for(vector<Window>::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+ //cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << idx++ << "_" << i->score << endl;
+ cout << "PRIMER_SEQUENCE_ID=" << i->name << i->score << endl;
+ cout << "SEQUENCE=" << i->sequence.c_str() << endl;
+ cout << "=" << endl;
+ }
+}
--- /dev/null
+/*
+ * peak.cpp
+ * PeakLocator
+ *
+ * Created by Alan You on 8/11/08.
+ * Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+
+#include <vector>
+#include <string>
+
+#include "peak.h"
+#include "util.h"
+
+using namespace std;
+
+Peak::Peak()
+{
+ this->startBP = 0;
+ this->endBP = 0;
+ this->length = 0;
+ this->chrom = "";
+ this->score = 0;
+ this->sequence = "";
+}
+
+Peak::Peak(string line)
+{
+ istringstream outStream;
+
+ vector<string> fields;
+ string delim("\t");
+ split(line,delim,fields);
+
+ this->chrom = fields[0];
+ this->startBP = atoi(fields[1].c_str());
+ this->endBP = atoi(fields[2].c_str());
+ this->score = strtod(fields[3].c_str(), NULL);
+ this->length = this->endBP - this->startBP;
+ this->sequence = "";
+}
+
+Peak::Peak(const Peak& p)
+{
+ this->startBP = p.startBP;
+ this->endBP = p.endBP;
+ this->length = p.length;
+ this->chrom = p.chrom;
+ this->score = p.score;
+ this->sequence = p.sequence;
+}
+
+Peak::~Peak()
+{
+;
+}
+
+void Peak::print()
+{
+ cout << "Chromosome:\t\t" << this->chrom.c_str() << endl;
+ cout << "Starting Base Pair:\t" << this->startBP << endl;
+ cout << "Ending Base Pair:\t" << this->endBP << endl;
+ cout << "Sequence Length:\t" << this->length << endl;
+ cout << "Sequence Score:\t\t" << this->score << endl;
+ cout << "Sequence:\t\t" << this->sequence << endl;
+}
+
+void Peak::printToFile(ofstream &outputFile)
+{
+ outputFile << "Chromosome:\t\t" << this->chrom.c_str() << endl;
+ outputFile << "Starting Base Pair:\t" << this->startBP << endl;
+ outputFile << "Ending Base Pair:\t" << this->endBP << endl;
+ outputFile << "Sequence Length:\t" << this->length << endl;
+ outputFile << "Sequence Score:\t\t" << this->score << endl;
+ outputFile << "Sequence:\t\t" << this->sequence << endl;
+ outputFile << endl;
+}
+
+bool Peak::operator<(const Peak& that) const
+{
+ if (this->chrom == that.chrom) { return(this->startBP < that.startBP);
+ } else { return(this->chrom < that.chrom); }
+}
+
+bool Peak::operator>(const Peak& that) const
+{
+ if (this->chrom == that.chrom) { return(this->startBP > that.startBP);
+ } else { return(this->chrom > that.chrom); }
+
+}
+
+Peak& Peak:: operator=(const Peak& that)
+{
+ if (this != &that)
+ {
+ this->chrom = that.chrom;
+ this->startBP = that.startBP;
+ this->endBP = that.endBP;
+ this->length = that.length;
+ this->score = that.score;
+ this->sequence = that.sequence;
+ }
+
+ return *this;
+}
--- /dev/null
+/*
+ * peak.h
+ * PeakLocator
+ *
+ * Created by Alan You on 8/11/08.
+ * Copyright 2008 Hudson Alpha. All rights reserved.
+ *
+ */
+#ifndef PEAK_H
+#define PEAK_H
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <memory.h>
+
+
+using namespace std;
+
+class Peak
+{
+public:
+ Peak();
+ Peak(string line);
+ Peak(const Peak& p);
+ ~Peak();
+
+ bool operator<(const Peak& that) const;
+ bool operator>(const Peak& that) const;
+ Peak& operator=(const Peak& that);
+};
+
+#endif
--- /dev/null
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <queue>
+#include <math.h>
+#include <string>
+
+#include <gsl/gsl_statistics.h>
+
+#define WINDOW 25
+
+using namespace std;
+
+
+void int_to_chromstr(int chrom, char* chr);
+int chromstr_to_int(string chrom);
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+class Read {
+ public:
+ string chr;
+ int pos;
+ bool strand;
+
+ Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
+ Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
+ Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
+};
+
+class Hit {
+ public:
+ int start;
+ int end;
+ bool strand;
+
+ string chr;
+ string name;
+
+ unsigned int count[2];
+
+ Hit(string name, string chr, int start, int end, bool strand) {
+ this->name = name;
+ this->chr = chr;
+ this->start = start;
+ this->end = end;
+ this->strand = strand;
+ count[0] = 0;
+ count[1] = 0;
+ }
+
+ Hit(const Hit& h) {
+ this->name = h.name;
+ this->chr = h.chr;
+ this->start = h.start;
+ this->end = h.end;
+ this->strand = h.strand;
+ count[0] = h.count[0];
+ count[1] = h.count[1];
+ }
+
+ Hit& operator=(const Hit& h) {
+ this->name = h.name;
+ this->chr = h.chr;
+ this->start = h.start;
+ this->end = h.end;
+ this->strand = h.strand;
+ count[0] = h.count[0];
+ count[1] = h.count[1];
+ return *this;
+ }
+};
+
+bool Feat_lt_Read(const Hit& feat, const Read& read) {
+ //First compare based on chromosomes
+ if(read.chr == feat.chr) { return feat.end < read.pos; }
+ else { return feat.chr < read.chr; }
+}
+
+ostream &operator<<( ostream &out, const Hit &h ) {
+ out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count;
+ return out;
+}
+
+typedef vector<Hit> Hits;
+typedef vector<Read> Reads;
+
+bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } }
+bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
+
+void read_features(const char* filename, Hits& features) {
+ string delim("\t");
+
+ ifstream feat(filename);
+ size_t N = 0;
+ while(feat.peek() != EOF) {
+ char line[1024];
+ feat.getline(line,1024,'\n');
+ N++;
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
+
+ string name = fields[0];
+ string chr = fields[1];
+ if(chr == "NA") { continue; }
+ int start = atoi(fields[2].c_str());
+ int stop = atoi(fields[3].c_str());
+ bool strand = 0;
+
+ Hit feat(name,chr,start,stop,strand);
+ features.push_back(feat);
+ }
+
+ //sort the features so we can run through it once
+ std::stable_sort(features.begin(),features.end(),compare_hits);
+ feat.close();
+}
+
+void read_align_file(char* filename, Reads& features) {
+ string delim(" \n");
+ string location_delim(":");
+ char strand_str[2]; strand_str[1] = '\0';
+ ifstream seqs(filename);
+ string name("");
+ while(seqs.peek() != EOF) {
+ char line[2048];
+ seqs.getline(line,2048,'\n');
+
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() != 7) { continue; }
+
+ vector<string> location; split(fields[3], location_delim, location);
+ string chr = location[0];
+ if(chr == "NA") { continue; }
+ int pos = atoi(location[1].c_str());
+ bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+ if(strand == 0) {
+ Read read(chr,pos,false);
+ features.push_back(read);
+ } else {
+ Read read(chr,pos+25,true);
+ features.push_back(read);
+ }
+ }
+ seqs.close();
+
+ //sort the data so we can run through it once
+ std::sort(features.begin(),features.end(),compare_reads);
+ cerr << "Reads sorted\n";
+}
+
+void count_read_in_features(Hits& features, Reads& data, unsigned int idx) {
+ Hits::iterator feat_it = features.begin();
+
+ for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
+ //skip to first feature after read
+ string start_chr = feat_it->chr;
+ while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) {
+ feat_it++;
+ }
+
+ //stop if we have run out of features.
+ if(feat_it == features.end()) { break; }
+
+ if(i->pos >= feat_it->start && i->pos <= feat_it->end) {
+ feat_it->count[idx]++;
+ }
+ }
+}
+
+int getdir (string dir, vector<string> &files)
+{
+ DIR *dp;
+ struct dirent *dirp;
+ if((dp = opendir(dir.c_str())) == NULL) {
+ cout << "Error(" << errno << ") opening " << dir << endl;
+ return errno;
+ }
+
+ while ((dirp = readdir(dp)) != NULL) {
+ if(dirp->d_name[0] == '.') { continue; }
+ files.push_back(dir + "/" + string(dirp->d_name));
+ }
+ closedir(dp);
+ return 0;
+}
+
+int compare_double(const void* a, const void* b) {
+ return *(double*)a - *(double*)b;
+}
+
+int main(int argc, char** argv) {
+ if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); }
+
+ char read_filename[1024]; strcpy(read_filename,argv[1]);
+ char bg_filename[1024]; strcpy(bg_filename,argv[2]);
+ string testdir(argv[3]);
+ Reads reads; read_align_file(read_filename, reads);
+ Reads bg; read_align_file(bg_filename, bg);
+
+ vector<string> tests;
+ getdir(testdir,tests);
+
+ for(unsigned int i = 0; i < tests.size(); i++) {
+ Hits features;
+ read_features(tests[i].c_str(), features);
+ count_read_in_features(features, reads,0);
+ count_read_in_features(features, bg,1);
+ double read_data[features.size()];
+ double bg_data[features.size()];
+ //double readZ = reads.size()/1000000.0;
+ //double bgZ = bg.size()/1000000.0;
+
+ // temporary hack to get raw read counts
+ double readZ = 1.0; double bgZ = 1.0;
+ for(unsigned int idx = 0; idx < features.size(); idx++) {
+ read_data[idx] = ((double)features[idx].count[0]/readZ);
+ bg_data[idx] = ((double)features[idx].count[1]/bgZ);
+ cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl;
+ }
+ double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size());
+ //qsort(data, features.size(), sizeof(double), compare_double);
+ //double median = gsl_stats_median_from_sorted_data(data, 1, features.size());
+ //double var = gsl_stats_variance_m(data,1,features.size(),mean);
+ cout << tests[i].c_str() << "\t" << enrichment << endl;
+ }
+}
+
+void split (const string& text, const string& separators, vector<string>& words) {
+
+ size_t n = text.length ();
+ size_t start = text.find_first_not_of (separators);
+
+ while (start < n) {
+ size_t stop = text.find_first_of (separators, start);
+ if (stop > n) stop = n;
+ words.push_back (text.substr (start, stop-start));
+ start = text.find_first_not_of (separators, stop+1);
+ }
+}
--- /dev/null
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <queue>
+#include <math.h>
+
+#include "defs.h"
+#include "window.h"
+#include "read.h"
+#include "util.h"
+#include "chrom_list.h"
+
+using namespace std;
+
+bool Wind_lt_Read(const Window& feat, const Read& read) { if(read.chr == feat.chr) { return feat.end < read.pos; } else { return feat.chr < read.chr; } }
+
+extern bool compare_windows(const Window &a, const Window &b);
+extern bool compare_reads(const Read &a, const Read &b);
+
+void readFromPipe( int fd );
+void read_windows(const char* filename, Windows& windows);
+void read_align_file(char* filename, Reads& features);
+void count_reads_in_windows(Reads& data, Windows& windows);
+void recenter_windows(Windows& windows, unsigned int width);
+void resize_windows(Windows& windows, unsigned int width);
+
+void retrieveSequenceData(ChromList chrom_files, Windows& peaks);
+void outputPrimer3Input(Windows& peaks, Windows& calls);
+
+int main(int argc, char** argv) {
+ if(argc != 4) { cerr << "Usage: " << argv[0] << " peaks.bed aligned_reads chromosome_file" << endl; return(0); }
+ char window_filename[1024]; strcpy(window_filename,argv[1]);
+ char reads_filename[1024]; strcpy(reads_filename,argv[2]);
+ char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
+
+ pid_t pid;
+ int rv;
+ int pipe_to_primer[2];
+ //int pipe_from_primer[2];
+
+ setvbuf(stdout, (char*)NULL, _IONBF, 0);
+
+ if (pipe(pipe_to_primer)) { cout << "Pipe error!" << endl; exit(1); }
+ //if (pipe(pipe_from_primer)) { cout << "Pipe error!" << endl; exit(1); }
+ if ((pid = fork()) == -1) { cout << "Fork error!" << endl; exit(1); }
+
+ //child
+ if (pid == 0) {
+ dup2(pipe_to_primer[0], 0); close(pipe_to_primer[1]);
+ //dup2(pipe_from_primer[1], 1); close(pipe_from_primer[0]);
+ if (execl(PRIMER_3_PATH, "primer3_core", NULL) == -1){ cerr << "execl error!" << endl; exit(1); }
+ } else {
+ dup2(pipe_to_primer[1], 1); close(pipe_to_primer[0]);
+ //dup2(pipe_from_primer[0], 0); close(pipe_from_primer[1]);
+
+ Reads reads;
+ ChromList chromosome_files(chromosome_filename);
+
+ // Title line from ChIP-seq peak data
+ string peakDataTitle;
+
+ // All peak calls. Each peak call contains chromomosme number, starting base pair, post-ending base
+ // pair, peak length, score, and corresponding sequence.
+ Windows windows;
+ read_windows(window_filename, windows);
+
+ read_align_file(reads_filename, reads);
+ count_reads_in_windows(reads, windows);
+ for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) { i->count_hits(); }
+
+ Windows targets(windows);
+ recenter_windows(targets,60);
+ count_reads_in_windows(reads, targets);
+
+ unsigned int max_reads = 0;
+ unsigned int min_reads = 1e8;
+ Windows::iterator max, min;
+ for(Windows::iterator i = targets.begin(); i != targets.end(); ++i) {
+ i->count_hits();
+ if(i->total > max_reads) { max_reads = i->total; max = i; }
+ if(i->total < min_reads) { min_reads = i->total; min = i; }
+ cerr << i->total << " at " << i->chr << " " << i->start << " " << i->end << endl;
+ }
+
+ cerr << "Max: " << max_reads << " at " << max->chr << " " << max->start << " " << max->end << endl << "Min: " << min_reads << " at " << min->chr << " " << min->start << " " << min->end << endl;
+
+ retrieveSequenceData(chromosome_files, targets);
+ outputPrimer3Input(targets, windows);
+ close(pipe_to_primer[1]);
+ close(1);
+ //readFromPipe(pipe_from_primer[0]);
+ //close(pipe_from_primer[0]);
+ wait(&rv);
+ }
+ return 0;
+}
+
+void readFromPipe( int fd )
+{
+ FILE *stream;
+ int ch;
+ if ( (stream = fdopen( fd, "r" )) == NULL )
+ {
+ perror( "fdopen() r" );
+ exit(255);
+ }
+ while ( (ch = getc( stream )) != EOF )
+ putc( ch, stdout );
+ fflush( stdout );
+ fclose( stream );
+}
+
+void read_windows(const char* filename, Windows& windows) {
+ string delim("\t");
+
+ ifstream feat(filename);
+ unsigned int N = 0;
+ while(feat.peek() != EOF) {
+ char line[1024];
+ feat.getline(line,1024,'\n');
+ N++;
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() < 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n";
+ continue;
+ }
+
+ string chr = fields[0];
+ if(chr == "NA") { continue; }
+ int start = atoi(fields[1].c_str());
+ int stop = atoi(fields[2].c_str());
+ double score = strtod(fields[3].c_str(),NULL);
+
+ char name[100]; sprintf(name,"Window %d",N);
+ Window w(string(name),chr,start-300,stop+300,score);
+ windows.push_back(w);
+ }
+
+ //sort the features so we can run through it once
+ std::stable_sort(windows.begin(),windows.end(),compare_windows);
+ feat.close();
+}
+
+void count_reads_in_windows(Reads& data, Windows& windows) {
+ Windows::iterator wind_it = windows.begin();
+
+ for(Reads::iterator i = data.begin(); i != data.end(); ++i) {
+ //skip to first window after read
+ while(wind_it != windows.end() && Wind_lt_Read(*wind_it, *i)) {
+ wind_it++;
+ }
+
+ //stop if we have run out of windows.
+ if(wind_it == windows.end()) { break; }
+
+ if(i->pos >= wind_it->start && i->pos < wind_it->end) {
+ int pos = i->pos - wind_it->start;
+ wind_it->counts[pos]++;
+ }
+ }
+}
+
+void read_align_file(char* filename, Reads& features) {
+ string delim(" \n");
+ string location_delim(":");
+ char strand_str[2]; strand_str[1] = '\0';
+ ifstream seqs(filename);
+ string name("");
+ while(seqs.peek() != EOF) {
+ char line[2048];
+ seqs.getline(line,2048,'\n');
+
+ string line_str(line);
+ vector<string> fields;
+ split(line_str, delim, fields);
+ if(fields.size() != 7) { continue; }
+
+ vector<string> location; split(fields[3], location_delim, location);
+ string chr = location[0];
+ if(chr == "NA") { continue; }
+ int pos = atoi(location[1].c_str());
+ bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+ if(strand == 0) {
+ Read read(chr,pos,strand);
+ features.push_back(read);
+ } else {
+ Read read(chr,pos+25,strand);
+ features.push_back(read);
+ }
+ }
+ seqs.close();
+
+ //sort the data so we can run through it once
+ std::sort(features.begin(),features.end(),compare_reads);
+ cerr << "Reads sorted\n";
+}
+
+//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region.
+void recenter_windows(Windows& windows, unsigned int width) {
+ for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) {
+ unsigned int max_count = 0; unsigned int max_offset = 0;
+ for(unsigned int count_i = 0; count_i < i->counts.size(); count_i++) {
+ if(i->counts[count_i] > max_count) { max_count = i->counts[count_i]; max_offset = count_i; }
+ }
+ unsigned int newstart = i->start + max_offset - width;
+ unsigned int newend = i->start + max_offset + width;
+ i->reset(newstart, newend);
+ }
+}
+
+//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region.
+void resize_windows(Windows& windows, unsigned int width) {
+ for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) {
+ unsigned int middle = i->start + i->end / 2.0;
+ i->reset(middle - width, middle + width);
+ }
+}
+
+
+void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) {
+ char temp[1024];
+
+ string chrom = peaks[0].chr;
+ string chrom_filename = chrom_filenames[chrom];
+ ifstream chrom_file(chrom_filename.c_str());
+ chrom_file.getline(temp, 1024);
+ size_t offset = chrom_file.gcount();
+ for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+ if(i->chr != chrom) {
+ chrom = i->chr;
+ chrom_filename = chrom_filenames[chrom];
+ chrom_file.close(); chrom_file.open(chrom_filename.c_str());
+ chrom_file.getline(temp, 1024);
+ offset = chrom_file.gcount();
+ }
+ unsigned int begin = i->start;
+ unsigned int end = i->end;
+
+ unsigned int begin_pos = offset + (int)begin/50 + begin;
+ unsigned int end_pos = offset + (int)end/50 + end;
+
+ unsigned int read_len = end_pos - begin_pos;
+ char buffer[read_len+1];
+ chrom_file.seekg(begin_pos, ios_base::beg);
+ chrom_file.read(buffer, read_len+1);
+ buffer[read_len] = '\0';
+ i->sequence = buffer;
+ unsigned int a;
+ while( (a= ((i->sequence).find("\n"))) != string::npos) {
+ i->sequence.erase(a,1);
+ }
+ }
+ chrom_file.close();
+}
+
+void outputPrimer3Input(Windows& peaks, Windows& calls) {
+ cout << "PRIMER_NUM_RETURN=1" << endl;
+ cout << "PRIMER_LOWERCASE_MASKING=1" << endl;
+ cout << "PRIMER_SELF_END=2" << endl;
+ cout << "PRIMER_SELF_ANY=6" << endl;
+ cout << "PRIMER_OPT_SIZE=23" << endl;
+ cout << "PRIMER_MIN_SIZE=19" << endl;
+ cout << "PRIMER_MAX_SIZE=26" << endl;
+ cout << "PRIMER_MIN_TM=63" << endl;
+ cout << "PRIMER_OPT_TM=65" << endl;
+ cout << "PRIMER_MAX_TM=67" << endl;
+ cout << "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl;
+
+ Windows::iterator j = calls.begin();
+ for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
+ cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << i->start << "_" << i->end << "_" << i->score << "_" << i->total << "_" << j->start << "_" << j->end << "_" << j->total << endl;
+ cout << "SEQUENCE=" << i->sequence.c_str() << endl;
+ cout << "=" << endl;
+ j++;
+ }
+}
--- /dev/null
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words) {
+ size_t n = text.length ();
+ size_t start = text.find_first_not_of (separators);
+
+ while (start < n) {
+ size_t stop = text.find_first_of (separators, start);
+ if (stop > n) stop = n;
+ words.push_back (text.substr (start, stop-start));
+ start = text.find_first_not_of (separators, stop+1);
+ }
+}
+
+#endif
--- /dev/null
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words);
+
+#endif
--- /dev/null
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "window.h"
+#include "util.h"
+
+void Window::count_hits() {
+ total = 0;
+ for(vector<unsigned int>::iterator i = counts.begin(); i != counts.end(); ++i) {
+ total += *i;
+ }
+}
+
+Window::Window(string name, string chr, int start, int end, double score) {
+ this->name = name;
+ this->chr = chr;
+ this->start = start;
+ this->end = end;
+ this->score = score;
+ counts.clear();
+ counts.resize(end-start+1,0);
+ this->sequence = "";
+}
+
+Window::Window(const Window& r) {
+ this->name = r.name;
+ this->score= r.score;
+ this->chr = r.chr;
+ this->sequence = r.sequence;
+ this->start = r.start;
+ this->end = r.end;
+ this->counts = r.counts;
+}
+
+Window& Window::operator=(const Window& r) {
+ this->name = r.name;
+ this->score = r.score;
+ this->chr = r.chr;
+ this->sequence = r.sequence;
+ this->start = r.start;
+ this->end = r.end;
+ this->counts = r.counts;
+ return *this;
+}
+
+ostream &operator<<( ostream &out, const Window &w ) {
+ out << w.chr.c_str() << "\t" << w.start << "\t" << w.end << "\t" << w.total;
+ return out;
+}
+
+bool Window::operator<(const Window& b) const {
+ if(this->chr == b.chr) { return this->start < b.start; } else { return this->chr < b.chr; } }
+
+
+Window::Window(string line) {
+ istringstream outStream;
+
+ vector<string> fields;
+ string delim("\t");
+ split(line,delim,fields);
+
+ this->name = fields[0];
+ this->chr = fields[1];
+ this->start = atoi(fields[2].c_str());
+ this->end = atoi(fields[3].c_str());
+ this->score = strtod(fields[4].c_str(), NULL);
+ this->sequence = "";
+}
+
+void Window::reset(unsigned int new_start, unsigned int new_end) {
+ this->start = new_start;
+ this->end = new_end;
+ this->counts.clear();
+ this->counts.resize(end-start+1,0);
+}
+
+bool compare_windows(const Window &a, const Window &b) { if(a.chr == b.chr) { return a.start < b.start; } else { return a.chr < b.chr; } }
vector<int> profile(2000,0);
-void int_to_chromstr(int chrom, char* chr);
-int chromstr_to_int(string chrom);
-
void split (const string& text, const string& separators, vector<string>& words);
class Hit {
public:
- int chr;
+ string chr;
unsigned int pos;
bool strand;
- Hit(int chr, unsigned int pos, bool strand) {
+ Hit(string chr, unsigned int pos, bool strand) {
this->chr = chr; this->pos = pos; this->strand = strand;
}
};
split(line_str, delim, fields);
if(fields.size() != 3) { cerr << "Error: wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; }
- int chr = chromstr_to_int(fields[0]);
- if(chr == -1) { continue; }
+ string chr = fields[0];
+
int pos = atoi(fields[1].c_str());
bool strand = (bool)(atoi(fields[2].c_str()));
location_delim = ":";
char strand_str[2]; strand_str[1] = '\0';
ifstream seqs(filename);
+ unsigned int linenum = 0;
while(seqs.peek() != EOF) {
+ cerr << ++linenum << endl;
char line[2048];
seqs.getline(line,2048,'\n');
string line_str(line);
if(fields.size() == 3) { continue; }
vector<string> location; split(fields[3], location_delim, location);
- int chr = chromstr_to_int(location[0]);
- if(chr == -1) { continue; }
+ string chr = location[0];
+
int pos = atoi(location[1].c_str());
bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
//sort the data so we can run through it once
std::sort(data.begin(),data.end(),compare_hits);
- int chrom = -1;
+ string chrom = "";
unsigned int feat_idx = 0;
int num_hits = 0;
for(Hits::iterator i = data.begin(); i != data.end(); ++i) {
num_hits++;
- if(chrom == -1 || i->chr != chrom) {
+ if(chrom == "" || i->chr != chrom) {
chrom = i->chr;
feat_idx = 0;
while(feat_idx < features.size() && features[feat_idx].chr != chrom) { feat_idx++; }
if(feat_idx == features.size()) { break; }
- cerr << chrom << " feat_idx: " << feat_idx << endl;
+ cerr << chrom.c_str() << " feat_idx: " << feat_idx << endl;
}
int dist_to_feature = i->pos - features[feat_idx].pos;
}
}
-int chromstr_to_int(string chromstr) {
- const char* chrom = chromstr.c_str();
- if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; }
- if(chrom[3] == 'X') { return 23; } else
- if(chrom[3] == 'Y') { return 24; } else
- if(chrom[3] == 'M') { return 25; } else
- if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else
- { return -1; }
-}
-
-void int_to_chromstr(int chrom, char* chr) {
- strcpy(chr,"chr\0\0\0");
- switch(chrom) {
- case 23: chr[3]='X'; break;
- case 24: chr[3]='Y'; break;
- case 25: chr[3]='M'; break;
- default: chr[3]=(char)(48+(int)(chrom/10));
- chr[4]=(char)(48+(int)(chrom - 10*(int)(chrom/10)));
- if(chr[3]=='0') { chr[3]=chr[4]; chr[4]='\0'; }
- }
-}
-
-
void split (const string& text, const string& separators, vector<string>& words) {
size_t n = text.length ();