From 290d469e502ebe814e9e38db127747c8d318a2d8 Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Tue, 25 Nov 2008 01:23:20 +0000 Subject: [PATCH] Added qPCR Validation design code --- htswanalysis/src/Makefile | 5 +- htswanalysis/src/ValidationDesign/Makefile | 5 + htswanalysis/src/ValidationDesign/Read.cpp | 5 + htswanalysis/src/ValidationDesign/Read.h | 22 ++ htswanalysis/src/ValidationDesign/Window.h | 35 +++ .../src/ValidationDesign/chrom_list.cpp | 22 ++ .../src/ValidationDesign/chrom_list.h | 27 ++ htswanalysis/src/ValidationDesign/defs.h | 1 + htswanalysis/src/ValidationDesign/main.cpp | 191 ++++++++++++ htswanalysis/src/ValidationDesign/peak.cpp | 106 +++++++ htswanalysis/src/ValidationDesign/peak.h | 34 +++ htswanalysis/src/ValidationDesign/qPCR.cpp | 249 ++++++++++++++++ .../src/ValidationDesign/refine_peaks.cpp | 278 ++++++++++++++++++ htswanalysis/src/ValidationDesign/util.cpp | 21 ++ htswanalysis/src/ValidationDesign/util.h | 11 + htswanalysis/src/ValidationDesign/window.cpp | 79 +++++ .../src/profile_reads_against_features.cpp | 46 +-- 17 files changed, 1101 insertions(+), 36 deletions(-) create mode 100644 htswanalysis/src/ValidationDesign/Makefile create mode 100644 htswanalysis/src/ValidationDesign/Read.cpp create mode 100644 htswanalysis/src/ValidationDesign/Read.h create mode 100644 htswanalysis/src/ValidationDesign/Window.h create mode 100644 htswanalysis/src/ValidationDesign/chrom_list.cpp create mode 100644 htswanalysis/src/ValidationDesign/chrom_list.h create mode 100644 htswanalysis/src/ValidationDesign/defs.h create mode 100644 htswanalysis/src/ValidationDesign/main.cpp create mode 100644 htswanalysis/src/ValidationDesign/peak.cpp create mode 100644 htswanalysis/src/ValidationDesign/peak.h create mode 100755 htswanalysis/src/ValidationDesign/qPCR.cpp create mode 100644 htswanalysis/src/ValidationDesign/refine_peaks.cpp create mode 100644 htswanalysis/src/ValidationDesign/util.cpp create mode 100644 htswanalysis/src/ValidationDesign/util.h create mode 100644 htswanalysis/src/ValidationDesign/window.cpp diff --git a/htswanalysis/src/Makefile b/htswanalysis/src/Makefile index 9c8ca29..35df627 100644 --- a/htswanalysis/src/Makefile +++ b/htswanalysis/src/Makefile @@ -1,10 +1,13 @@ CPPFLAGS=-g -Wall -O3 -I/opt/local/include LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib -TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks +TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks ValidationDesign/refine_peaks all: $(TARGETS) +ValidationDesign/refine_peaks: + $(MAKE) -C ValidationDesign + install: $(TARGETS) cp $^ ../bin/ diff --git a/htswanalysis/src/ValidationDesign/Makefile b/htswanalysis/src/ValidationDesign/Makefile new file mode 100644 index 0000000..f0a184c --- /dev/null +++ b/htswanalysis/src/ValidationDesign/Makefile @@ -0,0 +1,5 @@ +%.o: %.cpp + g++ -g -Wall -O3 $< -c -o $@ + +refine_peaks: refine_peaks.cpp window.o util.o Read.o chrom_list.o + g++ -g -Wall -O3 $^ -o $@ diff --git a/htswanalysis/src/ValidationDesign/Read.cpp b/htswanalysis/src/ValidationDesign/Read.cpp new file mode 100644 index 0000000..b76c5f8 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/Read.cpp @@ -0,0 +1,5 @@ +#include + +#include "Read.h" + +bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } diff --git a/htswanalysis/src/ValidationDesign/Read.h b/htswanalysis/src/ValidationDesign/Read.h new file mode 100644 index 0000000..5155872 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/Read.h @@ -0,0 +1,22 @@ +#include +#include + +using namespace std; + +#ifndef READ_H +#define READ_H + +class Read { + public: + string chr; + unsigned int pos; + bool strand; + + Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } + Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; } + Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;} +}; + +typedef vector Reads; + +#endif diff --git a/htswanalysis/src/ValidationDesign/Window.h b/htswanalysis/src/ValidationDesign/Window.h new file mode 100644 index 0000000..9be5c61 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/Window.h @@ -0,0 +1,35 @@ +#include +#include + +#ifndef WINDOW_H +#define WINDOW_H + +using namespace std; + +class Window { + public: + string chr; + string name; + unsigned int start; + unsigned int end; + double score; + + string sequence; + + vector counts; + unsigned int total; + + void count_hits(); + void reset(unsigned int new_start, unsigned int new_end); + + Window(string name, string chr, int start, int end, double score); + Window(string line); + Window(const Window& r); + Window& operator=(const Window& r); + + bool operator<(const Window& b) const; +}; + +typedef vector Windows; + +#endif diff --git a/htswanalysis/src/ValidationDesign/chrom_list.cpp b/htswanalysis/src/ValidationDesign/chrom_list.cpp new file mode 100644 index 0000000..a3fd8a0 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/chrom_list.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include + +#include "chrom_list.h" +#include "util.h" + +using namespace std; + +ChromList::ChromList(char* filename) { + char buffer[1024]; string delim("\t"); + ifstream chrom_file(filename); + while(chrom_file.getline(buffer, 1024)) { + string temp(buffer); + vector words; + split(temp, delim,words); + if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); } + (*this)[ words[0] ] = words[1]; + } + chrom_file.close(); +} diff --git a/htswanalysis/src/ValidationDesign/chrom_list.h b/htswanalysis/src/ValidationDesign/chrom_list.h new file mode 100644 index 0000000..6e5446b --- /dev/null +++ b/htswanalysis/src/ValidationDesign/chrom_list.h @@ -0,0 +1,27 @@ +#include + +#ifndef CHROMLIST_H +#define CHROMLIST_H + +using namespace std; +using namespace __gnu_cxx; + +struct eq_str { + bool operator() (const string& a, const string& b) const { + return(strcmp(a.c_str(), b.c_str()) == 0); + } +}; + +struct hash_str { + size_t operator() (const string& a) const { + hash h; + return(h(a.c_str())); + } +}; + +class ChromList : public hash_map { + public: + ChromList(char* filename); +}; + +#endif diff --git a/htswanalysis/src/ValidationDesign/defs.h b/htswanalysis/src/ValidationDesign/defs.h new file mode 100644 index 0000000..e2be395 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/defs.h @@ -0,0 +1 @@ +#define PRIMER_3_PATH "/Library/WebServer/CGI-Executables/ValidationDesign/primer3_core" diff --git a/htswanalysis/src/ValidationDesign/main.cpp b/htswanalysis/src/ValidationDesign/main.cpp new file mode 100644 index 0000000..5a6d1cb --- /dev/null +++ b/htswanalysis/src/ValidationDesign/main.cpp @@ -0,0 +1,191 @@ +/* + * main.cpp + * WindowLocator + * + * Created by Alan You on 8/11/08. + * Copyright 2008 Hudson Alpha. All rights reserved. + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "window.h" +#include "util.h" + +using namespace std; +using namespace __gnu_cxx; + +struct eq_str { + bool operator() (const string& a, const string& b) const { + return(strcmp(a.c_str(), b.c_str()) == 0); + } +}; + +struct hash_str { + size_t operator() (const string& a) const { + hash h; + return(h(a.c_str())); + } +}; + +typedef hash_map ChromList; + +void importWindows(string& filename, string& peakDataTitle, vector& peaks); +void retrieveSequenceData(ChromList chrom_files, vector& peaks); +void outputSequenceData(string peakDataTitle, vector& peaks); +void outputPrimer3Input(vector& peaks); + +int main (int argc, char * const argv[]) +{ + if(argc < 3) { + cout << "Usage: " << argv[0] << " bed_file_of_peaks chromosome_file" << endl; + exit(1); + } + + string filename(argv[1]); + string chrom_list(argv[2]); + + pid_t pid; + int rv; + int input_pipe[2]; + + setvbuf(stdout, (char*)NULL, _IONBF, 0); + if (pipe(input_pipe)) { + cout << "Pipe error!" << endl; + exit(1); + } + + if ((pid = fork()) == -1) { + cout << "Fork error!" << endl; + exit(1); + } + + //child + if (pid == 0) { + dup2(input_pipe[0], 0); close(input_pipe[1]); + if (execlp("primer3_core", "primer3_core", NULL) == -1){ + cerr << "execl error!" << endl; + exit(1); + } + cerr << "Done with primer3\n"; + } else { + ChromList chromosome_files; + char buffer[1024]; string delim("\t"); + ifstream chrom_file(chrom_list.c_str()); + while(chrom_file.getline(buffer, 1024)) { + string temp(buffer); + vector words; + split(temp, delim,words); + if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); } + chromosome_files[ words[0] ] = words[1]; + } + chrom_file.close(); + + // ChIP-seq peak data (BED format required) + ifstream peakDataFile; + + // Title line from ChIP-seq peak data + string peakDataTitle; + + // All peak calls. Each peak call contains chromomosme number, starting base pair, post-ending base + // pair, peak length, score, and corresponding sequence. + vector peaks; + + dup2(input_pipe[1], 1); close(input_pipe[0]); + importWindows(filename, peakDataTitle, peaks); + retrieveSequenceData(chromosome_files, peaks); + outputPrimer3Input(peaks); + close(input_pipe[1]); + close(1); + wait(&rv); + } + + return 0; +} + +void importWindows(string& filename, string& peakDataTitle, vector& peaks) +{ + ifstream peakDataFile; + peakDataFile.open(filename.c_str()); + + char temp[1024]; + + peakDataFile.getline(temp, 1024); + peakDataTitle = temp; + + while(peakDataFile.getline(temp, 1024)) { + if(peakDataFile.eof()) { peakDataFile.close(); break; } + Window inputCall(temp); + peaks.push_back(inputCall); + } + + sort(peaks.begin(), peaks.end()); + +} + +void retrieveSequenceData(ChromList chrom_filenames, vector& peaks) +{ + char temp[1024]; + + string chrom = peaks[0].chr; + string chrom_filename = chrom_filenames[chrom]; + ifstream chrom_file(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + size_t offset = chrom_file.gcount(); + for(vector::iterator i = peaks.begin(); i != peaks.end(); ++i) { + if(i->chr != chrom) { + chrom = i->chr; + chrom_filename = chrom_filenames[chrom]; + chrom_file.close(); chrom_file.open(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + offset = chrom_file.gcount(); + } + unsigned int begin = i->start; + unsigned int end = i->end; + + unsigned int begin_pos = offset + (int)begin/50 + begin; + unsigned int end_pos = offset + (int)end/50 + end; + + unsigned int read_len = end_pos - begin_pos; + char buffer[read_len+1]; + chrom_file.seekg(begin_pos, ios_base::beg); + chrom_file.read(buffer, read_len+1); + buffer[read_len] = '\0'; + i->sequence = buffer; + unsigned int a; + while( (a= ((i->sequence).find("\n"))) != string::npos) { + i->sequence.erase(a,1); + } + } + chrom_file.close(); +} + +void outputPrimer3Input(vector& peaks) +{ + cout << "PRIMER_NUM_RETURN=1" << endl; + cout << "PRIMER_LOWERCASE_MASKING=1" << endl; + //cout << "PRIMER_TM_SANTALUCIA=1" << endl; + cout << "PRIMER_SELF_END=2" << endl; + cout << "PRIMER_SELF_ANY=6" << endl; + cout << "PRIMER_OPT_SIZE=23" << endl; + cout << "PRIMER_MIN_SIZE=19" << endl; + cout << "PRIMER_MAX_SIZE=26" << endl; + cout << "PRIMER_MIN_TM=63" << endl; + cout << "PRIMER_OPT_TM=65" << endl; + cout << "PRIMER_MAX_TM=67" << endl; + cout << "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl; + + unsigned int idx = 0; + for(vector::iterator i = peaks.begin(); i != peaks.end(); ++i) { + //cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << idx++ << "_" << i->score << endl; + cout << "PRIMER_SEQUENCE_ID=" << i->name << i->score << endl; + cout << "SEQUENCE=" << i->sequence.c_str() << endl; + cout << "=" << endl; + } +} diff --git a/htswanalysis/src/ValidationDesign/peak.cpp b/htswanalysis/src/ValidationDesign/peak.cpp new file mode 100644 index 0000000..239ce19 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/peak.cpp @@ -0,0 +1,106 @@ +/* + * peak.cpp + * PeakLocator + * + * Created by Alan You on 8/11/08. + * Copyright 2008 Hudson Alpha. All rights reserved. + * + */ + +#include +#include + +#include "peak.h" +#include "util.h" + +using namespace std; + +Peak::Peak() +{ + this->startBP = 0; + this->endBP = 0; + this->length = 0; + this->chrom = ""; + this->score = 0; + this->sequence = ""; +} + +Peak::Peak(string line) +{ + istringstream outStream; + + vector fields; + string delim("\t"); + split(line,delim,fields); + + this->chrom = fields[0]; + this->startBP = atoi(fields[1].c_str()); + this->endBP = atoi(fields[2].c_str()); + this->score = strtod(fields[3].c_str(), NULL); + this->length = this->endBP - this->startBP; + this->sequence = ""; +} + +Peak::Peak(const Peak& p) +{ + this->startBP = p.startBP; + this->endBP = p.endBP; + this->length = p.length; + this->chrom = p.chrom; + this->score = p.score; + this->sequence = p.sequence; +} + +Peak::~Peak() +{ +; +} + +void Peak::print() +{ + cout << "Chromosome:\t\t" << this->chrom.c_str() << endl; + cout << "Starting Base Pair:\t" << this->startBP << endl; + cout << "Ending Base Pair:\t" << this->endBP << endl; + cout << "Sequence Length:\t" << this->length << endl; + cout << "Sequence Score:\t\t" << this->score << endl; + cout << "Sequence:\t\t" << this->sequence << endl; +} + +void Peak::printToFile(ofstream &outputFile) +{ + outputFile << "Chromosome:\t\t" << this->chrom.c_str() << endl; + outputFile << "Starting Base Pair:\t" << this->startBP << endl; + outputFile << "Ending Base Pair:\t" << this->endBP << endl; + outputFile << "Sequence Length:\t" << this->length << endl; + outputFile << "Sequence Score:\t\t" << this->score << endl; + outputFile << "Sequence:\t\t" << this->sequence << endl; + outputFile << endl; +} + +bool Peak::operator<(const Peak& that) const +{ + if (this->chrom == that.chrom) { return(this->startBP < that.startBP); + } else { return(this->chrom < that.chrom); } +} + +bool Peak::operator>(const Peak& that) const +{ + if (this->chrom == that.chrom) { return(this->startBP > that.startBP); + } else { return(this->chrom > that.chrom); } + +} + +Peak& Peak:: operator=(const Peak& that) +{ + if (this != &that) + { + this->chrom = that.chrom; + this->startBP = that.startBP; + this->endBP = that.endBP; + this->length = that.length; + this->score = that.score; + this->sequence = that.sequence; + } + + return *this; +} diff --git a/htswanalysis/src/ValidationDesign/peak.h b/htswanalysis/src/ValidationDesign/peak.h new file mode 100644 index 0000000..7238de7 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/peak.h @@ -0,0 +1,34 @@ +/* + * peak.h + * PeakLocator + * + * Created by Alan You on 8/11/08. + * Copyright 2008 Hudson Alpha. All rights reserved. + * + */ +#ifndef PEAK_H +#define PEAK_H + +#include +#include +#include +#include +#include + + +using namespace std; + +class Peak +{ +public: + Peak(); + Peak(string line); + Peak(const Peak& p); + ~Peak(); + + bool operator<(const Peak& that) const; + bool operator>(const Peak& that) const; + Peak& operator=(const Peak& that); +}; + +#endif diff --git a/htswanalysis/src/ValidationDesign/qPCR.cpp b/htswanalysis/src/ValidationDesign/qPCR.cpp new file mode 100755 index 0000000..00f12ae --- /dev/null +++ b/htswanalysis/src/ValidationDesign/qPCR.cpp @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define WINDOW 25 + +using namespace std; + + +void int_to_chromstr(int chrom, char* chr); +int chromstr_to_int(string chrom); + +void split (const string& text, const string& separators, vector& words); + +class Read { + public: + string chr; + int pos; + bool strand; + + Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } + Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; } + Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;} +}; + +class Hit { + public: + int start; + int end; + bool strand; + + string chr; + string name; + + unsigned int count[2]; + + Hit(string name, string chr, int start, int end, bool strand) { + this->name = name; + this->chr = chr; + this->start = start; + this->end = end; + this->strand = strand; + count[0] = 0; + count[1] = 0; + } + + Hit(const Hit& h) { + this->name = h.name; + this->chr = h.chr; + this->start = h.start; + this->end = h.end; + this->strand = h.strand; + count[0] = h.count[0]; + count[1] = h.count[1]; + } + + Hit& operator=(const Hit& h) { + this->name = h.name; + this->chr = h.chr; + this->start = h.start; + this->end = h.end; + this->strand = h.strand; + count[0] = h.count[0]; + count[1] = h.count[1]; + return *this; + } +}; + +bool Feat_lt_Read(const Hit& feat, const Read& read) { + //First compare based on chromosomes + if(read.chr == feat.chr) { return feat.end < read.pos; } + else { return feat.chr < read.chr; } +} + +ostream &operator<<( ostream &out, const Hit &h ) { + out << h.name.c_str() << "\tchr" << h.chr.c_str() << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count; + return out; +} + +typedef vector Hits; +typedef vector Reads; + +bool compare_hits(const Hit &a, const Hit &b) { if(a.chr == b.chr) { return a.end < b.start; } else { return a.chr < b.chr; } } +bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } + +void read_features(const char* filename, Hits& features) { + string delim("\t"); + + ifstream feat(filename); + size_t N = 0; + while(feat.peek() != EOF) { + char line[1024]; + feat.getline(line,1024,'\n'); + N++; + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() < 5) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; } + + string name = fields[0]; + string chr = fields[1]; + if(chr == "NA") { continue; } + int start = atoi(fields[2].c_str()); + int stop = atoi(fields[3].c_str()); + bool strand = 0; + + Hit feat(name,chr,start,stop,strand); + features.push_back(feat); + } + + //sort the features so we can run through it once + std::stable_sort(features.begin(),features.end(),compare_hits); + feat.close(); +} + +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "NA") { continue; } + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + if(strand == 0) { + Read read(chr,pos,false); + features.push_back(read); + } else { + Read read(chr,pos+25,true); + features.push_back(read); + } + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end(),compare_reads); + cerr << "Reads sorted\n"; +} + +void count_read_in_features(Hits& features, Reads& data, unsigned int idx) { + Hits::iterator feat_it = features.begin(); + + for(Reads::iterator i = data.begin(); i != data.end(); ++i) { + //skip to first feature after read + string start_chr = feat_it->chr; + while(feat_it != features.end() && Feat_lt_Read(*feat_it, *i)) { + feat_it++; + } + + //stop if we have run out of features. + if(feat_it == features.end()) { break; } + + if(i->pos >= feat_it->start && i->pos <= feat_it->end) { + feat_it->count[idx]++; + } + } +} + +int getdir (string dir, vector &files) +{ + DIR *dp; + struct dirent *dirp; + if((dp = opendir(dir.c_str())) == NULL) { + cout << "Error(" << errno << ") opening " << dir << endl; + return errno; + } + + while ((dirp = readdir(dp)) != NULL) { + if(dirp->d_name[0] == '.') { continue; } + files.push_back(dir + "/" + string(dirp->d_name)); + } + closedir(dp); + return 0; +} + +int compare_double(const void* a, const void* b) { + return *(double*)a - *(double*)b; +} + +int main(int argc, char** argv) { + if(argc < 4) { cerr << "Usage: " << argv[0] << " reads bg test_directory\n"; exit(1); } + + char read_filename[1024]; strcpy(read_filename,argv[1]); + char bg_filename[1024]; strcpy(bg_filename,argv[2]); + string testdir(argv[3]); + Reads reads; read_align_file(read_filename, reads); + Reads bg; read_align_file(bg_filename, bg); + + vector tests; + getdir(testdir,tests); + + for(unsigned int i = 0; i < tests.size(); i++) { + Hits features; + read_features(tests[i].c_str(), features); + count_read_in_features(features, reads,0); + count_read_in_features(features, bg,1); + double read_data[features.size()]; + double bg_data[features.size()]; + //double readZ = reads.size()/1000000.0; + //double bgZ = bg.size()/1000000.0; + + // temporary hack to get raw read counts + double readZ = 1.0; double bgZ = 1.0; + for(unsigned int idx = 0; idx < features.size(); idx++) { + read_data[idx] = ((double)features[idx].count[0]/readZ); + bg_data[idx] = ((double)features[idx].count[1]/bgZ); + cout << features[idx].name << "\t" << features[idx].chr << ":" << features[idx].start << "-" << features[idx].end << "\t" << read_data[idx] << "\t" << bg_data[idx] << endl; + } + double enrichment = gsl_stats_mean(read_data,1,features.size())/gsl_stats_mean(bg_data,1,features.size()); + //qsort(data, features.size(), sizeof(double), compare_double); + //double median = gsl_stats_median_from_sorted_data(data, 1, features.size()); + //double var = gsl_stats_variance_m(data,1,features.size(),mean); + cout << tests[i].c_str() << "\t" << enrichment << endl; + } +} + +void split (const string& text, const string& separators, vector& words) { + + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} diff --git a/htswanalysis/src/ValidationDesign/refine_peaks.cpp b/htswanalysis/src/ValidationDesign/refine_peaks.cpp new file mode 100644 index 0000000..a100922 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/refine_peaks.cpp @@ -0,0 +1,278 @@ +#include +#include +#include +#include +#include + +#include "defs.h" +#include "window.h" +#include "read.h" +#include "util.h" +#include "chrom_list.h" + +using namespace std; + +bool Wind_lt_Read(const Window& feat, const Read& read) { if(read.chr == feat.chr) { return feat.end < read.pos; } else { return feat.chr < read.chr; } } + +extern bool compare_windows(const Window &a, const Window &b); +extern bool compare_reads(const Read &a, const Read &b); + +void readFromPipe( int fd ); +void read_windows(const char* filename, Windows& windows); +void read_align_file(char* filename, Reads& features); +void count_reads_in_windows(Reads& data, Windows& windows); +void recenter_windows(Windows& windows, unsigned int width); +void resize_windows(Windows& windows, unsigned int width); + +void retrieveSequenceData(ChromList chrom_files, Windows& peaks); +void outputPrimer3Input(Windows& peaks, Windows& calls); + +int main(int argc, char** argv) { + if(argc != 4) { cerr << "Usage: " << argv[0] << " peaks.bed aligned_reads chromosome_file" << endl; return(0); } + char window_filename[1024]; strcpy(window_filename,argv[1]); + char reads_filename[1024]; strcpy(reads_filename,argv[2]); + char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]); + + pid_t pid; + int rv; + int pipe_to_primer[2]; + //int pipe_from_primer[2]; + + setvbuf(stdout, (char*)NULL, _IONBF, 0); + + if (pipe(pipe_to_primer)) { cout << "Pipe error!" << endl; exit(1); } + //if (pipe(pipe_from_primer)) { cout << "Pipe error!" << endl; exit(1); } + if ((pid = fork()) == -1) { cout << "Fork error!" << endl; exit(1); } + + //child + if (pid == 0) { + dup2(pipe_to_primer[0], 0); close(pipe_to_primer[1]); + //dup2(pipe_from_primer[1], 1); close(pipe_from_primer[0]); + if (execl(PRIMER_3_PATH, "primer3_core", NULL) == -1){ cerr << "execl error!" << endl; exit(1); } + } else { + dup2(pipe_to_primer[1], 1); close(pipe_to_primer[0]); + //dup2(pipe_from_primer[0], 0); close(pipe_from_primer[1]); + + Reads reads; + ChromList chromosome_files(chromosome_filename); + + // Title line from ChIP-seq peak data + string peakDataTitle; + + // All peak calls. Each peak call contains chromomosme number, starting base pair, post-ending base + // pair, peak length, score, and corresponding sequence. + Windows windows; + read_windows(window_filename, windows); + + read_align_file(reads_filename, reads); + count_reads_in_windows(reads, windows); + for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) { i->count_hits(); } + + Windows targets(windows); + recenter_windows(targets,60); + count_reads_in_windows(reads, targets); + + unsigned int max_reads = 0; + unsigned int min_reads = 1e8; + Windows::iterator max, min; + for(Windows::iterator i = targets.begin(); i != targets.end(); ++i) { + i->count_hits(); + if(i->total > max_reads) { max_reads = i->total; max = i; } + if(i->total < min_reads) { min_reads = i->total; min = i; } + cerr << i->total << " at " << i->chr << " " << i->start << " " << i->end << endl; + } + + cerr << "Max: " << max_reads << " at " << max->chr << " " << max->start << " " << max->end << endl << "Min: " << min_reads << " at " << min->chr << " " << min->start << " " << min->end << endl; + + retrieveSequenceData(chromosome_files, targets); + outputPrimer3Input(targets, windows); + close(pipe_to_primer[1]); + close(1); + //readFromPipe(pipe_from_primer[0]); + //close(pipe_from_primer[0]); + wait(&rv); + } + return 0; +} + +void readFromPipe( int fd ) +{ + FILE *stream; + int ch; + if ( (stream = fdopen( fd, "r" )) == NULL ) + { + perror( "fdopen() r" ); + exit(255); + } + while ( (ch = getc( stream )) != EOF ) + putc( ch, stdout ); + fflush( stdout ); + fclose( stream ); +} + +void read_windows(const char* filename, Windows& windows) { + string delim("\t"); + + ifstream feat(filename); + unsigned int N = 0; + while(feat.peek() != EOF) { + char line[1024]; + feat.getline(line,1024,'\n'); + N++; + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() < 4) { cerr << "Error (" << filename << "): wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; + continue; + } + + string chr = fields[0]; + if(chr == "NA") { continue; } + int start = atoi(fields[1].c_str()); + int stop = atoi(fields[2].c_str()); + double score = strtod(fields[3].c_str(),NULL); + + char name[100]; sprintf(name,"Window %d",N); + Window w(string(name),chr,start-300,stop+300,score); + windows.push_back(w); + } + + //sort the features so we can run through it once + std::stable_sort(windows.begin(),windows.end(),compare_windows); + feat.close(); +} + +void count_reads_in_windows(Reads& data, Windows& windows) { + Windows::iterator wind_it = windows.begin(); + + for(Reads::iterator i = data.begin(); i != data.end(); ++i) { + //skip to first window after read + while(wind_it != windows.end() && Wind_lt_Read(*wind_it, *i)) { + wind_it++; + } + + //stop if we have run out of windows. + if(wind_it == windows.end()) { break; } + + if(i->pos >= wind_it->start && i->pos < wind_it->end) { + int pos = i->pos - wind_it->start; + wind_it->counts[pos]++; + } + } +} + +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "NA") { continue; } + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + if(strand == 0) { + Read read(chr,pos,strand); + features.push_back(read); + } else { + Read read(chr,pos+25,strand); + features.push_back(read); + } + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end(),compare_reads); + cerr << "Reads sorted\n"; +} + +//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region. +void recenter_windows(Windows& windows, unsigned int width) { + for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) { + unsigned int max_count = 0; unsigned int max_offset = 0; + for(unsigned int count_i = 0; count_i < i->counts.size(); count_i++) { + if(i->counts[count_i] > max_count) { max_count = i->counts[count_i]; max_offset = count_i; } + } + unsigned int newstart = i->start + max_offset - width; + unsigned int newend = i->start + max_offset + width; + i->reset(newstart, newend); + } +} + +//Look at reads in each window, and pick out the max. Return target regions of width-bp before and after the region. +void resize_windows(Windows& windows, unsigned int width) { + for(Windows::iterator i = windows.begin(); i != windows.end(); ++i) { + unsigned int middle = i->start + i->end / 2.0; + i->reset(middle - width, middle + width); + } +} + + +void retrieveSequenceData(ChromList chrom_filenames, Windows& peaks) { + char temp[1024]; + + string chrom = peaks[0].chr; + string chrom_filename = chrom_filenames[chrom]; + ifstream chrom_file(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + size_t offset = chrom_file.gcount(); + for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) { + if(i->chr != chrom) { + chrom = i->chr; + chrom_filename = chrom_filenames[chrom]; + chrom_file.close(); chrom_file.open(chrom_filename.c_str()); + chrom_file.getline(temp, 1024); + offset = chrom_file.gcount(); + } + unsigned int begin = i->start; + unsigned int end = i->end; + + unsigned int begin_pos = offset + (int)begin/50 + begin; + unsigned int end_pos = offset + (int)end/50 + end; + + unsigned int read_len = end_pos - begin_pos; + char buffer[read_len+1]; + chrom_file.seekg(begin_pos, ios_base::beg); + chrom_file.read(buffer, read_len+1); + buffer[read_len] = '\0'; + i->sequence = buffer; + unsigned int a; + while( (a= ((i->sequence).find("\n"))) != string::npos) { + i->sequence.erase(a,1); + } + } + chrom_file.close(); +} + +void outputPrimer3Input(Windows& peaks, Windows& calls) { + cout << "PRIMER_NUM_RETURN=1" << endl; + cout << "PRIMER_LOWERCASE_MASKING=1" << endl; + cout << "PRIMER_SELF_END=2" << endl; + cout << "PRIMER_SELF_ANY=6" << endl; + cout << "PRIMER_OPT_SIZE=23" << endl; + cout << "PRIMER_MIN_SIZE=19" << endl; + cout << "PRIMER_MAX_SIZE=26" << endl; + cout << "PRIMER_MIN_TM=63" << endl; + cout << "PRIMER_OPT_TM=65" << endl; + cout << "PRIMER_MAX_TM=67" << endl; + cout << "PRIMER_PRODUCT_SIZE_RANGE=60-80 50-100" << endl; + + Windows::iterator j = calls.begin(); + for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) { + cout << "PRIMER_SEQUENCE_ID=" << i->chr << "_" << i->start << "_" << i->end << "_" << i->score << "_" << i->total << "_" << j->start << "_" << j->end << "_" << j->total << endl; + cout << "SEQUENCE=" << i->sequence.c_str() << endl; + cout << "=" << endl; + j++; + } +} diff --git a/htswanalysis/src/ValidationDesign/util.cpp b/htswanalysis/src/ValidationDesign/util.cpp new file mode 100644 index 0000000..dcaf73e --- /dev/null +++ b/htswanalysis/src/ValidationDesign/util.cpp @@ -0,0 +1,21 @@ +#ifndef UTIL_H +#define UTIL_H + +#include +#include + +using namespace std; + +void split (const string& text, const string& separators, vector& words) { + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} + +#endif diff --git a/htswanalysis/src/ValidationDesign/util.h b/htswanalysis/src/ValidationDesign/util.h new file mode 100644 index 0000000..8a73b63 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/util.h @@ -0,0 +1,11 @@ +#ifndef UTIL_H +#define UTIL_H + +#include +#include + +using namespace std; + +void split (const string& text, const string& separators, vector& words); + +#endif diff --git a/htswanalysis/src/ValidationDesign/window.cpp b/htswanalysis/src/ValidationDesign/window.cpp new file mode 100644 index 0000000..4459004 --- /dev/null +++ b/htswanalysis/src/ValidationDesign/window.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include + +#include "window.h" +#include "util.h" + +void Window::count_hits() { + total = 0; + for(vector::iterator i = counts.begin(); i != counts.end(); ++i) { + total += *i; + } +} + +Window::Window(string name, string chr, int start, int end, double score) { + this->name = name; + this->chr = chr; + this->start = start; + this->end = end; + this->score = score; + counts.clear(); + counts.resize(end-start+1,0); + this->sequence = ""; +} + +Window::Window(const Window& r) { + this->name = r.name; + this->score= r.score; + this->chr = r.chr; + this->sequence = r.sequence; + this->start = r.start; + this->end = r.end; + this->counts = r.counts; +} + +Window& Window::operator=(const Window& r) { + this->name = r.name; + this->score = r.score; + this->chr = r.chr; + this->sequence = r.sequence; + this->start = r.start; + this->end = r.end; + this->counts = r.counts; + return *this; +} + +ostream &operator<<( ostream &out, const Window &w ) { + out << w.chr.c_str() << "\t" << w.start << "\t" << w.end << "\t" << w.total; + return out; +} + +bool Window::operator<(const Window& b) const { + if(this->chr == b.chr) { return this->start < b.start; } else { return this->chr < b.chr; } } + + +Window::Window(string line) { + istringstream outStream; + + vector fields; + string delim("\t"); + split(line,delim,fields); + + this->name = fields[0]; + this->chr = fields[1]; + this->start = atoi(fields[2].c_str()); + this->end = atoi(fields[3].c_str()); + this->score = strtod(fields[4].c_str(), NULL); + this->sequence = ""; +} + +void Window::reset(unsigned int new_start, unsigned int new_end) { + this->start = new_start; + this->end = new_end; + this->counts.clear(); + this->counts.resize(end-start+1,0); +} + +bool compare_windows(const Window &a, const Window &b) { if(a.chr == b.chr) { return a.start < b.start; } else { return a.chr < b.chr; } } diff --git a/htswanalysis/src/profile_reads_against_features.cpp b/htswanalysis/src/profile_reads_against_features.cpp index 8cc62ef..7f32b93 100755 --- a/htswanalysis/src/profile_reads_against_features.cpp +++ b/htswanalysis/src/profile_reads_against_features.cpp @@ -11,18 +11,15 @@ using namespace std; vector profile(2000,0); -void int_to_chromstr(int chrom, char* chr); -int chromstr_to_int(string chrom); - void split (const string& text, const string& separators, vector& words); class Hit { public: - int chr; + string chr; unsigned int pos; bool strand; - Hit(int chr, unsigned int pos, bool strand) { + Hit(string chr, unsigned int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } }; @@ -60,8 +57,8 @@ int main(int argc, char** argv) { split(line_str, delim, fields); if(fields.size() != 3) { cerr << "Error: wrong number of fields in feature list (line " << N << " has " << fields.size() << " fields)\n"; } - int chr = chromstr_to_int(fields[0]); - if(chr == -1) { continue; } + string chr = fields[0]; + int pos = atoi(fields[1].c_str()); bool strand = (bool)(atoi(fields[2].c_str())); @@ -78,7 +75,9 @@ int main(int argc, char** argv) { location_delim = ":"; char strand_str[2]; strand_str[1] = '\0'; ifstream seqs(filename); + unsigned int linenum = 0; while(seqs.peek() != EOF) { + cerr << ++linenum << endl; char line[2048]; seqs.getline(line,2048,'\n'); string line_str(line); @@ -87,8 +86,8 @@ int main(int argc, char** argv) { if(fields.size() == 3) { continue; } vector location; split(fields[3], location_delim, location); - int chr = chromstr_to_int(location[0]); - if(chr == -1) { continue; } + string chr = location[0]; + int pos = atoi(location[1].c_str()); bool strand = ((fields[4].c_str())[0] == 'F')?0:1; @@ -102,17 +101,17 @@ int main(int argc, char** argv) { //sort the data so we can run through it once std::sort(data.begin(),data.end(),compare_hits); - int chrom = -1; + string chrom = ""; unsigned int feat_idx = 0; int num_hits = 0; for(Hits::iterator i = data.begin(); i != data.end(); ++i) { num_hits++; - if(chrom == -1 || i->chr != chrom) { + if(chrom == "" || i->chr != chrom) { chrom = i->chr; feat_idx = 0; while(feat_idx < features.size() && features[feat_idx].chr != chrom) { feat_idx++; } if(feat_idx == features.size()) { break; } - cerr << chrom << " feat_idx: " << feat_idx << endl; + cerr << chrom.c_str() << " feat_idx: " << feat_idx << endl; } int dist_to_feature = i->pos - features[feat_idx].pos; @@ -137,29 +136,6 @@ int main(int argc, char** argv) { } } -int chromstr_to_int(string chromstr) { - const char* chrom = chromstr.c_str(); - if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; } - if(chrom[3] == 'X') { return 23; } else - if(chrom[3] == 'Y') { return 24; } else - if(chrom[3] == 'M') { return 25; } else - if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else - { return -1; } -} - -void int_to_chromstr(int chrom, char* chr) { - strcpy(chr,"chr\0\0\0"); - switch(chrom) { - case 23: chr[3]='X'; break; - case 24: chr[3]='Y'; break; - case 25: chr[3]='M'; break; - default: chr[3]=(char)(48+(int)(chrom/10)); - chr[4]=(char)(48+(int)(chrom - 10*(int)(chrom/10))); - if(chr[3]=='0') { chr[3]=chr[4]; chr[4]='\0'; } - } -} - - void split (const string& text, const string& separators, vector& words) { size_t n = text.length (); -- 2.30.2