From: Tim Reddy Tim Date: Sat, 3 Jan 2009 20:21:32 +0000 (+0000) Subject: Added new program to profile reads using bedgraph format. Deep sequencing projects... X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=93cbbdfdc05a251c1f01c4d9d4e09956ff99f421 Added new program to profile reads using bedgraph format. Deep sequencing projects go beyong the limits of a wiggle file size set by ucsc --- diff --git a/htswanalysis/src/Makefile b/htswanalysis/src/Makefile index 4b22aa9..dbeb066 100644 --- a/htswanalysis/src/Makefile +++ b/htswanalysis/src/Makefile @@ -1,7 +1,7 @@ CPPFLAGS=-g -Wall -O3 -I/opt/local/include LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib -TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks ValidationDesign/refine_peaks Methylseq/Methylseq_Analysis +TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks GetReadsInSnps/getReadsInSnps ValidationDesign/refine_peaks Methylseq/Methylseq_Analysis count_reads_in_regions profile_reads_bedgraph all: $(TARGETS) @@ -11,6 +11,9 @@ ValidationDesign/refine_peaks: ValidationDesign/refine_peaks.cpp Methhylseq/Methylseq_Analysis: Methylseq/Methylseq_Analysis.cpp cd Methylseq && $(MAKE) +GetReadsInSnps/getReadsInSnps: GetReadsInSnps/getReadsInSnps.cpp + cd GetReadsInSnps && $(MAKE) + install: $(TARGETS) cp $^ ../bin/ diff --git a/htswanalysis/src/profile_reads_bedgraph.cpp b/htswanalysis/src/profile_reads_bedgraph.cpp new file mode 100644 index 0000000..a94c147 --- /dev/null +++ b/htswanalysis/src/profile_reads_bedgraph.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include + +#define WINDOW 25 + +using namespace std; + +class Read { + public: + string chr; + int pos; + bool strand; + + Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } + Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; } + Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;} +}; + +typedef vector Reads; + +void print_wig_header(char* name, char* desc) { + //cout << "browser position chr1" << endl; + cout << "browser hide all" << endl; + cout << "browser full altGraph" << endl; + cout << "track type=bedGraph name=" << name << " description=" << desc << " visibility=full autoScale=on color=0,0,0 priority=10" << endl; +} + +bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } + +void split (const string& text, const string& separators, vector& words); +void read_align_file(char* filename, Reads& features); + +int main(int argc, char** argv) { + if(argc != 4) { cerr << "Usage: " << argv[0] << " Aligned_reads Wiggle_Track_Name Wiggle_Track_Description" << endl; return(0); } + char filename[1024]; strcpy(filename,argv[1]); + Reads data; + + char track_name[1024]; strcpy(track_name, argv[2]); + char track_desc[1024]; strcpy(track_desc, argv[3]); + + print_wig_header(track_name, track_desc); + read_align_file(filename, data); + + unsigned int n_aligned_reads = data.size(); + cerr << n_aligned_reads/1000000.00 << "M reads\n"; + + queue Q; + for(Reads::iterator i = data.begin(); i != data.end();) { + if(Q.empty() || Q.front().chr != (*i).chr) { + while(!Q.empty()) { Q.pop(); } + } + + string chr_string = i->chr; + int posnum = i->pos; + while(i != data.end() && i->chr == chr_string && i->pos == posnum) { + Q.push(*i); ++i; + } + + while(!Q.empty() && Q.front().pos + WINDOW < Q.back().pos) { + Q.pop(); + } + + if(Q.size() > 1 && (i+1) != data.end() && Q.back().chr == (i+1)->chr) { + cout << Q.back().chr << " " << Q.back().pos << " " << (i+1)->pos << " " << Q.size() / (double)(n_aligned_reads/1000000.0) << endl; + } + } +} + +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "NA") { continue; } + if(chr.substr(0,3) != "chr") { continue; } + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + if(strand == 0) { + Read read(chr,pos,false); + features.push_back(read); + } else { + Read read(chr,pos+25,true); + features.push_back(read); + } + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end(),compare_reads); + cerr << "Reads sorted\n"; +} + +void split (const string& text, const string& separators, vector& words) { + + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +}