From 516c261bb3730e06198c7aaa3a2b384dad0b218b Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Sun, 4 Jan 2009 01:27:11 +0000 Subject: [PATCH] Added new profile program to make a bedgraph file. This is partically the result of size limits imposed on the wig files by ucsc. As an added feature, two bedgraph tracks are actually propduced, which give the forward and reverse strand reads in different colors. --- htswanalysis/src/Makefile | 4 +- htswanalysis/src/SRLib/loci.cpp | 6 +- htswanalysis/src/SRLib/loci.h | 1 + htswanalysis/src/profile_reads_bedgraph.cpp | 147 +++++++++++--------- 4 files changed, 92 insertions(+), 66 deletions(-) diff --git a/htswanalysis/src/Makefile b/htswanalysis/src/Makefile index dbeb066..bbbaf5f 100644 --- a/htswanalysis/src/Makefile +++ b/htswanalysis/src/Makefile @@ -1,5 +1,5 @@ CPPFLAGS=-g -Wall -O3 -I/opt/local/include -LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib +LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib -L./SRLib/ TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks GetReadsInSnps/getReadsInSnps ValidationDesign/refine_peaks Methylseq/Methylseq_Analysis count_reads_in_regions profile_reads_bedgraph @@ -14,6 +14,8 @@ Methhylseq/Methylseq_Analysis: Methylseq/Methylseq_Analysis.cpp GetReadsInSnps/getReadsInSnps: GetReadsInSnps/getReadsInSnps.cpp cd GetReadsInSnps && $(MAKE) +profile_reads_bedgraph: profile_reads_bedgraph.cpp SRLib/loci.cpp SRLib/util.cpp + install: $(TARGETS) cp $^ ../bin/ diff --git a/htswanalysis/src/SRLib/loci.cpp b/htswanalysis/src/SRLib/loci.cpp index c846f89..779a9a4 100644 --- a/htswanalysis/src/SRLib/loci.cpp +++ b/htswanalysis/src/SRLib/loci.cpp @@ -1,6 +1,6 @@ #include #include -#include "Loci.h" +#include "loci.h" using namespace std; @@ -26,6 +26,10 @@ bool Loci::operator<(const Loci& a) const { return((this->chr == a.chr)?(this->pos < a.pos):(this->chr < a.chr)); } +bool Loci::operator==(const Loci& a) const { + return(this->chr == a.chr && this->pos == a.pos); +} + bool Loci::operator<=(const Loci& a) const { return((this->chr == a.chr)?(this->pos <= a.pos):(this->chr < a.chr)); } diff --git a/htswanalysis/src/SRLib/loci.h b/htswanalysis/src/SRLib/loci.h index cfa9562..b196a1e 100644 --- a/htswanalysis/src/SRLib/loci.h +++ b/htswanalysis/src/SRLib/loci.h @@ -14,6 +14,7 @@ class Loci { Loci(const Loci& l); Loci& operator=(const Loci& l); + bool operator==(const Loci& a) const; bool operator<(const Loci& a) const; bool operator<=(const Loci& a) const; bool operator>=(const Loci& a) const; diff --git a/htswanalysis/src/profile_reads_bedgraph.cpp b/htswanalysis/src/profile_reads_bedgraph.cpp index a94c147..be54749 100644 --- a/htswanalysis/src/profile_reads_bedgraph.cpp +++ b/htswanalysis/src/profile_reads_bedgraph.cpp @@ -1,75 +1,103 @@ #include +#include #include #include -#include #include +#include "SRLib/loci.h" +#include "SRLib/util.h" + #define WINDOW 25 using namespace std; -class Read { +class EndPoint : public Loci { public: - string chr; - int pos; - bool strand; + bool type; //0 = start, 1 = end + + EndPoint(string chr, unsigned int pos, bool type) : Loci(chr,pos) { this->type = type; } + EndPoint(const EndPoint &e) : Loci(e) { this->type = e.type; } + EndPoint& operator=(const EndPoint& e) { + if(this == &e) return *this; - Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } - Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; } - Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;} + Loci::operator=(e); + this->type = e.type; + return *this; + } }; -typedef vector Reads; +typedef vector Ends; -void print_wig_header(char* name, char* desc) { - //cout << "browser position chr1" << endl; - cout << "browser hide all" << endl; - cout << "browser full altGraph" << endl; - cout << "track type=bedGraph name=" << name << " description=" << desc << " visibility=full autoScale=on color=0,0,0 priority=10" << endl; +void print_browser_line() { + cout << "browser hide all" << endl << "browser pack refGene encodeRegions" << endl << "browser full altGraph" << endl; } -bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } +void print_wig_header(string name, string desc, string color = "200,100,0", string altColor = "0,100,200") { + //cout << "browser position chr1" << endl; + cout << "track type=bedGraph name=" << name << " description=" << desc << " visibility=full "; + cout << "color=" << color << " altColor="<< altColor << " priority=10 maxHeightPixels=128:64:11" << endl; +} -void split (const string& text, const string& separators, vector& words); -void read_align_file(char* filename, Reads& features); +void read_align_file(char* filename, Ends& f, Ends& r); +void print_data(Ends::iterator i, Ends::iterator end, double Z); int main(int argc, char** argv) { if(argc != 4) { cerr << "Usage: " << argv[0] << " Aligned_reads Wiggle_Track_Name Wiggle_Track_Description" << endl; return(0); } char filename[1024]; strcpy(filename,argv[1]); - Reads data; - char track_name[1024]; strcpy(track_name, argv[2]); - char track_desc[1024]; strcpy(track_desc, argv[3]); + Ends f, r; - print_wig_header(track_name, track_desc); - read_align_file(filename, data); - - unsigned int n_aligned_reads = data.size(); - cerr << n_aligned_reads/1000000.00 << "M reads\n"; - - queue Q; - for(Reads::iterator i = data.begin(); i != data.end();) { - if(Q.empty() || Q.front().chr != (*i).chr) { - while(!Q.empty()) { Q.pop(); } - } - - string chr_string = i->chr; - int posnum = i->pos; - while(i != data.end() && i->chr == chr_string && i->pos == posnum) { - Q.push(*i); ++i; - } + string track_name = argv[2]; + string track_desc = argv[3]; - while(!Q.empty() && Q.front().pos + WINDOW < Q.back().pos) { - Q.pop(); - } + read_align_file(filename, f, r); + + unsigned int n_aligned_reads = (f.size() + r.size()) / 2; + double Z = n_aligned_reads/1000000.0; + cerr << Z << "M reads\n"; + + print_browser_line(); + print_wig_header(track_name+"_forward", track_desc+"_forward"); + print_data(f.begin(),f.end(), Z); + print_wig_header(track_name+"_reverse", track_desc+"_reverse"); + print_data(r.begin(),r.end(), -1.0*Z); +} - if(Q.size() > 1 && (i+1) != data.end() && Q.back().chr == (i+1)->chr) { - cout << Q.back().chr << " " << Q.back().pos << " " << (i+1)->pos << " " << Q.size() / (double)(n_aligned_reads/1000000.0) << endl; +void print_data(Ends::iterator i, Ends::iterator end, double Z) { + EndPoint begin_mark("start",0,0); + EndPoint prev = begin_mark; + int count = 0; + do { + + //ok, first look at the next endpoint, and output the end of the corrent window. + i++; + if(i->chr != prev.chr) + count = 0; + + // Using a very low noise threshold to help make for sane file sizes. Can override if need to see everything + // but for large datasets, no threshold can easily produce gigabyte sized files. This threshold drops the file + // size to about 1/8th the original size(of course, depending on the dataset). + else if(count > 2) + cout << prev.chr << "\t" << prev.pos << "\t" << i->pos << "\t" << count / Z << endl; + + //now, count the new endpoint, and reset the prev marker + if(i->type == 0) { count++; } else { count--; } + prev = *i; + + //finally, mark the current point, and while new points are the same position, keep adding them + while(i+1 != end && ((Loci)(*(i+1)) == (Loci)prev)) { + i++; + if(i->type == 0) { count++; } else { count--; } } - } + } while(i+1 != end); } -void read_align_file(char* filename, Reads& features) { +/* example align file format +CTTCAATGACCACTGGCCTCCCTG 12500 1 chr22:15254780 R CAGGGAGGCCAGTGGTCATTGAAGC 11453 +AGCGTGGGCAACAGGGAACGTGATG 11453 1 chr22:44584170 R CATCCCGTTCCCTGTTGCCCACGCT 9359 +*/ + +void read_align_file(char* filename, Ends& f, Ends& r) { string delim(" \n"); string location_delim(":"); char strand_str[2]; strand_str[1] = '\0'; @@ -85,36 +113,27 @@ void read_align_file(char* filename, Reads& features) { if(fields.size() != 7) { continue; } vector location; split(fields[3], location_delim, location); + string chr = location[0]; - if(chr == "NA") { continue; } - if(chr.substr(0,3) != "chr") { continue; } int pos = atoi(location[1].c_str()); bool strand = ((fields[4].c_str())[0] == 'F')?0:1; if(strand == 0) { - Read read(chr,pos,false); - features.push_back(read); + EndPoint s(chr,pos,0); + EndPoint e(chr,pos+25,1); + f.push_back(s); + f.push_back(e); } else { - Read read(chr,pos+25,true); - features.push_back(read); + EndPoint s(chr,pos,0); + EndPoint e(chr,pos+25,1); + r.push_back(s); + r.push_back(e); } } seqs.close(); //sort the data so we can run through it once - std::sort(features.begin(),features.end(),compare_reads); + std::sort(f.begin(),f.end()); + std::sort(r.begin(),r.end()); cerr << "Reads sorted\n"; } - -void split (const string& text, const string& separators, vector& words) { - - size_t n = text.length (); - size_t start = text.find_first_not_of (separators); - - while (start < n) { - size_t stop = text.find_first_of (separators, start); - if (stop > n) stop = n; - words.push_back (text.substr (start, stop-start)); - start = text.find_first_not_of (separators, stop+1); - } -} -- 2.30.2