From ae36eae6b593cc36c01135d86aa46c4c510d3178 Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Wed, 3 Sep 2008 14:32:09 +0000 Subject: [PATCH] Updates to profile reads to be more general in how chromosomes are handled --- htswanalysis/src/profile_reads_wig.cpp | 147 +++++++++++++------------ 1 file changed, 75 insertions(+), 72 deletions(-) diff --git a/htswanalysis/src/profile_reads_wig.cpp b/htswanalysis/src/profile_reads_wig.cpp index 58be9dd..4e58b45 100644 --- a/htswanalysis/src/profile_reads_wig.cpp +++ b/htswanalysis/src/profile_reads_wig.cpp @@ -8,8 +8,18 @@ using namespace std; -typedef pair Hit; -typedef vector Hits; +class Read { + public: + string chr; + int pos; + bool strand; + + Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; } + Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; } + Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;} +}; + +typedef vector Reads; void print_wig_header(char* name, char* desc) { //cout << "browser position chr1" << endl; @@ -18,100 +28,93 @@ void print_wig_header(char* name, char* desc) { cout << "track type=wig name=" << name << " description=" << desc << " visibility=full autoScale=on color=0,0,0 priority=10" << endl; } -bool compare_hits(const Hit &a, const Hit &b) -{ - if(a.first == b.first) { - return a.second < b.second; - } else { - return a.first < b.first; - } -} +bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } } + +void split (const string& text, const string& separators, vector& words); +void read_align_file(char* filename, Reads& features); int main(int argc, char** argv) { if(argc != 4) { cerr << "Usage: " << argv[0] << " Aligned_reads Wiggle_Track_Name Wiggle_Track_Description" << endl; return(0); } char filename[1024]; strcpy(filename,argv[1]); - Hits data; + Reads data; char track_name[1024]; strcpy(track_name, argv[2]); char track_desc[1024]; strcpy(track_desc, argv[3]); print_wig_header(track_name, track_desc); + read_align_file(filename, data); - ifstream seqs(filename); - while(seqs.peek() != EOF) { - char chr_str[32]; - char pos_str[32]; - char n_match_str[32]; - char chr_start[4]; - - seqs.ignore(128,' '); - seqs.ignore(128,' '); - seqs.getline(n_match_str,32,' '); - n_match_str[seqs.gcount()] = '\0'; - unsigned int nmatch = atoi(n_match_str); - if(nmatch != 1) { seqs.ignore(1024,'\n'); continue; } - - seqs.get(chr_start,4); - chr_start[3] = '\0'; - if(chr_start[0] != 'c' || chr_start[1] != 'h' || chr_start[2] != 'r') { seqs.ignore(2048,'\n'); continue; } - seqs.getline(chr_str,32,':'); - chr_str[seqs.gcount()] = '\0'; - seqs.getline(pos_str,32,' '); - pos_str[seqs.gcount()] = '\0'; - seqs.ignore(1024,'\n'); - - unsigned int chr; - - if(chr_str[0] == 'X') { chr = 23; } else - if(chr_str[0] == 'Y') { chr = 24; } else - if(chr_str[0] == 'M' || chr_str[0] == 'm') { chr = 25; } else - if(chr_str[0] == 'R') { continue; } else - { chr = atoi(chr_str); } - - unsigned int pos = atoi(pos_str); - - Hit hit(chr,pos); - data.push_back(hit); - } - seqs.close(); - - std::sort(data.begin(),data.end(),compare_hits); - unsigned int n_aligned_reads = data.size(); cerr << n_aligned_reads/1000000.00 << "M reads\n"; - queue Q; - for(Hits::iterator i = data.begin(); i != data.end();) { - if(Q.empty() || Q.front().first != (*i).first) { + queue Q; + for(Reads::iterator i = data.begin(); i != data.end();) { + if(Q.empty() || Q.front().chr != (*i).chr) { while(!Q.empty()) { Q.pop(); } - char chr[3]; - switch(i->first) { - case 23: chr[0]='X'; chr[1]='\0'; break; - case 24: chr[0]='Y'; chr[1]='\0'; break; - case 25: chr[0]='M'; chr[1]='\0'; break; - case 26: chr[0]='R'; chr[1]='\0'; break; - default: chr[0]=(char)(48+(int)(i->first/10)); - chr[1]=(char)(48+(int)(i->first - 10*(int)(i->first/10))); - if(chr[0]=='0') { chr[0]=chr[1]; chr[1]='\0'; } - } - chr[2] = '\0'; - cout << "variableStep chrom=chr" << chr << endl; + cout << "variableStep chrom=" << i->chr.c_str() << endl; } - unsigned int chrnum = i->first; - unsigned int posnum = i->second; - while(i != data.end() && i->first == chrnum && i->second == posnum) { + string chr_string = i->chr; + int posnum = i->pos; + while(i != data.end() && i->chr == chr_string && i->pos == posnum) { Q.push(*i); ++i; } - while(!Q.empty() && Q.front().second + WINDOW < Q.back().second) { + while(!Q.empty() && Q.front().pos + WINDOW < Q.back().pos) { Q.pop(); } if(Q.size() > 1) { - //cout << Q.back().second << " " << log(Q.size()) << endl; - cout << Q.back().second << " " << Q.size() / (double)(n_aligned_reads/1000000.0) << endl; + cout << Q.back().pos << " " << Q.size() / (double)(n_aligned_reads/1000000.0) << endl; } } } +void read_align_file(char* filename, Reads& features) { + string delim(" \n"); + string location_delim(":"); + char strand_str[2]; strand_str[1] = '\0'; + ifstream seqs(filename); + string name(""); + while(seqs.peek() != EOF) { + char line[2048]; + seqs.getline(line,2048,'\n'); + + string line_str(line); + vector fields; + split(line_str, delim, fields); + if(fields.size() != 7) { continue; } + + vector location; split(fields[3], location_delim, location); + string chr = location[0]; + if(chr == "NA") { continue; } + int pos = atoi(location[1].c_str()); + bool strand = ((fields[4].c_str())[0] == 'F')?0:1; + + if(strand == 0) { + Read read(chr,pos,false); + features.push_back(read); + } else { + Read read(chr,pos+25,true); + features.push_back(read); + } + } + seqs.close(); + + //sort the data so we can run through it once + std::sort(features.begin(),features.end(),compare_reads); + cerr << "Reads sorted\n"; +} + +void split (const string& text, const string& separators, vector& words) { + + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} -- 2.30.2