Added new program to profile reads using bedgraph format. Deep sequencing projects...
authorTim Reddy Tim <treddy@hudsonalpha.org>
Sat, 3 Jan 2009 20:21:32 +0000 (20:21 +0000)
committerTim Reddy Tim <treddy@hudsonalpha.org>
Sat, 3 Jan 2009 20:21:32 +0000 (20:21 +0000)
htswanalysis/src/Makefile
htswanalysis/src/profile_reads_bedgraph.cpp [new file with mode: 0644]

index 4b22aa9406be97b3d5e28c09722d3d9a7670529a..dbeb0669ffd52374183bc150ba4ace57598b2dab 100644 (file)
@@ -1,7 +1,7 @@
 CPPFLAGS=-g -Wall -O3 -I/opt/local/include
 LDFLAGS=-lgsl -lgslcblas -lm -L/opt/local/lib
 
-TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks ValidationDesign/refine_peaks Methylseq/Methylseq_Analysis
+TARGETS=complexity_count qPCR profile_reads_against_features profile_reads_wig count_reads_in_peaks GetReadsInSnps/getReadsInSnps ValidationDesign/refine_peaks Methylseq/Methylseq_Analysis count_reads_in_regions profile_reads_bedgraph
 
 all: $(TARGETS)
 
@@ -11,6 +11,9 @@ ValidationDesign/refine_peaks: ValidationDesign/refine_peaks.cpp
 Methhylseq/Methylseq_Analysis: Methylseq/Methylseq_Analysis.cpp
        cd Methylseq && $(MAKE)
 
+GetReadsInSnps/getReadsInSnps: GetReadsInSnps/getReadsInSnps.cpp
+       cd GetReadsInSnps && $(MAKE)
+
 install: $(TARGETS)
        cp $^ ../bin/
 
diff --git a/htswanalysis/src/profile_reads_bedgraph.cpp b/htswanalysis/src/profile_reads_bedgraph.cpp
new file mode 100644 (file)
index 0000000..a94c147
--- /dev/null
@@ -0,0 +1,120 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <queue>
+#include <math.h>
+
+#define WINDOW 25
+
+using namespace std;
+
+class Read {
+  public:
+    string chr;
+    int pos;
+    bool strand;
+
+    Read(string chr, int pos, bool strand) { this->chr = chr; this->pos = pos; this->strand = strand; }
+    Read(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; }
+    Read& operator=(const Read& r) {this->chr = r.chr; this->pos = r.pos; this->strand = r.strand; return *this;}
+};
+
+typedef vector<Read> Reads;
+
+void print_wig_header(char* name, char* desc) {
+  //cout << "browser position chr1" << endl;
+  cout << "browser hide all" << endl;
+  cout << "browser full altGraph" << endl;
+  cout << "track type=bedGraph name=" << name << " description=" << desc << " visibility=full autoScale=on color=0,0,0 priority=10" << endl;
+}
+
+bool compare_reads(const Read &a, const Read &b) { if(a.chr == b.chr) { return a.pos < b.pos; } else { return a.chr < b.chr; } }
+
+void split (const string& text, const string& separators, vector<string>& words);
+void read_align_file(char* filename, Reads& features);
+
+int main(int argc, char** argv) {
+  if(argc != 4) { cerr << "Usage: " << argv[0] << " Aligned_reads Wiggle_Track_Name Wiggle_Track_Description" << endl; return(0); }
+  char filename[1024]; strcpy(filename,argv[1]);
+  Reads data;
+
+  char track_name[1024]; strcpy(track_name, argv[2]);
+  char track_desc[1024]; strcpy(track_desc, argv[3]);
+
+  print_wig_header(track_name, track_desc);
+  read_align_file(filename, data);
+  
+  unsigned int n_aligned_reads = data.size();
+  cerr << n_aligned_reads/1000000.00 << "M reads\n";
+  queue<Read> Q;
+  for(Reads::iterator i = data.begin(); i != data.end();) {
+    if(Q.empty() || Q.front().chr != (*i).chr) {
+      while(!Q.empty()) { Q.pop(); }
+    }
+
+    string chr_string = i->chr;
+    int posnum = i->pos;
+    while(i != data.end() && i->chr == chr_string && i->pos == posnum) {
+      Q.push(*i); ++i; 
+    }
+
+    while(!Q.empty() && Q.front().pos + WINDOW < Q.back().pos) { 
+      Q.pop(); 
+    }
+
+    if(Q.size() > 1 && (i+1) != data.end() && Q.back().chr == (i+1)->chr) {
+      cout << Q.back().chr << " " << Q.back().pos << " " << (i+1)->pos << " " << Q.size() / (double)(n_aligned_reads/1000000.0) << endl;
+    }
+  }
+}
+
+void read_align_file(char* filename, Reads& features) {
+  string delim(" \n");
+  string location_delim(":");
+  char strand_str[2]; strand_str[1] = '\0';
+  ifstream seqs(filename);
+  string name("");
+  while(seqs.peek() != EOF) {
+    char line[2048];
+    seqs.getline(line,2048,'\n');
+
+    string line_str(line);
+    vector<string> fields;
+    split(line_str, delim, fields);
+    if(fields.size() != 7) { continue; }
+    vector<string> location; split(fields[3], location_delim, location);
+    string chr = location[0];
+    if(chr == "NA") { continue; }
+    if(chr.substr(0,3) != "chr") { continue; }
+    int pos = atoi(location[1].c_str());
+    bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
+
+    if(strand == 0) {
+      Read read(chr,pos,false); 
+      features.push_back(read);
+    } else {
+      Read read(chr,pos+25,true); 
+      features.push_back(read);
+    }
+  }
+  seqs.close(); 
+
+  //sort the data so we can run through it once
+  std::sort(features.begin(),features.end(),compare_reads);
+  cerr << "Reads sorted\n";
+}
+
+void split (const string& text, const string& separators, vector<string>& words) {
+
+    size_t n     = text.length ();
+    size_t start = text.find_first_not_of (separators);
+
+    while (start < n) {
+        size_t stop = text.find_first_of (separators, start);
+        if (stop > n) stop = n;
+        words.push_back (text.substr (start, stop-start));
+        start = text.find_first_not_of (separators, stop+1);
+    }
+}