#include <gsl/gsl_statistics.h>
-#include "chrom_list.h"
-#include "util.h"
+#include "../SRLib/chrom_list.h"
+#include "../SRLib/util.h"
+#include "../SRLib/loci.h"
+#include "../SRLib/read.h"
+#include "../SRLib/nuc.h"
#define WINDOW 25
#define PI 3.14159265358979323846
using namespace std;
-void strrevcomp(string& output, const string& input);
-
-double norm_prob(double x, double mu, double s) { return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); }
-
-class Loci {
- public:
- string chr;
- unsigned int pos;
-
- Loci(string chr, unsigned int pos) { this->chr = chr; this->pos = pos; }
- Loci(const Loci& l) { this->chr = l.chr; this->pos = l.pos; }
- Loci& operator=(const Loci& l) { this->chr = l.chr; this->pos = l.pos; return *this; }
-
- bool operator<(const Loci& a) const { if(this->chr == a.chr) { return this->pos < a.pos; } else { return this->chr < a.chr; } }
- bool operator<=(const Loci& a) const { if(this->chr == a.chr) { return this->pos <= a.pos; } else { return this->chr < a.chr; } }
-
- bool operator>=(const Loci& a) const { if(this->chr == a.chr) { return this->pos >= a.pos; } else { return this->chr > a.chr; } }
- bool operator>(const Loci& a) const { if(this->chr == a.chr) { return this->pos > a.pos; } else { return this->chr > a.chr; } }
-
- int operator-(const Loci& a) const { if(this->chr == a.chr) { return this->pos - a.pos; } else { return INT_MAX; } }
-
-};
-
-
-class Read : public Loci {
- public:
- string seq;
-
- unsigned int length() const { return seq.length(); }
-
- Read(string chr, unsigned int pos, string seq) : Loci(chr,pos) { this->seq = seq; }
- Read(const Read& r) : Loci(r) { this->seq = r.seq; }
- Read& operator=(const Read& r) { this->chr = r.chr; this->pos = r.pos; this->seq = r.seq; return *this;}
-
- char operator[](size_t off) const {
- if(off < seq.length()) { return seq[off]; } else { return -1; }
- }
-};
-
-typedef vector<Read> Reads;
-
-class Nuc {
- protected:
- unsigned int n[4];
-
- //background nucleotide probabilities
- //
- //can change this to a background model class later if needed
- static double qA;
- static double qC;
- static double qG;
- static double qT;
-
- // pseudocount to avoid divide-by-zero errors
- static double pseudocount;
-
- public:
-
- Nuc() {
- n[0] = 0;
- n[1] = 0;
- n[2] = 0;
- n[3] = 0;
- }
-
- Nuc(const Nuc& n) {
- this->n[0] = n.at(0);
- this->n[1] = n.at(1);
- this->n[2] = n.at(2);
- this->n[3] = n.at(3);
- }
-
- Nuc& operator=(const Nuc& n) {
- if (this != &n) {
- this->n[0] = n.at(0);
- this->n[1] = n.at(1);
- this->n[2] = n.at(2);
- this->n[3] = n.at(3);
- }
- return *this;
- }
-
- void add_nuc(char b) {
- switch(b) {
- case 'a': case 'A': n[0]++; break;
- case 'c': case 'C': n[1]++; break;
- case 'g': case 'G': n[2]++; break;
- case 't': case 'T': n[3]++; break;
- };
- }
-
- char nth_nuc(unsigned int i) {
- if(i >= size()) { return 'N'; }
- else if(i < n[0]) { return 'A'; }
- else if(i < n[0] + n[1]) { return 'C'; }
- else if(i < n[0] + n[1] + n[2]) { return 'G'; }
- else { return 'T'; }
- }
-
- unsigned int size() { return n[0] + n[1] + n[2] + n[3]; }
-
- unsigned int& operator[](size_t b) { return n[b]; }
- unsigned int at(size_t b) const { return n[b]; }
-
- double RE() {
-
- /*
- double total = n[0] + n[1] + n[2] + n[3] + 4*Nuc::pseudocount;
- double pA = (Nuc::pseudocount + n[0]) / total;
- double pC = (Nuc::pseudocount + n[1]) / total;
- double pG = (Nuc::pseudocount + n[2]) / total;
- double pT = (Nuc::pseudocount + n[3]) / total;
-
- return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT);
- */
-
- unsigned int max = 0; unsigned int max_idx = 0;
- for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
- unsigned int max2 = 0; unsigned int max_idx2 = 0;
- for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } }
-
- if(max_idx == max_idx2) { max_idx2++; }
-
- double total = n[max_idx] + n[max_idx2];
- double p1 = (Nuc::pseudocount + n[max_idx]) / total;
- double p2 = (Nuc::pseudocount + n[max_idx2]) / total;
-
- return p1*log2(p1/Nuc::qA) + p2*log2(p2/Nuc::qC);
- }
-
- char consensus() {
- unsigned int N = size();
- if(N == 0) { return ' '; }
- unsigned int max = 0; unsigned int max_idx = 0;
- for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
-
- unsigned int max2 = 0; unsigned int max_idx2 = 0;
- for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } }
-
- //For now pick arbitrary zygosity thresholds. Later, update to use mixture model.
- char c = '\0';
- if(RE() >= 1.25) {
- //homozygous
- switch(max_idx) {
- case 0: c = 'A'; break;
- case 1: c = 'C'; break;
- case 2: c = 'G'; break;
- case 3: c = 'T'; break;
- }
- } else {
- switch(max_idx | max_idx2) {
- case 1: c = 'M'; break; //A,C
- case 2: c = 'R'; break; //A,G
- case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G
- case 4: c = 'Y'; break; //C,T
- case 5: c = 'K'; break; //G,T
- }
- }
-
- if(N < 10) { return tolower(c); } else { return c; }
- }
-};
-
-double Nuc::pseudocount = 1e-10;
-double Nuc::qA = 0.25;
-double Nuc::qC = 0.25;
-double Nuc::qG = 0.25;
-double Nuc::qT = 0.25;
-
class Window : public Loci {
public:
//optional name for the window
Window(string name, string chr, unsigned int pos, unsigned int length) : Loci(chr,pos) {
this->name = name;
this->length = length;
- this->sequence = "";
-
if(length > 10000) { cerr << "ERROR: window of size " << length << endl; exit(1); }
- seq.resize(length);
+ this->sequence = string(length,'\0');
+ seq.resize(this->sequence.length());
this->reads = 0;
}
unsigned int a;
//clear out endlines
while( (a = (sequence.find("\n"))) != string::npos) { sequence.erase(a,1); }
+ seq.clear(); seq.resize(sequence.length());
}
string get_sequence() {
void add_read(const Read& r) {
if(this->chr != r.chr) return;
- int offset = r - (*this);
- this->reads++;
- for(unsigned int i = 0; i < r.length(); i++) {
- int seq_idx = offset + i;
- if(seq_idx > 0 && (unsigned)seq_idx >= seq.size()) { cerr << "Error: offset: " << offset << " seq_idx " << seq_idx << " size: " << seq.size() << " seqlen: " << sequence.length() << endl; }
- if(seq_idx < 0 || (seq_idx >= 0 && (unsigned)seq_idx > this->length) ) { continue; }
- seq[offset + i].add_nuc(r[i]);
+
+ int w_offset = 0;
+ int r_offset = 0;
+ int overlap_len = r.length();
+
+ //if the read begins before the window starts:
+ // ---(====[====)------]------
+ if(r.pos < this->pos) {
+ r_offset = this->pos - r.pos;
+ w_offset = 0;
+ } else {
+ w_offset = r.pos - this->pos;
+ r_offset = 0;
+ }
+
+ //if the read ends after the window ends
+ // ----[----(====]======)-----
+ if(r.pos + r.length() >= this->pos + this->length) {
+ overlap_len = (this->pos + this->length) - r.pos;
+ } else {
+ overlap_len = r.length();
}
+
+ for(; overlap_len > 0; overlap_len--) {
+ seq[w_offset++].add_nuc(r[r_offset++]);
+ }
+
+ this->reads++;
}
void print_consensus(ostream& o) {
void print_fasta(ostream& o) {
unsigned int line_len = 100;
+
+ if(sequence.length() != seq.size()) {
+ cerr << "Size mismatch: sequence is " << sequence.length() << " but data has " << seq.size() << endl;
+ }
string output = "";
vector<string> variants;
char ref = toupper(sequence[i]);
char con = toupper(seq[i].consensus());
if(con != ' ' && con != ref) {
- o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i][0] << "," << seq[i][1] << "," << seq[i][2] << "," << seq[i][3] << "]" << endl;
+ o << i << ":" << seq[i].consensus() << " (" << seq[i].RE() << ") -- [" << seq[i].A() << "," << seq[i].C() << "," << seq[i].G() << "," << seq[i].T() << "]" << endl;
}
}
}
void print_logo(ostream& o) {
unsigned int max = 0;
for(unsigned int i = 0; i < sequence.length(); i++) {
- if(seq[i].size() > max) { max = seq[i].size(); }
+ if(seq[i].N() > max) { max = seq[i].N(); }
}
for(unsigned int i = 0; i < max; i++) {
std::stable_sort(snps.begin(),snps.end());
feat.close();
- cerr << "Found and sorted " << snps.size() << " snps." << endl;
+ cerr << "Found AND sorted " << snps.size() << " snps." << endl;
}
void read_align_file(char* filename, Reads& features) {
bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
string seq;
- if(strand == 0) { seq = fields[0]; } else { strrevcomp(seq,fields[0]); }
- Read read(chr,pos,seq);
+ if(strand == 0) { seq = fields[0]; } else { revcomp(seq,fields[0]); }
+ Read read(chr,pos,0,seq);
features.push_back(read);
}
seqs.close();
for(Windows::iterator i = peaks.begin(); i != peaks.end(); ++i) {
if(i->chr != chrom) {
chrom = i->chr;
+ cout << "XXX: " << (*(chrom_filenames.find(chrom))).first << endl;
+ if(chrom_filenames.find(chrom) == chrom_filenames.end()) {
+ cerr << "Chrom: " << chrom << " not found\n";
+ }
chrom_filename = chrom_filenames[chrom];
chrom_file.close(); chrom_file.open(chrom_filename.c_str());
chrom_file.getline(temp, 1024);
char chromosome_filename[1024]; strcpy(chromosome_filename,argv[3]);
Windows windows; read_window_file(window_filename, windows);
+
+ if(windows.size() == 0) { cout << "No windows loaded." << endl; exit(0); }
ChromList reference_seq(chromosome_filename);
retrieveSequenceData(reference_seq, windows);
Reads reads; read_align_file(read_filename, reads);
+ if(reads.size() == 0) { cout << "No reads loaded." << endl; exit(0); }
+
count_read_in_features(windows, reads);
for(Windows::iterator w = windows.begin(); w != windows.end(); ++w) {
w->print_fasta(cout);
}
}
-
-void strrevcomp(string& output, const string& input)
-{
- output = input;
- unsigned int i;
-
- for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; }
-
- for (unsigned int p1 = 0; p1 < output.length(); ++p1) {
- if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; }
- else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; }
- else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; }
- else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; }
- }
-}
-