From 77ec969e1ed55530be8d8682a807cff4801834d8 Mon Sep 17 00:00:00 2001 From: Tim Reddy Tim Date: Wed, 24 Dec 2008 21:12:44 +0000 Subject: [PATCH] Initial check in of c++ short read programming library. These classes give functionality to store and to basic processing on short read data and gneomic coordinates. Functions include loading in a list of reads, mapping them to selected windows, identifying variants and heterozygosities in the regions, and outputting the sequences and variants. The goal is to start to consolidate the various versions of these classes scattered amongst the existing code in order to create a more robust and easy to improve code base. --- htswanalysis/src/SRLib/Makefile | 4 + htswanalysis/src/SRLib/chrom_list.cpp | 27 +++++ htswanalysis/src/SRLib/chrom_list.h | 31 ++++++ htswanalysis/src/SRLib/defs.h | 1 + htswanalysis/src/SRLib/loci.cpp | 45 +++++++++ htswanalysis/src/SRLib/loci.h | 23 +++++ htswanalysis/src/SRLib/nuc.cpp | 136 ++++++++++++++++++++++++++ htswanalysis/src/SRLib/nuc.h | 75 ++++++++++++++ htswanalysis/src/SRLib/read.cpp | 45 +++++++++ htswanalysis/src/SRLib/read.h | 35 +++++++ htswanalysis/src/SRLib/util.cpp | 38 +++++++ htswanalysis/src/SRLib/util.h | 15 +++ 12 files changed, 475 insertions(+) create mode 100644 htswanalysis/src/SRLib/Makefile create mode 100644 htswanalysis/src/SRLib/chrom_list.cpp create mode 100644 htswanalysis/src/SRLib/chrom_list.h create mode 100644 htswanalysis/src/SRLib/defs.h create mode 100644 htswanalysis/src/SRLib/loci.cpp create mode 100644 htswanalysis/src/SRLib/loci.h create mode 100644 htswanalysis/src/SRLib/nuc.cpp create mode 100644 htswanalysis/src/SRLib/nuc.h create mode 100644 htswanalysis/src/SRLib/read.cpp create mode 100644 htswanalysis/src/SRLib/read.h create mode 100644 htswanalysis/src/SRLib/util.cpp create mode 100644 htswanalysis/src/SRLib/util.h diff --git a/htswanalysis/src/SRLib/Makefile b/htswanalysis/src/SRLib/Makefile new file mode 100644 index 0000000..a2a4ddd --- /dev/null +++ b/htswanalysis/src/SRLib/Makefile @@ -0,0 +1,4 @@ +LDFLAGS=-lm +CPPFLAGS=-g -Wall -O3 + +all: loci.o read.o util.o nuc.o diff --git a/htswanalysis/src/SRLib/chrom_list.cpp b/htswanalysis/src/SRLib/chrom_list.cpp new file mode 100644 index 0000000..eee95d3 --- /dev/null +++ b/htswanalysis/src/SRLib/chrom_list.cpp @@ -0,0 +1,27 @@ +/* ChromList: + * Reads in a tab delimited file of the format: + * + */ + +#include +#include +#include +#include + +#include "chrom_list.h" +#include "util.h" + +using namespace std; + +ChromList::ChromList(char* filename) { + char buffer[1024]; string delim("\t"); + ifstream chrom_file(filename); + while(chrom_file.getline(buffer, 1024)) { + string temp(buffer); + vector words; + split(temp, delim,words); + if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); } + (*this)[ words[0] ] = words[1]; + } + chrom_file.close(); +} diff --git a/htswanalysis/src/SRLib/chrom_list.h b/htswanalysis/src/SRLib/chrom_list.h new file mode 100644 index 0000000..6a4579a --- /dev/null +++ b/htswanalysis/src/SRLib/chrom_list.h @@ -0,0 +1,31 @@ +/* chrom_list: + * Class used to store a mapping of chromosome name (as a string), to the corresponding file. + */ + +#include + +#ifndef CHROMLIST_H +#define CHROMLIST_H + +using namespace std; +using namespace __gnu_cxx; + +struct eq_str { + bool operator() (const string& a, const string& b) const { + return(strcmp(a.c_str(), b.c_str()) == 0); + } +}; + +struct hash_str { + size_t operator() (const string& a) const { + hash h; + return(h(a.c_str())); + } +}; + +class ChromList : public hash_map { + public: + ChromList(char* filename); +}; + +#endif diff --git a/htswanalysis/src/SRLib/defs.h b/htswanalysis/src/SRLib/defs.h new file mode 100644 index 0000000..b71c839 --- /dev/null +++ b/htswanalysis/src/SRLib/defs.h @@ -0,0 +1 @@ +#define PRIMER_3_PATH "/opt/local/bin/primer3_core" diff --git a/htswanalysis/src/SRLib/loci.cpp b/htswanalysis/src/SRLib/loci.cpp new file mode 100644 index 0000000..c846f89 --- /dev/null +++ b/htswanalysis/src/SRLib/loci.cpp @@ -0,0 +1,45 @@ +#include +#include +#include "Loci.h" + +using namespace std; + +Loci::Loci(string chr, unsigned int pos) { + this->chr = chr; + this->pos = pos; +} + +Loci::Loci(const Loci& l) { + this->chr = l.chr; + this->pos = l.pos; +} + +Loci& Loci::operator=(const Loci& l) { + if(this != &l) { + this->chr = l.chr; + this->pos = l.pos; + } + return *this; +} + +bool Loci::operator<(const Loci& a) const { + return((this->chr == a.chr)?(this->pos < a.pos):(this->chr < a.chr)); +} + +bool Loci::operator<=(const Loci& a) const { + return((this->chr == a.chr)?(this->pos <= a.pos):(this->chr < a.chr)); +} + +bool Loci::operator>=(const Loci& a) const { + return((this->chr == a.chr)?(this->pos >= a.pos):(this->chr > a.chr)); +} + +bool Loci::operator>(const Loci& a) const { + return((this->chr == a.chr)?(this->pos > a.pos):(this->chr > a.chr)); +} + + +ostream &operator<<( ostream &out, const Loci &h ) { + out << h.chr << "\t" << h.pos; + return out; +} diff --git a/htswanalysis/src/SRLib/loci.h b/htswanalysis/src/SRLib/loci.h new file mode 100644 index 0000000..cfa9562 --- /dev/null +++ b/htswanalysis/src/SRLib/loci.h @@ -0,0 +1,23 @@ +#include + +#ifndef LOCI +#define LOCI + +using namespace std; + +class Loci { + public: + string chr; + unsigned int pos; + + Loci(string chr, unsigned int pos); + Loci(const Loci& l); + Loci& operator=(const Loci& l); + + bool operator<(const Loci& a) const; + bool operator<=(const Loci& a) const; + bool operator>=(const Loci& a) const; + bool operator>(const Loci& a) const; +}; + +#endif diff --git a/htswanalysis/src/SRLib/nuc.cpp b/htswanalysis/src/SRLib/nuc.cpp new file mode 100644 index 0000000..59a70d5 --- /dev/null +++ b/htswanalysis/src/SRLib/nuc.cpp @@ -0,0 +1,136 @@ +#include +#include + +#include "nuc.h" + +using namespace std; + +double Nuc::pseudocount = 1e-10; + +unsigned int Nuc::strong_coverage_threshold = 10; + +double Nuc::qA = 0.25; +double Nuc::qC = 0.25; +double Nuc::qG = 0.25; +double Nuc::qT = 0.25; + +Nuc::Nuc() { + n[0] = 0; //A + n[1] = 0; //C + n[2] = 0; //G + n[3] = 0; //T +} + +Nuc::Nuc(const Nuc& n) { + this->n[0] = n.at(0); //A + this->n[1] = n.at(1); //C + this->n[2] = n.at(2); //G + this->n[3] = n.at(3); //T +} + +Nuc& Nuc::operator=(const Nuc& n) { + if (this != &n) { + this->n[0] = n.at(0); //A + this->n[1] = n.at(1); //C + this->n[2] = n.at(2); //G + this->n[3] = n.at(3); //T + } + return *this; +} + +//For internal (protected) use only +unsigned int& Nuc::operator[](size_t b) { return n[b]; } + +//TODO: Handle degeneracy? +void Nuc::add_nuc(char b) { + switch(b) { + case 'a': case 'A': n[0]++; break; + case 'c': case 'C': n[1]++; break; + case 'g': case 'G': n[2]++; break; + case 't': case 'T': n[3]++; break; + }; +} + +//nth_nuc +//======= +//Useful to get the listing of all the nucleotides sequenced for this position. +//Note that this function doesn't return the nucleotide in the order presented +char Nuc::nth_nuc(unsigned int i) { + if(i >= N()) { cerr << "Error: requested nucleotide " << i << " but only " << N() << " nucleotides have been recorded\n"; } + else if(i < n[0]) { return 'A'; } + else if(i < n[0] + n[1]) { return 'C'; } + else if(i < n[0] + n[1] + n[2]) { return 'G'; } + return 'T'; +} + +//N +//==== +//returns the number of nucleotides at the position +//todo: may want to make this a cached value so we don't always recalculate +unsigned int Nuc::N() { return n[0] + n[1] + n[2] + n[3]; } + +unsigned int Nuc::at(size_t b) const { return n[b]; } +unsigned int Nuc::A() const { return this->at(0); } +unsigned int Nuc::C() const { return this->at(1); } +unsigned int Nuc::G() const { return this->at(2); } +unsigned int Nuc::T() const { return this->at(3); } + +double Nuc::RE() { + double total = this->N() + 4*Nuc::pseudocount; + double pA = (Nuc::pseudocount + n[0]) / total; + double pC = (Nuc::pseudocount + n[1]) / total; + double pG = (Nuc::pseudocount + n[2]) / total; + double pT = (Nuc::pseudocount + n[3]) / total; + return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT); +} + +double Nuc::chastity() { + unsigned int max = 0; unsigned int max_idx = 0; + for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } + + unsigned int max2 = 0; unsigned int max_idx2 = 0; + for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } } + + if(max_idx == max_idx2) { max_idx2++; } + + double total = (double)n[max_idx] + (double)n[max_idx2]; + return( 2 * (double)n[max_idx]/total ); +} + +char Nuc::consensus() { + unsigned int N = this->N(); + if(N == 0) { return ' '; } + + unsigned int max = 0; unsigned int max_idx = 0; + for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } } + + unsigned int max2 = 0; unsigned int max_idx2 = 0; + for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } } + + //For now pick arbitrary zygosity thresholds. Later, update to use mixture model. + char c = '\0'; + + //homozygous + if(chastity() >= 1.25) { + switch(max_idx) { + case 0: c = 'A'; break; + case 1: c = 'C'; break; + case 2: c = 'G'; break; + case 3: c = 'T'; break; + } + } + + //heterozygous + else { + switch(max_idx | max_idx2) { + case 1: c = 'M'; break; //A,C + case 2: c = 'R'; break; //A,G + case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G + case 4: c = 'Y'; break; //C,T + case 5: c = 'K'; break; //G,T + } + } + + //strong_coverage_threshold is the number of reads needed to make a confident call + if(N < this->strong_coverage_threshold) { return tolower(c); } else { return c; } +} diff --git a/htswanalysis/src/SRLib/nuc.h b/htswanalysis/src/SRLib/nuc.h new file mode 100644 index 0000000..0f1c0b1 --- /dev/null +++ b/htswanalysis/src/SRLib/nuc.h @@ -0,0 +1,75 @@ +#include + +#ifndef NUC_H +#define NUC_H + +class Nuc { + protected: + unsigned int n[4]; + + //background nucleotide probabilities + // + //can change this to a background model class later if needed + static double qA; + static double qC; + static double qG; + static double qT; + + // pseudocount to avoid divide-by-zero errors + static double pseudocount; + static unsigned int strong_coverage_threshold; + + // protected access directly to the data arrray + unsigned int& operator[](size_t b); + + public: + + Nuc(); + Nuc(const Nuc& n); + Nuc& operator=(const Nuc& n); + + void add_nuc(char b); + + //nth_nuc + //======= + //Useful to get the listing of all the nucleotides sequenced for this position. + //Note that this function doesn't return the nucleotide in the order presented + char nth_nuc(unsigned int i); + + //N + //==== + //returns the number of nucleotides at the position + unsigned int N(); + + //at + //== + //allows read-only access to the dta array + unsigned int at(size_t b) const; + unsigned int A() const; + unsigned int C() const; + unsigned int G() const; + unsigned int T() const; + + //RE + //== + //returns the relative entropy of the nucleotide, taking into + //account the background model and pseudocounts + double RE(); + + //chastity + //== + //returns 2 x the ratio of the most prominent nucleotide to the first two nucleotides. + //i.e. a homozygous position has chastity 1, and a heterozygous position has + // chastity ~0.5 + //note, the value is doubled to put it on the same scale and with roughly the same + //interpretation as relative entropy (RE) + double chastity(); + + //consensus: + //========= + //returns the consensus sequence, including IUPAC degenerate bases to represent heterozygosity + char consensus(); +}; + +#endif + diff --git a/htswanalysis/src/SRLib/read.cpp b/htswanalysis/src/SRLib/read.cpp new file mode 100644 index 0000000..e22e834 --- /dev/null +++ b/htswanalysis/src/SRLib/read.cpp @@ -0,0 +1,45 @@ +/* + * Read.cpp: + * + * Implementation of Read class. Since the class consists of public variables (more of a struct, really) there is not much + * here. + * + * compare_reads: function to compare two reads, first be chromosome, then by position. Useful to sort a vector of reads. + */ + +#include +#include + +#include "Read.h" + +Read::Read(string chr, unsigned int pos, bool strand, string seq) : Loci(chr,pos) { + this->strand = strand; + this->seq = seq; +} + +Read::Read(const Read& r) : Loci(r) { + this->strand = strand; + this->seq = r.seq; +} + +Read& Read::operator=(const Read& r) { + if(this == &r) return *this; + + Loci::operator=(r); + this->strand = r.strand; + this->seq = r.seq; + return *this; +} + +unsigned int Read::length() const { + return seq.length(); +} + +//Errors if the offset exceeds the length of the read +char Read::operator[](size_t off) const { + if(off >= this->length()) { + cerr << "Error: index " << off << " out of bounds for read of length " << this->length() << endl; + } + + return this->seq[off]; +} diff --git a/htswanalysis/src/SRLib/read.h b/htswanalysis/src/SRLib/read.h new file mode 100644 index 0000000..1f1fcc0 --- /dev/null +++ b/htswanalysis/src/SRLib/read.h @@ -0,0 +1,35 @@ +/* + * Read.h: + * Class to handle short read data. For generalizty, the chromosome is represented as a string. + * It may make sense to save memeory to introduce a chromosome class, and using a pointer to reference it. + * + * chr: name of the chromosome on which the read maps. + * pos: posiiton of hte read on the chromosome + * strand: diretionality of the read. 0 -> forward, 1 -> reverse. + */ + +#include +#include +#include "loci.h" + +#ifndef READ_H +#define READ_H + +using namespace std; + +class Read : public Loci { + public: + string seq; + bool strand; + + Read(string chr, unsigned int pos, bool strand, string seq); + Read(const Read& r); + Read& operator=(const Read& r); + + char operator[](size_t off) const; + unsigned int length() const; +}; + +typedef vector Reads; + +#endif diff --git a/htswanalysis/src/SRLib/util.cpp b/htswanalysis/src/SRLib/util.cpp new file mode 100644 index 0000000..7359ac2 --- /dev/null +++ b/htswanalysis/src/SRLib/util.cpp @@ -0,0 +1,38 @@ +#include +#include +#include + +#include "util.h" + +using namespace std; + +void split (const string& text, const string& separators, vector& words) { + size_t n = text.length (); + size_t start = text.find_first_not_of (separators); + + while (start < n) { + size_t stop = text.find_first_of (separators, start); + if (stop > n) stop = n; + words.push_back (text.substr (start, stop-start)); + start = text.find_first_not_of (separators, stop+1); + } +} + +void revcomp(string& output, const string& input) +{ + output = input; + unsigned int i; + + for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; } + + for (unsigned int p1 = 0; p1 < output.length(); ++p1) { + if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; } + else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; } + else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; } + else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; } + } +} + +double norm_prob(double x, double mu, double s) { + return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s)); +} diff --git a/htswanalysis/src/SRLib/util.h b/htswanalysis/src/SRLib/util.h new file mode 100644 index 0000000..822a76f --- /dev/null +++ b/htswanalysis/src/SRLib/util.h @@ -0,0 +1,15 @@ +#ifndef UTIL_H +#define UTIL_H + +#include +#include + +#define PI 3.14159265358979323846 + +using namespace std; + +void split (const string& text, const string& separators, vector& words); +void revcomp(string& output, const string& input); +double norm_prob(double x, double mu, double s); + +#endif -- 2.30.2