--- /dev/null
+LDFLAGS=-lm
+CPPFLAGS=-g -Wall -O3
+
+all: loci.o read.o util.o nuc.o
--- /dev/null
+/* ChromList:
+ * Reads in a tab delimited file of the format:
+ * <chromosome> <filename>
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "chrom_list.h"
+#include "util.h"
+
+using namespace std;
+
+ChromList::ChromList(char* filename) {
+ char buffer[1024]; string delim("\t");
+ ifstream chrom_file(filename);
+ while(chrom_file.getline(buffer, 1024)) {
+ string temp(buffer);
+ vector<string> words;
+ split(temp, delim,words);
+ if(words.size() != 2) { cerr << "Error in chrom list." << endl; exit(1); }
+ (*this)[ words[0] ] = words[1];
+ }
+ chrom_file.close();
+}
--- /dev/null
+/* chrom_list:
+ * Class used to store a mapping of chromosome name (as a string), to the corresponding file.
+ */
+
+#include <ext/hash_map>
+
+#ifndef CHROMLIST_H
+#define CHROMLIST_H
+
+using namespace std;
+using namespace __gnu_cxx;
+
+struct eq_str {
+ bool operator() (const string& a, const string& b) const {
+ return(strcmp(a.c_str(), b.c_str()) == 0);
+ }
+};
+
+struct hash_str {
+ size_t operator() (const string& a) const {
+ hash<const char*> h;
+ return(h(a.c_str()));
+ }
+};
+
+class ChromList : public hash_map<string, string, hash_str, eq_str> {
+ public:
+ ChromList(char* filename);
+};
+
+#endif
--- /dev/null
+#define PRIMER_3_PATH "/opt/local/bin/primer3_core"
--- /dev/null
+#include <string>
+#include <iostream>
+#include "Loci.h"
+
+using namespace std;
+
+Loci::Loci(string chr, unsigned int pos) {
+ this->chr = chr;
+ this->pos = pos;
+}
+
+Loci::Loci(const Loci& l) {
+ this->chr = l.chr;
+ this->pos = l.pos;
+}
+
+Loci& Loci::operator=(const Loci& l) {
+ if(this != &l) {
+ this->chr = l.chr;
+ this->pos = l.pos;
+ }
+ return *this;
+}
+
+bool Loci::operator<(const Loci& a) const {
+ return((this->chr == a.chr)?(this->pos < a.pos):(this->chr < a.chr));
+}
+
+bool Loci::operator<=(const Loci& a) const {
+ return((this->chr == a.chr)?(this->pos <= a.pos):(this->chr < a.chr));
+}
+
+bool Loci::operator>=(const Loci& a) const {
+ return((this->chr == a.chr)?(this->pos >= a.pos):(this->chr > a.chr));
+}
+
+bool Loci::operator>(const Loci& a) const {
+ return((this->chr == a.chr)?(this->pos > a.pos):(this->chr > a.chr));
+}
+
+
+ostream &operator<<( ostream &out, const Loci &h ) {
+ out << h.chr << "\t" << h.pos;
+ return out;
+}
--- /dev/null
+#include <string>
+
+#ifndef LOCI
+#define LOCI
+
+using namespace std;
+
+class Loci {
+ public:
+ string chr;
+ unsigned int pos;
+
+ Loci(string chr, unsigned int pos);
+ Loci(const Loci& l);
+ Loci& operator=(const Loci& l);
+
+ bool operator<(const Loci& a) const;
+ bool operator<=(const Loci& a) const;
+ bool operator>=(const Loci& a) const;
+ bool operator>(const Loci& a) const;
+};
+
+#endif
--- /dev/null
+#include <iostream>
+#include <math.h>
+
+#include "nuc.h"
+
+using namespace std;
+
+double Nuc::pseudocount = 1e-10;
+
+unsigned int Nuc::strong_coverage_threshold = 10;
+
+double Nuc::qA = 0.25;
+double Nuc::qC = 0.25;
+double Nuc::qG = 0.25;
+double Nuc::qT = 0.25;
+
+Nuc::Nuc() {
+ n[0] = 0; //A
+ n[1] = 0; //C
+ n[2] = 0; //G
+ n[3] = 0; //T
+}
+
+Nuc::Nuc(const Nuc& n) {
+ this->n[0] = n.at(0); //A
+ this->n[1] = n.at(1); //C
+ this->n[2] = n.at(2); //G
+ this->n[3] = n.at(3); //T
+}
+
+Nuc& Nuc::operator=(const Nuc& n) {
+ if (this != &n) {
+ this->n[0] = n.at(0); //A
+ this->n[1] = n.at(1); //C
+ this->n[2] = n.at(2); //G
+ this->n[3] = n.at(3); //T
+ }
+ return *this;
+}
+
+//For internal (protected) use only
+unsigned int& Nuc::operator[](size_t b) { return n[b]; }
+
+//TODO: Handle degeneracy?
+void Nuc::add_nuc(char b) {
+ switch(b) {
+ case 'a': case 'A': n[0]++; break;
+ case 'c': case 'C': n[1]++; break;
+ case 'g': case 'G': n[2]++; break;
+ case 't': case 'T': n[3]++; break;
+ };
+}
+
+//nth_nuc
+//=======
+//Useful to get the listing of all the nucleotides sequenced for this position.
+//Note that this function doesn't return the nucleotide in the order presented
+char Nuc::nth_nuc(unsigned int i) {
+ if(i >= N()) { cerr << "Error: requested nucleotide " << i << " but only " << N() << " nucleotides have been recorded\n"; }
+ else if(i < n[0]) { return 'A'; }
+ else if(i < n[0] + n[1]) { return 'C'; }
+ else if(i < n[0] + n[1] + n[2]) { return 'G'; }
+ return 'T';
+}
+
+//N
+//====
+//returns the number of nucleotides at the position
+//todo: may want to make this a cached value so we don't always recalculate
+unsigned int Nuc::N() { return n[0] + n[1] + n[2] + n[3]; }
+
+unsigned int Nuc::at(size_t b) const { return n[b]; }
+unsigned int Nuc::A() const { return this->at(0); }
+unsigned int Nuc::C() const { return this->at(1); }
+unsigned int Nuc::G() const { return this->at(2); }
+unsigned int Nuc::T() const { return this->at(3); }
+
+double Nuc::RE() {
+ double total = this->N() + 4*Nuc::pseudocount;
+ double pA = (Nuc::pseudocount + n[0]) / total;
+ double pC = (Nuc::pseudocount + n[1]) / total;
+ double pG = (Nuc::pseudocount + n[2]) / total;
+ double pT = (Nuc::pseudocount + n[3]) / total;
+ return pA*log2(pA/Nuc::qA) + pC*log2(pC/Nuc::qC) + pG*log2(pG/Nuc::qG) + pT*log2(pT/Nuc::qT);
+}
+
+double Nuc::chastity() {
+ unsigned int max = 0; unsigned int max_idx = 0;
+ for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
+
+ unsigned int max2 = 0; unsigned int max_idx2 = 0;
+ for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] >= max2) { max2 = n[i]; max_idx2 = i; } }
+
+ if(max_idx == max_idx2) { max_idx2++; }
+
+ double total = (double)n[max_idx] + (double)n[max_idx2];
+ return( 2 * (double)n[max_idx]/total );
+}
+
+char Nuc::consensus() {
+ unsigned int N = this->N();
+ if(N == 0) { return ' '; }
+
+ unsigned int max = 0; unsigned int max_idx = 0;
+ for(unsigned int i = 0; i < 4; i++) { if(n[i] > max) { max = n[i]; max_idx = i; } }
+
+ unsigned int max2 = 0; unsigned int max_idx2 = 0;
+ for(unsigned int i = 0; i < 4; i++) { if(i != max_idx && n[i] > max2) { max2 = n[i]; max_idx2 = i; } }
+
+ //For now pick arbitrary zygosity thresholds. Later, update to use mixture model.
+ char c = '\0';
+
+ //homozygous
+ if(chastity() >= 1.25) {
+ switch(max_idx) {
+ case 0: c = 'A'; break;
+ case 1: c = 'C'; break;
+ case 2: c = 'G'; break;
+ case 3: c = 'T'; break;
+ }
+ }
+
+ //heterozygous
+ else {
+ switch(max_idx | max_idx2) {
+ case 1: c = 'M'; break; //A,C
+ case 2: c = 'R'; break; //A,G
+ case 3: c = (max_idx == 0 || max_idx2 == 0)?'W':'S'; break; //A,T or C,G
+ case 4: c = 'Y'; break; //C,T
+ case 5: c = 'K'; break; //G,T
+ }
+ }
+
+ //strong_coverage_threshold is the number of reads needed to make a confident call
+ if(N < this->strong_coverage_threshold) { return tolower(c); } else { return c; }
+}
--- /dev/null
+#include <iostream>
+
+#ifndef NUC_H
+#define NUC_H
+
+class Nuc {
+ protected:
+ unsigned int n[4];
+
+ //background nucleotide probabilities
+ //
+ //can change this to a background model class later if needed
+ static double qA;
+ static double qC;
+ static double qG;
+ static double qT;
+
+ // pseudocount to avoid divide-by-zero errors
+ static double pseudocount;
+ static unsigned int strong_coverage_threshold;
+
+ // protected access directly to the data arrray
+ unsigned int& operator[](size_t b);
+
+ public:
+
+ Nuc();
+ Nuc(const Nuc& n);
+ Nuc& operator=(const Nuc& n);
+
+ void add_nuc(char b);
+
+ //nth_nuc
+ //=======
+ //Useful to get the listing of all the nucleotides sequenced for this position.
+ //Note that this function doesn't return the nucleotide in the order presented
+ char nth_nuc(unsigned int i);
+
+ //N
+ //====
+ //returns the number of nucleotides at the position
+ unsigned int N();
+
+ //at
+ //==
+ //allows read-only access to the dta array
+ unsigned int at(size_t b) const;
+ unsigned int A() const;
+ unsigned int C() const;
+ unsigned int G() const;
+ unsigned int T() const;
+
+ //RE
+ //==
+ //returns the relative entropy of the nucleotide, taking into
+ //account the background model and pseudocounts
+ double RE();
+
+ //chastity
+ //==
+ //returns 2 x the ratio of the most prominent nucleotide to the first two nucleotides.
+ //i.e. a homozygous position has chastity 1, and a heterozygous position has
+ // chastity ~0.5
+ //note, the value is doubled to put it on the same scale and with roughly the same
+ //interpretation as relative entropy (RE)
+ double chastity();
+
+ //consensus:
+ //=========
+ //returns the consensus sequence, including IUPAC degenerate bases to represent heterozygosity
+ char consensus();
+};
+
+#endif
+
--- /dev/null
+/*
+ * Read.cpp:
+ *
+ * Implementation of Read class. Since the class consists of public variables (more of a struct, really) there is not much
+ * here.
+ *
+ * compare_reads: function to compare two reads, first be chromosome, then by position. Useful to sort a vector of reads.
+ */
+
+#include <string>
+#include <iostream>
+
+#include "Read.h"
+
+Read::Read(string chr, unsigned int pos, bool strand, string seq) : Loci(chr,pos) {
+ this->strand = strand;
+ this->seq = seq;
+}
+
+Read::Read(const Read& r) : Loci(r) {
+ this->strand = strand;
+ this->seq = r.seq;
+}
+
+Read& Read::operator=(const Read& r) {
+ if(this == &r) return *this;
+
+ Loci::operator=(r);
+ this->strand = r.strand;
+ this->seq = r.seq;
+ return *this;
+}
+
+unsigned int Read::length() const {
+ return seq.length();
+}
+
+//Errors if the offset exceeds the length of the read
+char Read::operator[](size_t off) const {
+ if(off >= this->length()) {
+ cerr << "Error: index " << off << " out of bounds for read of length " << this->length() << endl;
+ }
+
+ return this->seq[off];
+}
--- /dev/null
+/*
+ * Read.h:
+ * Class to handle short read data. For generalizty, the chromosome is represented as a string.
+ * It may make sense to save memeory to introduce a chromosome class, and using a pointer to reference it.
+ *
+ * chr: name of the chromosome on which the read maps.
+ * pos: posiiton of hte read on the chromosome
+ * strand: diretionality of the read. 0 -> forward, 1 -> reverse.
+ */
+
+#include <vector>
+#include <string>
+#include "loci.h"
+
+#ifndef READ_H
+#define READ_H
+
+using namespace std;
+
+class Read : public Loci {
+ public:
+ string seq;
+ bool strand;
+
+ Read(string chr, unsigned int pos, bool strand, string seq);
+ Read(const Read& r);
+ Read& operator=(const Read& r);
+
+ char operator[](size_t off) const;
+ unsigned int length() const;
+};
+
+typedef vector<Read> Reads;
+
+#endif
--- /dev/null
+#include <string>
+#include <vector>
+#include <math.h>
+
+#include "util.h"
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words) {
+ size_t n = text.length ();
+ size_t start = text.find_first_not_of (separators);
+
+ while (start < n) {
+ size_t stop = text.find_first_of (separators, start);
+ if (stop > n) stop = n;
+ words.push_back (text.substr (start, stop-start));
+ start = text.find_first_not_of (separators, stop+1);
+ }
+}
+
+void revcomp(string& output, const string& input)
+{
+ output = input;
+ unsigned int i;
+
+ for (i = 0; i < output.length(); ++i) { output[i] = input[input.length()-(i+1)]; }
+
+ for (unsigned int p1 = 0; p1 < output.length(); ++p1) {
+ if(output[p1] == 'a' || output[p1] == 'A') { output[p1] = 'T'; }
+ else if(output[p1] == 'c' || output[p1] == 'C') { output[p1] = 'G'; }
+ else if(output[p1] == 'g' || output[p1] == 'G') { output[p1] = 'C'; }
+ else if(output[p1] == 't' || output[p1] == 'T') { output[p1] = 'A'; }
+ }
+}
+
+double norm_prob(double x, double mu, double s) {
+ return (1.0)/(s*sqrt(2*PI)) * exp(-0.5*(x-mu)*(x-mu)/(s*s));
+}
--- /dev/null
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <string>
+#include <vector>
+
+#define PI 3.14159265358979323846
+
+using namespace std;
+
+void split (const string& text, const string& separators, vector<string>& words);
+void revcomp(string& output, const string& input);
+double norm_prob(double x, double mu, double s);
+
+#endif