9 #include <gsl/gsl_statistics.h>
16 void int_to_chromstr(int chrom, char* chr);
17 int chromstr_to_int(string chrom);
19 void split (const string& text, const string& separators, vector<string>& words);
30 unsigned int count[2];
32 Hit(string name, int chr, int start, int end, bool strand) { this->name = name; this->chr = chr; this->start = start; this->end = end; this->strand = strand; count[0] = 0; count[1] = 0; }
35 ostream &operator<<( ostream &out, const Hit &h ) {
36 out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0] << "\t" << h.count[1];
40 typedef vector<Hit> Hits;
42 bool compare_hits(const Hit &a, const Hit &b) {
44 return a.end < b.start;
50 void read_align_file(char* filename, Hits& features) {
52 string location_delim = ":";
53 char strand_str[2]; strand_str[1] = '\0';
54 ifstream seqs(filename);
56 while(seqs.peek() != EOF) {
58 seqs.getline(line,2048,'\n');
59 string line_str(line);
60 vector<string> fields;
61 split(line_str, delim, fields);
62 if(fields.size() <= 4) { continue; }
63 vector<string> location; split(fields[3], location_delim, location);
64 if(location.size() != 2) { continue; }
65 int chr = chromstr_to_int(location[0]);
66 if(chr < 0) { continue; }
67 int pos = atoi(location[1].c_str());
68 bool strand = ((fields[4].c_str())[0] == 'F')?0:1;
71 Hit hit(name,chr,pos,pos,strand);
72 features.push_back(hit);
74 Hit hit(name,chr,pos+25,pos+25,strand);
75 features.push_back(hit);
80 //sort the data so we can run through it once
81 std::stable_sort(features.begin(),features.end(),compare_hits);
84 void print_features(Hits& hits, string filename, unsigned int size_1, unsigned int size_2) {
85 ofstream out(filename.c_str());
86 for(Hits::iterator i = hits.begin(); i != hits.end(); ++i) {
88 out << h.name << "\tchr" << h.chr << "\t" << h.start << "\t" << h.end << "\t" << h.strand << "\t" << h.count[0]/(double)(size_1/1000000.0) << "\t" << h.count[1]/(double)(size_2/1000000.0) << "\t" << (double)log(((double)h.count[0]/(double)size_1)/(double)(h.count[1]/(double)size_2))/log(2) << endl;
93 unsigned int count_complexity(Hits& data) {
94 unsigned int num_hits = data.size();
96 unsigned int n_island = 0;
97 unsigned int island_size = 0;
98 int chrom = data.begin()->chr;
99 for(Hits::iterator i = data.begin()+1; i != data.end()-1 && i != data.end(); ++i) {
100 if(i->chr != chrom) {
102 } else if( (i+1)->chr != chrom ) { continue; }
103 int prev = (i-1)->start;
105 while( (i+1) != data.end() && (i+1)->chr == i->chr && (i+1)->start == i->start) { i++; count++; }
106 int next = (i+1)->start;
108 if(i->chr != (i+1)->chr || (i-1)->chr != i->chr) { continue; }
110 if(prev < i->start - 25 && next > i->start + 25 && count >= 5) { n_island++; island_size += count; }
112 cerr << n_island << " Islands\n";
116 int main(int argc, char** argv) {
117 if(argc < 3) { cerr << "Usage: " << argv[0] << " label reads\n"; exit(1); }
119 char label1[128]; strcpy(label1,argv[1]);
120 Hits align1; read_align_file(argv[2], align1);
122 if(align1.size() <= 1) { cerr << "No alignment\n"; exit(0); }
124 int complexity = count_complexity(align1);
126 cout << label1 << "\t" << complexity << "\t" << align1.size()/1000000 << "M\t" << (double)complexity/(double)align1.size() << endl;
129 int chromstr_to_int(string chromstr) {
130 const char* chrom = chromstr.c_str();
131 if(chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r') { return -1; }
132 if(chrom[3] == 'X') { return 23; } else
133 if(chrom[3] == 'Y') { return 24; } else
134 if(chrom[3] == 'M') { return 25; } else
135 if(chrom[3] >= '0' && chrom[3] <= '9') { return atoi(chrom+3); } else
139 void int_to_chromstr(int chrom, char* chr) {
142 case 23: chr[3]='X'; break;
143 case 24: chr[3]='Y'; break;
144 case 25: chr[3]='M'; break;
145 default: if(chrom < 10) { chr[3]=(char)(48+chrom); chr[4] = '\0'; }
146 else { chr[3]=(char)(48+(int)(chrom/10));
147 chr[4]=(char)(48+(int)(chrom - (int)(chr[3]-48)));
154 void split (const string& text, const string& separators, vector<string>& words) {
156 size_t n = text.length ();
157 size_t start = text.find_first_not_of (separators);
160 size_t stop = text.find_first_of (separators, start);
161 if (stop > n) stop = n;
162 words.push_back (text.substr (start, stop-start));
163 start = text.find_first_not_of (separators, stop+1);