1 // ----------------------------------------
2 // ---------- sequence.cc -----------
3 // ----------------------------------------
14 Sequence::load_fasta(string file_path, int seq_num,
15 int start_index, int end_index)
18 string file_data_line;
19 int header_counter = 0;
22 char conversionTable[257];
23 int seq_i, table_i, len;
25 char * seq_tmp; // holds sequence during basic filtering
28 data_file.open(file_path.c_str(), ios::in);
31 // search for the header of the fasta sequence we want
32 while ( (!data_file.eof()) && (header_counter < seq_num) )
34 getline(data_file,file_data_line);
35 if (file_data_line.substr(0,1) == ">")
39 header = file_data_line.substr(1);
43 while ( !data_file.eof() && read_seq )
45 getline(data_file,file_data_line);
46 if (file_data_line.substr(0,1) == ">")
48 else sequence_raw += file_data_line;
53 // sequence filtering for upcasing agctn and convert non AGCTN to N
55 len = sequence_raw.length();
56 seq_tmp = (char*)sequence_raw.c_str();
60 // Make a conversion table
62 // everything we don't specify below will become 'N'
63 for(table_i=0; table_i < 256; table_i++)
65 conversionTable[table_i] = 'N';
67 // add end of string character for printing out table for testing purposes
68 conversionTable[256] = '\0';
70 // we want these to map to themselves - ie not to change
71 conversionTable['A'] = 'A';
72 conversionTable['T'] = 'T';
73 conversionTable['G'] = 'G';
74 conversionTable['C'] = 'C';
76 conversionTable['a'] = 'A';
77 conversionTable['t'] = 'T';
78 conversionTable['g'] = 'G';
79 conversionTable['c'] = 'C';
81 // finally, the actual conversion loop
82 for(seq_i = 0; seq_i < len; seq_i++)
84 table_i = (int) seq_tmp[seq_i];
85 sequence += conversionTable[table_i];
89 // Lastly, if subselection of the sequence was specified we keep cut out
90 // and only keep that part
92 // end_index = 0 means no end was specified, so cut to the end
96 // start_index = 0 if no start was specified
97 sequence = sequence.substr(start_index, end_index - start_index);
101 // this doesn't work properly under gcc 3.x ... it can't recognize toupper
102 //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
106 Sequence::load_annot(string file_path)
109 string file_data_line;
115 data_file.open(file_path.c_str(), ios::in);
117 getline(data_file,file_data_line);
118 species = file_data_line;
120 while ( !data_file.eof() )
122 getline(data_file,file_data_line);
123 if (file_data_line != "")
125 // need to get 4 values...almost same code 4 times...
126 // get annot start index
127 space_split_i = file_data_line.find(" ");
128 annot_value = file_data_line.substr(0,space_split_i);
129 an_annot.start = atoi (annot_value.c_str());
130 file_data_line = file_data_line.substr(space_split_i+1);
131 // get annot end index
132 space_split_i = file_data_line.find(" ");
133 annot_value = file_data_line.substr(0,space_split_i);
134 an_annot.end = atoi (annot_value.c_str());
135 file_data_line = file_data_line.substr(space_split_i+1);
136 // get annot start index
137 space_split_i = file_data_line.find(" ");
138 annot_value = file_data_line.substr(0,space_split_i);
139 an_annot.name = annot_value;
140 file_data_line = file_data_line.substr(space_split_i+1);
141 // get annot start index
142 space_split_i = file_data_line.find(" ");
143 annot_value = file_data_line.substr(0,space_split_i);
144 an_annot.type = annot_value;
145 annots.push_back(an_annot);
151 list<annot>::iterator list_i;
152 for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
154 cout << (*list_i).start << "," << (*list_i).end << "\t";
155 cout << (*list_i).name << "\t" << (*list_i).type << endl;
163 return sequence.length();
176 return sequence.c_str();
183 char conversionTable[257];
184 int seq_i, table_i, len;
186 len = sequence.length();
187 rev_comp.reserve(len);
188 // make a conversion table
189 // init all parts of conversion table to '~' character
190 // '~' I doubt will ever appear in a sequence file (jeez, I hope)
191 // and may the fleas of 1000 camels infest the genitals of any biologist (and
192 // seven generations of their progeny) who decides to make it mean
193 // something special!!!
194 // PS - double the curse for any smartass non-biologist who tries it as well
195 for(table_i=0; table_i < 256; table_i++)
197 conversionTable[table_i] = '~';
199 // add end of string character for printing out table for testing purposes
200 conversionTable[256] = '\0';
202 // add in the characters for the bases we want to convert
203 conversionTable['A'] = 'T';
204 conversionTable['T'] = 'A';
205 conversionTable['G'] = 'C';
206 conversionTable['C'] = 'G';
207 conversionTable['N'] = 'N';
209 // finally, the actual conversion loop
210 for(seq_i = len - 1; seq_i >= 0; seq_i--)
212 table_i = (int) sequence[seq_i];
213 rev_comp += conversionTable[table_i];
228 Sequence::set_seq(string a_seq)
254 Sequence::save(fstream &save_file)
255 //string save_file_path)
258 list<annot>::iterator annots_i;
261 // not sure why, or if i'm doing something wrong, but can't seem to pass
262 // file pointers down to this method from the mussa control class
263 // so each call to save a sequence appends to the file started by mussa_class
264 //save_file.open(save_file_path.c_str(), ios::app);
266 save_file << "<Sequence>" << endl;
267 save_file << sequence << endl;
268 save_file << "</Sequence>" << endl;
270 save_file << "<Annotations>" << endl;
271 for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
273 save_file << annots_i->start << " " << annots_i->end << " " ;
274 save_file << annots_i->name << " " << annots_i->type << endl;
276 save_file << "</Annotations>" << endl;
281 Sequence::load_museq(string load_file_path, int seq_num)
284 string file_data_line;
291 load_file.open(load_file_path.c_str(), ios::in);
294 // search for the seq_num-th sequence
295 while ( (!load_file.eof()) && (seq_counter < seq_num) )
297 getline(load_file,file_data_line);
298 if (file_data_line == "<Sequence>")
301 getline(load_file, file_data_line);
303 sequence = file_data_line;
305 getline(load_file, file_data_line);
306 getline(load_file, file_data_line);
307 if (file_data_line == "<Annotations>")
309 while ( (!load_file.eof()) && (file_data_line != "</Annotations>") )
312 getline(load_file,file_data_line);
313 if ((file_data_line != "") && (file_data_line != "</Annotations>"))
316 // need to get 4 values...almost same code 4 times...
317 // get annot start index
318 space_split_i = file_data_line.find(" ");
319 annot_value = file_data_line.substr(0,space_split_i);
320 an_annot.start = atoi (annot_value.c_str());
321 file_data_line = file_data_line.substr(space_split_i+1);
322 // get annot end index
323 space_split_i = file_data_line.find(" ");
324 annot_value = file_data_line.substr(0,space_split_i);
325 an_annot.end = atoi (annot_value.c_str());
326 file_data_line = file_data_line.substr(space_split_i+1);
327 // get annot start index
328 space_split_i = file_data_line.find(" ");
329 annot_value = file_data_line.substr(0,space_split_i);
330 an_annot.name = annot_value;
331 file_data_line = file_data_line.substr(space_split_i+1);
332 // get annot start index
333 space_split_i = file_data_line.find(" ");
334 annot_value = file_data_line.substr(0,space_split_i);
335 an_annot.type = annot_value;
336 annots.push_back(an_annot);
345 Sequence::find_motif(string a_motif)
348 int seq_i, motif_i, motif_len;
349 list<int> motif_match_starts;
351 motif_match_starts.clear();
352 // faster to loop thru the sequence as a old c string (ie char array)
353 seq_c = (char*)sequence.c_str();
354 motif_len = a_motif.length();
357 for (seq_i = 0; seq_i < length; seq_i++)
360 // this is pretty much a straight translation of Nora's python code
361 // to match iupac letter codes
362 if (a_motif[motif_i] == seq_c[seq_i])
364 else if (seq_c[seq_i] =='N')
366 else if ((a_motif[motif_i] =='M') &&
367 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
369 else if ((a_motif[motif_i] =='R') &&
370 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
372 else if ((a_motif[motif_i] =='W') &&
373 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
375 else if ((a_motif[motif_i] =='S') &&
376 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
378 else if ((a_motif[motif_i] =='Y') &&
379 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
381 else if ((a_motif[motif_i] =='K') &&
382 ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
384 else if ((a_motif[motif_i] =='V') &&
385 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
386 (seq_c[seq_i]=='G')))
388 else if ((a_motif[seq_i] =='H') &&
389 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
390 (seq_c[seq_i]=='T')))
392 else if ((a_motif[motif_i] =='D') &&
393 ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
394 (seq_c[seq_i]=='T')))
396 else if ((a_motif[motif_i] =='B') &&
397 ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
398 (seq_c[seq_i]=='T')))
403 // end Nora stuff, now we see if a match is found this pass
404 if (motif_i == motif_len)
406 motif_match_starts.push_back(seq_i - motif_len + 1);
411 return motif_match_starts;