sequence.cc

   1 //                        ----------------------------------------
   2 //                           ---------- sequence.cc -----------
   3 //                        ----------------------------------------
   4
   5 #include "sequence.hh"
   6
   7
   8 Sequence::Sequence()
   9 {
  10 }
  11
  12
  13 void
  14 Sequence::load_fasta(string file_path, int seq_num,
  15                      int start_index, int end_index)
  16 {
  17   fstream data_file;
  18   string file_data_line;
  19   int header_counter = 0;
  20   bool read_seq = true;
  21   string rev_comp;
  22   char conversionTable[257];
  23   int seq_i, table_i, len;
  24   string sequence_raw;
  25   char * seq_tmp;             // holds sequence during basic filtering
  26
  27
  28   data_file.open(file_path.c_str(), ios::in);
  29
  30
  31   // search for the header of the fasta sequence we want
  32   while ( (!data_file.eof()) && (header_counter < seq_num) )
  33   {
  34     getline(data_file,file_data_line);
  35     if (file_data_line.substr(0,1) == ">")
  36       header_counter++;
  37   }
  38
  39   header = file_data_line.substr(1);
  40
  41   sequence_raw = "";
  42
  43   while ( !data_file.eof() && read_seq )
  44   {
  45     getline(data_file,file_data_line);
  46     if (file_data_line.substr(0,1) == ">")
  47       read_seq = false;
  48     else sequence_raw += file_data_line;
  49   }
  50
  51   data_file.close();
  52
  53   // sequence filtering for upcasing agctn and convert non AGCTN to N
  54
  55   len = sequence_raw.length();
  56   seq_tmp = (char*)sequence_raw.c_str();
  57
  58   sequence = "";
  59
  60   // Make a conversion table
  61
  62   // everything we don't specify below will become 'N'
  63   for(table_i=0; table_i < 256; table_i++)
  64   {
  65     conversionTable[table_i] = 'N';
  66   }
  67   // add end of string character for printing out table for testing purposes
  68   conversionTable[256] = '\0';
  69
  70   // we want these to map to themselves - ie not to change
  71   conversionTable['A'] = 'A';
  72   conversionTable['T'] = 'T';
  73   conversionTable['G'] = 'G';
  74   conversionTable['C'] = 'C';
  75   // this is to upcase
  76   conversionTable['a'] = 'A';
  77   conversionTable['t'] = 'T';
  78   conversionTable['g'] = 'G';
  79   conversionTable['c'] = 'C';
  80
  81   // finally, the actual conversion loop
  82   for(seq_i = 0; seq_i < len; seq_i++)
  83   {
  84     table_i = (int) seq_tmp[seq_i];
  85     sequence += conversionTable[table_i];
  86   }
  87
  88
  89   // Lastly, if subselection of the sequence was specified we keep cut out
  90   // and only keep that part
  91
  92   // end_index = 0 means no end was specified, so cut to the end
  93   if (end_index == 0)
  94     end_index = len;
  95
  96   // start_index = 0 if no start was specified
  97   sequence = sequence.substr(start_index, end_index - start_index);
  98   length = sequence.length();
  99 }
 100
 101
 102   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 103   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 104
 105
 106 void
 107 Sequence::load_annot(string file_path, int start_index, int end_index)
 108 {
 109   fstream data_file;
 110   string file_data_line;
 111   annot an_annot;
 112   int space_split_i;
 113   string annot_value;
 114   list<annot>::iterator list_i;
 115
 116   annots.clear();
 117   data_file.open(file_path.c_str(), ios::in);
 118
 119   getline(data_file,file_data_line);
 120   species = file_data_line;
 121
 122   // end_index = 0 means no end was specified, so cut to the end
 123   if (end_index == 0)
 124     end_index = length;
 125
 126   cout << "START: " << start_index << " END: " << end_index << endl;
 127
 128   while ( !data_file.eof() )
 129   {
 130     getline(data_file,file_data_line);
 131     if (file_data_line != "")
 132     {
 133       // need to get 4 values...almost same code 4 times...
 134       // get annot start index
 135       space_split_i = file_data_line.find(" ");
 136       annot_value = file_data_line.substr(0,space_split_i);
 137       an_annot.start = atoi (annot_value.c_str());
 138       file_data_line = file_data_line.substr(space_split_i+1);
 139       // get annot end index
 140       space_split_i = file_data_line.find(" ");
 141       annot_value = file_data_line.substr(0,space_split_i);
 142       an_annot.end = atoi (annot_value.c_str());
 143       file_data_line = file_data_line.substr(space_split_i+1);
 144       // get annot start index
 145       space_split_i = file_data_line.find(" ");
 146       annot_value = file_data_line.substr(0,space_split_i);
 147       an_annot.name = annot_value;
 148       file_data_line = file_data_line.substr(space_split_i+1);
 149       // get annot start index
 150       space_split_i = file_data_line.find(" ");
 151       annot_value = file_data_line.substr(0,space_split_i);
 152       an_annot.type = annot_value;
 153
 154       cout << "ANA_START: " << an_annot.start <<
 155         " ANA_END: " << an_annot.end << endl;
 156
 157       // add annot to list if it falls within the range of sequence specified
 158       if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 159       {
 160         an_annot.start -= start_index;
 161         an_annot.end -= start_index;
 162         annots.push_back(an_annot);
 163       }
 164       else
 165         cout << "FAILED!!!!!!\n";
 166     }
 167   }
 168
 169   data_file.close();
 170
 171   // debugging check
 172   for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 173   {
 174     cout << (*list_i).start << "," << (*list_i).end << "\t";
 175     cout << (*list_i).name << "\t" << (*list_i).type << endl;
 176   }
 177 }
 178
 179
 180 int
 181 Sequence::len()
 182 {
 183   return sequence.length();
 184 }
 185
 186
 187 string
 188 Sequence::seq()
 189 {
 190   return sequence;
 191 }
 192
 193 const char *
 194 Sequence::c_seq()
 195 {
 196   return sequence.c_str();
 197 }
 198
 199 string
 200 Sequence::rev_comp()
 201 {
 202   string rev_comp;
 203   char conversionTable[257];
 204   int seq_i, table_i, len;
 205
 206   len = sequence.length();
 207   rev_comp.reserve(len);
 208   // make a conversion table
 209   // init all parts of conversion table to '~' character
 210   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 211   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 212   // seven generations of their progeny) who decides to make it mean
 213   // something special!!!
 214   // PS - double the curse for any smartass non-biologist who tries it as well
 215   for(table_i=0; table_i < 256; table_i++)
 216   {
 217     conversionTable[table_i] = '~';
 218   }
 219   // add end of string character for printing out table for testing purposes
 220   conversionTable[256] = '\0';
 221
 222   // add in the characters for the bases we want to convert
 223   conversionTable['A'] = 'T';
 224   conversionTable['T'] = 'A';
 225   conversionTable['G'] = 'C';
 226   conversionTable['C'] = 'G';
 227   conversionTable['N'] = 'N';
 228
 229   // finally, the actual conversion loop
 230   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 231   {
 232     table_i = (int) sequence[seq_i];
 233     rev_comp += conversionTable[table_i];
 234   }
 235
 236   return rev_comp;
 237 }
 238
 239
 240 string
 241 Sequence::hdr()
 242 {
 243   return header;
 244 }
 245
 246 string
 247 Sequence::sp_name()
 248 {
 249   return species;
 250 }
 251
 252
 253 void
 254 Sequence::set_seq(string a_seq)
 255 {
 256   sequence = a_seq;
 257 }
 258
 259
 260 /*
 261 string
 262 Sequence::species()
 263 {
 264   return species;
 265 }
 266 */
 267
 268 void
 269 Sequence::clear()
 270 {
 271   sequence = "";
 272   length = 0;
 273   header = "";
 274   species = "";
 275   species_num = -4;
 276   annots.clear();
 277 }
 278
 279 void
 280 Sequence::save(fstream &save_file)
 281                //string save_file_path)
 282 {
 283   //fstream save_file;
 284   list<annot>::iterator annots_i;
 285   int i;
 286
 287   // not sure why, or if i'm doing something wrong, but can't seem to pass
 288   // file pointers down to this method from the mussa control class
 289   // so each call to save a sequence appends to the file started by mussa_class
 290   //save_file.open(save_file_path.c_str(), ios::app);
 291
 292   save_file << "<Sequence>" << endl;
 293   save_file << sequence << endl;
 294   save_file << "</Sequence>" << endl;
 295
 296   save_file << "<Annotations>" << endl;
 297   save_file << species << endl;
 298   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 299   {
 300     save_file << annots_i->start << " " << annots_i->end << " " ;
 301     save_file << annots_i->name << " " << annots_i->type << endl;
 302   }
 303   save_file << "</Annotations>" << endl;
 304   //save_file.close();
 305 }
 306
 307 void
 308 Sequence::load_museq(string load_file_path, int seq_num)
 309 {
 310   fstream load_file;
 311   string file_data_line;
 312   int seq_counter;
 313   annot an_annot;
 314   int space_split_i;
 315   string annot_value;
 316
 317   annots.clear();
 318   load_file.open(load_file_path.c_str(), ios::in);
 319
 320   seq_counter = 0;
 321   // search for the seq_num-th sequence
 322   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 323   {
 324     getline(load_file,file_data_line);
 325     if (file_data_line == "<Sequence>")
 326       seq_counter++;
 327   }
 328   getline(load_file, file_data_line);
 329   sequence = file_data_line;
 330   length = sequence.length();
 331   getline(load_file, file_data_line);
 332   getline(load_file, file_data_line);
 333   if (file_data_line == "<Annotations>")
 334   {
 335     getline(load_file, file_data_line);
 336     species = file_data_line;
 337     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 338     {
 339       getline(load_file,file_data_line);
 340       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 341       {
 342         // need to get 4 values...almost same code 4 times...
 343         // get annot start index
 344         space_split_i = file_data_line.find(" ");
 345         annot_value = file_data_line.substr(0,space_split_i);
 346         an_annot.start = atoi (annot_value.c_str());
 347         file_data_line = file_data_line.substr(space_split_i+1);
 348         // get annot end index
 349         space_split_i = file_data_line.find(" ");
 350         annot_value = file_data_line.substr(0,space_split_i);
 351         an_annot.end = atoi (annot_value.c_str());
 352         file_data_line = file_data_line.substr(space_split_i+1);
 353         // get annot start index
 354         space_split_i = file_data_line.find(" ");
 355         annot_value = file_data_line.substr(0,space_split_i);
 356         an_annot.name = annot_value;
 357         file_data_line = file_data_line.substr(space_split_i+1);
 358         // get annot start index
 359         space_split_i = file_data_line.find(" ");
 360         annot_value = file_data_line.substr(0,space_split_i);
 361         an_annot.type = annot_value;
 362         annots.push_back(an_annot);
 363       }
 364     }
 365   }
 366   load_file.close();
 367 }
 368
 369
 370 string
 371 Sequence::rc_motif(string a_motif)
 372 {
 373   string rev_comp;
 374   char conversionTable[257];
 375   int seq_i, table_i, len;
 376
 377   len = a_motif.length();
 378   rev_comp.reserve(len);
 379
 380   for(table_i=0; table_i < 256; table_i++)
 381   {
 382     conversionTable[table_i] = '~';
 383   }
 384   // add end of string character for printing out table for testing purposes
 385   conversionTable[256] = '\0';
 386
 387   // add in the characters for the bases we want to convert (IUPAC)
 388   conversionTable['A'] = 'T';
 389   conversionTable['T'] = 'A';
 390   conversionTable['G'] = 'C';
 391   conversionTable['C'] = 'G';
 392   conversionTable['N'] = 'N';
 393   conversionTable['M'] = 'K';
 394   conversionTable['R'] = 'Y';
 395   conversionTable['W'] = 'W';
 396   conversionTable['S'] = 'S';
 397   conversionTable['Y'] = 'R';
 398   conversionTable['K'] = 'M';
 399   conversionTable['V'] = 'B';
 400   conversionTable['H'] = 'D';
 401   conversionTable['D'] = 'H';
 402   conversionTable['B'] = 'V';
 403
 404   // finally, the actual conversion loop
 405   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 406   {
 407     //cout << "** i = " << seq_i << " bp = " <<
 408     table_i = (int) a_motif[seq_i];
 409     rev_comp += conversionTable[table_i];
 410   }
 411
 412   cout << "seq: " << a_motif << endl;
 413   cout << "rc:  " << rev_comp << endl;
 414
 415   return rev_comp;
 416 }
 417
 418
 419 vector<int>
 420 Sequence::find_motif(string a_motif)
 421 {
 422   vector<int> motif_match_starts;
 423   string a_motif_rc;
 424
 425
 426   motif_match_starts.clear();
 427   motif_scan(a_motif, &motif_match_starts);
 428
 429   a_motif_rc = rc_motif(a_motif);
 430   // make sure not to do search again if it is a palindrome
 431   if (a_motif_rc != a_motif)
 432     motif_scan(a_motif_rc, &motif_match_starts);
 433
 434   return motif_match_starts;
 435 }
 436
 437 void
 438 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 439 {
 440   char * seq_c;
 441   int seq_i, motif_i, motif_len;
 442
 443   // faster to loop thru the sequence as a old c string (ie char array)
 444   seq_c = (char*)sequence.c_str();
 445   motif_len = a_motif.length();
 446
 447   //cout << "motif_length: " << motif_len << endl;
 448   //cout << "RAAARRRRR\n";
 449
 450   motif_i = 0;
 451
 452   //cout << "length: " << length << endl;
 453   seq_i = 0;
 454   while (seq_i < length)
 455   {
 456     // this is pretty much a straight translation of Nora's python code
 457     // to match iupac letter codes
 458     if (a_motif[motif_i] =='N')
 459       motif_i++;
 460     else if (a_motif[motif_i] == seq_c[seq_i])
 461       motif_i++;
 462     else if ((a_motif[motif_i] =='M') &&
 463              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 464       motif_i++;
 465     else if ((a_motif[motif_i] =='R') &&
 466              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 467       motif_i++;
 468     else if ((a_motif[motif_i] =='W') &&
 469              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 470       motif_i++;
 471     else if ((a_motif[motif_i] =='S') &&
 472              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 473       motif_i++;
 474     else if ((a_motif[motif_i] =='Y') &&
 475              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 476       motif_i++;
 477     else if ((a_motif[motif_i] =='K') &&
 478              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 479       motif_i++;
 480     else if ((a_motif[motif_i] =='V') &&
 481              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 482               (seq_c[seq_i]=='G')))
 483       motif_i++;
 484     else if ((a_motif[seq_i] =='H') &&
 485              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 486               (seq_c[seq_i]=='T')))
 487       motif_i++;
 488     else if ((a_motif[motif_i] =='D') &&
 489              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 490               (seq_c[seq_i]=='T')))
 491       motif_i++;
 492     else if ((a_motif[motif_i] =='B') &&
 493              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 494               (seq_c[seq_i]=='T')))
 495       motif_i++;
 496     else
 497     {
 498       seq_i -= motif_i;
 499       motif_i = 0;
 500     }
 501
 502     // end Nora stuff, now we see if a match is found this pass
 503     if (motif_i == motif_len)
 504     {
 505       //cout << "MATCH!!!\n";
 506       motif_match_starts->push_back(seq_i - motif_len + 1);
 507       motif_i = 0;
 508     }
 509
 510     seq_i++;
 511   }
 512 }