sequence.cc

   1 //                        ----------------------------------------
   2 //                           ---------- sequence.cc -----------
   3 //                        ----------------------------------------
   4
   5 #include "sequence.hh"
   6
   7
   8 Sequence::Sequence()
   9 {
  10 }
  11
  12
  13 string
  14 Sequence::load_fasta(string file_path, int seq_num,
  15                      int start_index, int end_index)
  16 {
  17   fstream data_file;
  18   string file_data_line;
  19   int header_counter = 0;
  20   bool read_seq = true;
  21   string rev_comp;
  22   char conversionTable[257];
  23   int seq_i, table_i, len;
  24   string sequence_raw;
  25   char * seq_tmp;             // holds sequence during basic filtering
  26   string err_msg;
  27
  28   data_file.open(file_path.c_str(), ios::in);
  29
  30   if (!data_file)
  31   {
  32     err_msg = "Sequence File: " + file_path + " not found";
  33     return err_msg;
  34   }
  35   // if file opened okay, read it
  36   else
  37   {
  38     // search for the header of the fasta sequence we want
  39     while ( (!data_file.eof()) && (header_counter < seq_num) )
  40     {
  41       getline(data_file,file_data_line);
  42       if (file_data_line.substr(0,1) == ">")
  43         header_counter++;
  44     }
  45
  46     header = file_data_line.substr(1);
  47
  48     sequence_raw = "";
  49
  50     while ( !data_file.eof() && read_seq )
  51     {
  52       getline(data_file,file_data_line);
  53       if (file_data_line.substr(0,1) == ">")
  54         read_seq = false;
  55       else sequence_raw += file_data_line;
  56     }
  57
  58     data_file.close();
  59
  60     // sequence filtering for upcasing agctn and convert non AGCTN to N
  61
  62     len = sequence_raw.length();
  63     seq_tmp = (char*)sequence_raw.c_str();
  64
  65     sequence = "";
  66
  67     // Make a conversion table
  68
  69     // everything we don't specify below will become 'N'
  70     for(table_i=0; table_i < 256; table_i++)
  71     {
  72       conversionTable[table_i] = 'N';
  73     }
  74     // add end of string character for printing out table for testing purposes
  75     conversionTable[256] = '\0';
  76
  77     // we want these to map to themselves - ie not to change
  78     conversionTable['A'] = 'A';
  79     conversionTable['T'] = 'T';
  80     conversionTable['G'] = 'G';
  81     conversionTable['C'] = 'C';
  82     // this is to upcase
  83     conversionTable['a'] = 'A';
  84     conversionTable['t'] = 'T';
  85     conversionTable['g'] = 'G';
  86     conversionTable['c'] = 'C';
  87
  88     // finally, the actual conversion loop
  89     for(seq_i = 0; seq_i < len; seq_i++)
  90     {
  91       table_i = (int) seq_tmp[seq_i];
  92       sequence += conversionTable[table_i];
  93     }
  94
  95
  96     // Lastly, if subselection of the sequence was specified we keep cut out
  97     // and only keep that part
  98
  99     // end_index = 0 means no end was specified, so cut to the end
 100     if (end_index == 0)
 101       end_index = len;
 102
 103     // start_index = 0 if no start was specified
 104     sequence = sequence.substr(start_index, end_index - start_index);
 105     length = sequence.length();
 106
 107     return "";
 108   }
 109 }
 110
 111
 112   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 113   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 114
 115
 116 string
 117 Sequence::load_annot(string file_path, int start_index, int end_index)
 118 {
 119   fstream data_file;
 120   string file_data_line;
 121   annot an_annot;
 122   int space_split_i;
 123   string annot_value;
 124   list<annot>::iterator list_i;
 125   string err_msg;
 126
 127
 128   annots.clear();
 129   data_file.open(file_path.c_str(), ios::in);
 130
 131   if (!data_file)
 132   {
 133     err_msg = "Sequence File: " + file_path + " not found";
 134     return err_msg;
 135   }
 136   // if file opened okay, read it
 137   else
 138   {
 139     getline(data_file,file_data_line);
 140     species = file_data_line;
 141
 142     // end_index = 0 means no end was specified, so cut to the end
 143     if (end_index == 0)
 144       end_index = length;
 145
 146     //cout << "START: " << start_index << " END: " << end_index << endl;
 147
 148     while ( !data_file.eof() )
 149     {
 150       getline(data_file,file_data_line);
 151       if (file_data_line != "")
 152       {
 153         // need to get 4 values...almost same code 4 times...
 154         // get annot start index
 155         space_split_i = file_data_line.find(" ");
 156         annot_value = file_data_line.substr(0,space_split_i);
 157         an_annot.start = atoi (annot_value.c_str());
 158         file_data_line = file_data_line.substr(space_split_i+1);
 159         // get annot end index
 160         space_split_i = file_data_line.find(" ");
 161         annot_value = file_data_line.substr(0,space_split_i);
 162         an_annot.end = atoi (annot_value.c_str());
 163         file_data_line = file_data_line.substr(space_split_i+1);
 164
 165         cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 166              << endl;
 167
 168         // get annot name
 169         space_split_i = file_data_line.find(" ");
 170         if (space_split_i == string::npos)  // no entries for name & type
 171         {
 172           cout << "seq, annots - no name or type\n";
 173           an_annot.name = "";
 174           an_annot.type = "";
 175         }
 176         else
 177         {
 178           annot_value = file_data_line.substr(0,space_split_i);
 179           an_annot.name = annot_value;
 180           file_data_line = file_data_line.substr(space_split_i+1);
 181           // get annot type
 182           space_split_i = file_data_line.find(" ");
 183           if (space_split_i == string::npos)  // no entry for type
 184             an_annot.type = "";
 185           else
 186           {
 187             annot_value = file_data_line.substr(0,space_split_i);
 188             an_annot.type = annot_value;
 189           }
 190         }
 191
 192
 193         // add annot to list if it falls within the range of sequence specified
 194         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 195         {
 196           an_annot.start -= start_index;
 197           an_annot.end -= start_index;
 198           annots.push_back(an_annot);
 199         }
 200         else
 201           cout << "FAILED!!!!!!\n";
 202       }
 203     }
 204
 205     data_file.close();
 206     /*
 207     // debugging check
 208     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 209     {
 210       cout << (*list_i).start << "," << (*list_i).end << "\t";
 211       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 212     }
 213     */
 214     return "";
 215   }
 216 }
 217
 218
 219 int
 220 Sequence::len()
 221 {
 222   return sequence.length();
 223 }
 224
 225
 226 string
 227 Sequence::seq()
 228 {
 229   return sequence;
 230 }
 231
 232
 233 string
 234 Sequence::subseq(int start, int end)
 235 {
 236   return sequence.substr(start, end);
 237 }
 238
 239
 240 const char *
 241 Sequence::c_seq()
 242 {
 243   return sequence.c_str();
 244 }
 245
 246 string
 247 Sequence::rev_comp()
 248 {
 249   string rev_comp;
 250   char conversionTable[257];
 251   int seq_i, table_i, len;
 252
 253   len = sequence.length();
 254   rev_comp.reserve(len);
 255   // make a conversion table
 256   // init all parts of conversion table to '~' character
 257   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 258   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 259   // seven generations of their progeny) who decides to make it mean
 260   // something special!!!
 261   // PS - double the curse for any smartass non-biologist who tries it as well
 262   for(table_i=0; table_i < 256; table_i++)
 263   {
 264     conversionTable[table_i] = '~';
 265   }
 266   // add end of string character for printing out table for testing purposes
 267   conversionTable[256] = '\0';
 268
 269   // add in the characters for the bases we want to convert
 270   conversionTable['A'] = 'T';
 271   conversionTable['T'] = 'A';
 272   conversionTable['G'] = 'C';
 273   conversionTable['C'] = 'G';
 274   conversionTable['N'] = 'N';
 275
 276   // finally, the actual conversion loop
 277   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 278   {
 279     table_i = (int) sequence[seq_i];
 280     rev_comp += conversionTable[table_i];
 281   }
 282
 283   return rev_comp;
 284 }
 285
 286
 287 string
 288 Sequence::hdr()
 289 {
 290   return header;
 291 }
 292
 293 string
 294 Sequence::sp_name()
 295 {
 296   return species;
 297 }
 298
 299
 300 void
 301 Sequence::set_seq(string a_seq)
 302 {
 303   sequence = a_seq;
 304 }
 305
 306
 307 /*
 308 string
 309 Sequence::species()
 310 {
 311   return species;
 312 }
 313 */
 314
 315 void
 316 Sequence::clear()
 317 {
 318   sequence = "";
 319   length = 0;
 320   header = "";
 321   species = "";
 322   species_num = -4;
 323   annots.clear();
 324 }
 325
 326 void
 327 Sequence::save(fstream &save_file)
 328                //string save_file_path)
 329 {
 330   //fstream save_file;
 331   list<annot>::iterator annots_i;
 332   int i;
 333
 334   // not sure why, or if i'm doing something wrong, but can't seem to pass
 335   // file pointers down to this method from the mussa control class
 336   // so each call to save a sequence appends to the file started by mussa_class
 337   //save_file.open(save_file_path.c_str(), ios::app);
 338
 339   save_file << "<Sequence>" << endl;
 340   save_file << sequence << endl;
 341   save_file << "</Sequence>" << endl;
 342
 343   save_file << "<Annotations>" << endl;
 344   save_file << species << endl;
 345   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 346   {
 347     save_file << annots_i->start << " " << annots_i->end << " " ;
 348     save_file << annots_i->name << " " << annots_i->type << endl;
 349   }
 350   save_file << "</Annotations>" << endl;
 351   //save_file.close();
 352 }
 353
 354 void
 355 Sequence::load_museq(string load_file_path, int seq_num)
 356 {
 357   fstream load_file;
 358   string file_data_line;
 359   int seq_counter;
 360   annot an_annot;
 361   int space_split_i;
 362   string annot_value;
 363
 364   annots.clear();
 365   load_file.open(load_file_path.c_str(), ios::in);
 366
 367   seq_counter = 0;
 368   // search for the seq_num-th sequence
 369   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 370   {
 371     getline(load_file,file_data_line);
 372     if (file_data_line == "<Sequence>")
 373       seq_counter++;
 374   }
 375   getline(load_file, file_data_line);
 376   sequence = file_data_line;
 377   length = sequence.length();
 378   getline(load_file, file_data_line);
 379   getline(load_file, file_data_line);
 380   if (file_data_line == "<Annotations>")
 381   {
 382     getline(load_file, file_data_line);
 383     species = file_data_line;
 384     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 385     {
 386       getline(load_file,file_data_line);
 387       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 388       {
 389         // need to get 4 values...almost same code 4 times...
 390         // get annot start index
 391         space_split_i = file_data_line.find(" ");
 392         annot_value = file_data_line.substr(0,space_split_i);
 393         an_annot.start = atoi (annot_value.c_str());
 394         file_data_line = file_data_line.substr(space_split_i+1);
 395         // get annot end index
 396         space_split_i = file_data_line.find(" ");
 397         annot_value = file_data_line.substr(0,space_split_i);
 398         an_annot.end = atoi (annot_value.c_str());
 399
 400         if (space_split_i == string::npos)  // no entry for type or name
 401         {
 402           cout << "seq, annots - no type or name\n";
 403           an_annot.type = "";
 404           an_annot.name = "";
 405         }
 406         else   // else get annot type
 407         {
 408           file_data_line = file_data_line.substr(space_split_i+1);
 409           space_split_i = file_data_line.find(" ");
 410           annot_value = file_data_line.substr(0,space_split_i);
 411           an_annot.type = annot_value;
 412           if (space_split_i == string::npos)  // no entry for name
 413           {
 414             cout << "seq, annots - no name\n";
 415             an_annot.name = "";
 416           }
 417           else          // get annot name
 418           {
 419             file_data_line = file_data_line.substr(space_split_i+1);
 420             space_split_i = file_data_line.find(" ");
 421             annot_value = file_data_line.substr(0,space_split_i);
 422             an_annot.type = annot_value;
 423           }
 424         }
 425         annots.push_back(an_annot);  // don't forget to actually add the annot
 426       }
 427       cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 428            << "-->" << an_annot.type << "::" << an_annot.name << endl;
 429     }
 430   }
 431   load_file.close();
 432 }
 433
 434
 435 string
 436 Sequence::rc_motif(string a_motif)
 437 {
 438   string rev_comp;
 439   char conversionTable[257];
 440   int seq_i, table_i, len;
 441
 442   len = a_motif.length();
 443   rev_comp.reserve(len);
 444
 445   for(table_i=0; table_i < 256; table_i++)
 446   {
 447     conversionTable[table_i] = '~';
 448   }
 449   // add end of string character for printing out table for testing purposes
 450   conversionTable[256] = '\0';
 451
 452   // add in the characters for the bases we want to convert (IUPAC)
 453   conversionTable['A'] = 'T';
 454   conversionTable['T'] = 'A';
 455   conversionTable['G'] = 'C';
 456   conversionTable['C'] = 'G';
 457   conversionTable['N'] = 'N';
 458   conversionTable['M'] = 'K';
 459   conversionTable['R'] = 'Y';
 460   conversionTable['W'] = 'W';
 461   conversionTable['S'] = 'S';
 462   conversionTable['Y'] = 'R';
 463   conversionTable['K'] = 'M';
 464   conversionTable['V'] = 'B';
 465   conversionTable['H'] = 'D';
 466   conversionTable['D'] = 'H';
 467   conversionTable['B'] = 'V';
 468
 469   // finally, the actual conversion loop
 470   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 471   {
 472     //cout << "** i = " << seq_i << " bp = " <<
 473     table_i = (int) a_motif[seq_i];
 474     rev_comp += conversionTable[table_i];
 475   }
 476
 477   cout << "seq: " << a_motif << endl;
 478   cout << "rc:  " << rev_comp << endl;
 479
 480   return rev_comp;
 481 }
 482
 483 string
 484 Sequence::motif_validate(string a_motif)
 485 {
 486   string valid_motif;
 487   int seq_i, len;
 488
 489
 490   len = a_motif.length();
 491   valid_motif.reserve(len);
 492
 493   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 494   // current nonIUPAC symbols are omitted, which is not reported atm
 495   for(seq_i = 0; seq_i < len; seq_i++)
 496   {
 497      if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 498       valid_motif += 'A';
 499     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 500       valid_motif += 'T';
 501     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 502       valid_motif += 'G';
 503     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 504       valid_motif += 'C';
 505     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 506       valid_motif += 'N';
 507     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 508       valid_motif += 'M';
 509     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 510       valid_motif += 'R';
 511     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 512       valid_motif += 'W';
 513     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 514       valid_motif += 'S';
 515     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 516       valid_motif += 'Y';
 517     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 518       valid_motif += 'G';
 519     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 520       valid_motif += 'V';
 521     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 522       valid_motif += 'H';
 523     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 524       valid_motif += 'D';
 525     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 526       valid_motif += 'B';
 527    }
 528
 529   cout << "valid_motif is: " << valid_motif << endl;
 530
 531   return valid_motif;
 532 }
 533
 534
 535 vector<int>
 536 Sequence::find_motif(string a_motif)
 537 {
 538   vector<int> motif_match_starts;
 539   string a_motif_rc;
 540
 541
 542   motif_match_starts.clear();
 543
 544   cout << "motif is: " << a_motif << endl;
 545   a_motif = motif_validate(a_motif);
 546   cout << "motif is: " << a_motif << endl;
 547
 548   motif_scan(a_motif, &motif_match_starts);
 549
 550   a_motif_rc = rc_motif(a_motif);
 551   // make sure not to do search again if it is a palindrome
 552   if (a_motif_rc != a_motif)
 553     motif_scan(a_motif_rc, &motif_match_starts);
 554
 555   return motif_match_starts;
 556 }
 557
 558 void
 559 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 560 {
 561   char * seq_c;
 562   int seq_i, motif_i, motif_len;
 563
 564   // faster to loop thru the sequence as a old c string (ie char array)
 565   seq_c = (char*)sequence.c_str();
 566   motif_len = a_motif.length();
 567
 568   //cout << "motif_length: " << motif_len << endl;
 569   //cout << "RAAARRRRR\n";
 570
 571   motif_i = 0;
 572
 573   cout << "motif: " << a_motif << endl;
 574
 575   //cout << "length: " << length << endl;
 576   seq_i = 0;
 577   while (seq_i < length)
 578   {
 579     //if ((seq_i > 10885) && (seq_i < 10917))
 580     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 581     // this is pretty much a straight translation of Nora's python code
 582     // to match iupac letter codes
 583     if (a_motif[motif_i] =='N')
 584       motif_i++;
 585     else if (a_motif[motif_i] == seq_c[seq_i])
 586       motif_i++;
 587     else if ((a_motif[motif_i] =='M') &&
 588              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 589       motif_i++;
 590     else if ((a_motif[motif_i] =='R') &&
 591              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 592       motif_i++;
 593     else if ((a_motif[motif_i] =='W') &&
 594              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 595       motif_i++;
 596     else if ((a_motif[motif_i] =='S') &&
 597              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 598       motif_i++;
 599     else if ((a_motif[motif_i] =='Y') &&
 600              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 601       motif_i++;
 602     else if ((a_motif[motif_i] =='K') &&
 603              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 604       motif_i++;
 605     else if ((a_motif[motif_i] =='V') &&
 606              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 607               (seq_c[seq_i]=='G')))
 608       motif_i++;
 609     else if ((a_motif[seq_i] =='H') &&
 610              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 611               (seq_c[seq_i]=='T')))
 612       motif_i++;
 613     else if ((a_motif[motif_i] =='D') &&
 614              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 615               (seq_c[seq_i]=='T')))
 616       motif_i++;
 617     else if ((a_motif[motif_i] =='B') &&
 618              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 619               (seq_c[seq_i]=='T')))
 620       motif_i++;
 621
 622     else
 623     {
 624       seq_i -= motif_i;
 625       motif_i = 0;
 626     }
 627
 628     // end Nora stuff, now we see if a match is found this pass
 629     if (motif_i == motif_len)
 630     {
 631       //cout << "!!";
 632       motif_match_starts->push_back(seq_i - motif_len + 1);
 633       motif_i = 0;
 634     }
 635
 636     seq_i++;
 637   }
 638   cout << endl;
 639 }
 640
 641 /*
 642         // get annot start index
 643         space_split_i = file_data_line.find(" ");
 644         annot_value = file_data_line.substr(0,space_split_i);
 645         an_annot.name = annot_value;
 646         file_data_line = file_data_line.substr(space_split_i+1);
 647         // get annot start index
 648         space_split_i = file_data_line.find(" ");
 649         annot_value = file_data_line.substr(0,space_split_i);
 650         an_annot.type = annot_value;
 651 */