sequence.cc

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "sequence.hh"
  26
  27
  28 Sequence::Sequence()
  29 {
  30 }
  31
  32
  33 string
  34 Sequence::load_fasta(string file_path, int seq_num,
  35                      int start_index, int end_index)
  36 {
  37   fstream data_file;
  38   string file_data_line;
  39   int header_counter = 0;
  40   bool read_seq = true;
  41   string rev_comp;
  42   char conversionTable[257];
  43   int seq_i, table_i, len;
  44   string sequence_raw;
  45   char * seq_tmp;             // holds sequence during basic filtering
  46   string err_msg;
  47
  48   data_file.open(file_path.c_str(), ios::in);
  49
  50   if (!data_file)
  51   {
  52     err_msg = "Sequence File: " + file_path + " not found";
  53     return err_msg;
  54   }
  55   // if file opened okay, read it
  56   else
  57   {
  58     // search for the header of the fasta sequence we want
  59     while ( (!data_file.eof()) && (header_counter < seq_num) )
  60     {
  61       getline(data_file,file_data_line);
  62       if (file_data_line.substr(0,1) == ">")
  63         header_counter++;
  64     }
  65
  66     header = file_data_line.substr(1);
  67
  68     sequence_raw = "";
  69
  70     while ( !data_file.eof() && read_seq )
  71     {
  72       getline(data_file,file_data_line);
  73       if (file_data_line.substr(0,1) == ">")
  74         read_seq = false;
  75       else sequence_raw += file_data_line;
  76     }
  77
  78     data_file.close();
  79
  80     // sequence filtering for upcasing agctn and convert non AGCTN to N
  81
  82     len = sequence_raw.length();
  83     seq_tmp = (char*)sequence_raw.c_str();
  84
  85     sequence = "";
  86
  87     // Make a conversion table
  88
  89     // everything we don't specify below will become 'N'
  90     for(table_i=0; table_i < 256; table_i++)
  91     {
  92       conversionTable[table_i] = 'N';
  93     }
  94     // add end of string character for printing out table for testing purposes
  95     conversionTable[256] = '\0';
  96
  97     // we want these to map to themselves - ie not to change
  98     conversionTable['A'] = 'A';
  99     conversionTable['T'] = 'T';
 100     conversionTable['G'] = 'G';
 101     conversionTable['C'] = 'C';
 102     // this is to upcase
 103     conversionTable['a'] = 'A';
 104     conversionTable['t'] = 'T';
 105     conversionTable['g'] = 'G';
 106     conversionTable['c'] = 'C';
 107
 108     // finally, the actual conversion loop
 109     for(seq_i = 0; seq_i < len; seq_i++)
 110     {
 111       table_i = (int) seq_tmp[seq_i];
 112       sequence += conversionTable[table_i];
 113     }
 114
 115
 116     // Lastly, if subselection of the sequence was specified we keep cut out
 117     // and only keep that part
 118
 119     // end_index = 0 means no end was specified, so cut to the end
 120     if (end_index == 0)
 121       end_index = len;
 122
 123     // start_index = 0 if no start was specified
 124     sequence = sequence.substr(start_index, end_index - start_index);
 125     length = sequence.length();
 126
 127     return "";
 128   }
 129 }
 130
 131
 132   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 133   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 134
 135
 136 string
 137 Sequence::load_annot(string file_path, int start_index, int end_index)
 138 {
 139   fstream data_file;
 140   string file_data_line;
 141   annot an_annot;
 142   int space_split_i;
 143   string annot_value;
 144   list<annot>::iterator list_i;
 145   string err_msg;
 146
 147
 148   annots.clear();
 149   data_file.open(file_path.c_str(), ios::in);
 150
 151   if (!data_file)
 152   {
 153     err_msg = "Sequence File: " + file_path + " not found";
 154     return err_msg;
 155   }
 156   // if file opened okay, read it
 157   else
 158   {
 159     getline(data_file,file_data_line);
 160     species = file_data_line;
 161
 162     // end_index = 0 means no end was specified, so cut to the end
 163     if (end_index == 0)
 164       end_index = length;
 165
 166     //cout << "START: " << start_index << " END: " << end_index << endl;
 167
 168     while ( !data_file.eof() )
 169     {
 170       getline(data_file,file_data_line);
 171       if (file_data_line != "")
 172       {
 173         // need to get 4 values...almost same code 4 times...
 174         // get annot start index
 175         space_split_i = file_data_line.find(" ");
 176         annot_value = file_data_line.substr(0,space_split_i);
 177         an_annot.start = atoi (annot_value.c_str());
 178         file_data_line = file_data_line.substr(space_split_i+1);
 179         // get annot end index
 180         space_split_i = file_data_line.find(" ");
 181         annot_value = file_data_line.substr(0,space_split_i);
 182         an_annot.end = atoi (annot_value.c_str());
 183         file_data_line = file_data_line.substr(space_split_i+1);
 184
 185         cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 186              << endl;
 187
 188         // get annot name
 189         space_split_i = file_data_line.find(" ");
 190         if (space_split_i == string::npos)  // no entries for name & type
 191         {
 192           cout << "seq, annots - no name or type\n";
 193           an_annot.name = "";
 194           an_annot.type = "";
 195         }
 196         else
 197         {
 198           annot_value = file_data_line.substr(0,space_split_i);
 199           an_annot.name = annot_value;
 200           file_data_line = file_data_line.substr(space_split_i+1);
 201           // get annot type
 202           space_split_i = file_data_line.find(" ");
 203           if (space_split_i == string::npos)  // no entry for type
 204             an_annot.type = "";
 205           else
 206           {
 207             annot_value = file_data_line.substr(0,space_split_i);
 208             an_annot.type = annot_value;
 209           }
 210         }
 211
 212
 213         // add annot to list if it falls within the range of sequence specified
 214         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 215         {
 216           an_annot.start -= start_index;
 217           an_annot.end -= start_index;
 218           annots.push_back(an_annot);
 219         }
 220         else
 221           cout << "FAILED!!!!!!\n";
 222       }
 223     }
 224
 225     data_file.close();
 226     /*
 227     // debugging check
 228     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 229     {
 230       cout << (*list_i).start << "," << (*list_i).end << "\t";
 231       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 232     }
 233     */
 234     return "";
 235   }
 236 }
 237
 238
 239 int
 240 Sequence::len()
 241 {
 242   return sequence.length();
 243 }
 244
 245
 246 string
 247 Sequence::seq()
 248 {
 249   return sequence;
 250 }
 251
 252
 253 string
 254 Sequence::subseq(int start, int end)
 255 {
 256   return sequence.substr(start, end);
 257 }
 258
 259
 260 const char *
 261 Sequence::c_seq()
 262 {
 263   return sequence.c_str();
 264 }
 265
 266 string
 267 Sequence::rev_comp()
 268 {
 269   string rev_comp;
 270   char conversionTable[257];
 271   int seq_i, table_i, len;
 272
 273   len = sequence.length();
 274   rev_comp.reserve(len);
 275   // make a conversion table
 276   // init all parts of conversion table to '~' character
 277   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 278   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 279   // seven generations of their progeny) who decides to make it mean
 280   // something special!!!
 281   // PS - double the curse for any smartass non-biologist who tries it as well
 282   for(table_i=0; table_i < 256; table_i++)
 283   {
 284     conversionTable[table_i] = '~';
 285   }
 286   // add end of string character for printing out table for testing purposes
 287   conversionTable[256] = '\0';
 288
 289   // add in the characters for the bases we want to convert
 290   conversionTable['A'] = 'T';
 291   conversionTable['T'] = 'A';
 292   conversionTable['G'] = 'C';
 293   conversionTable['C'] = 'G';
 294   conversionTable['N'] = 'N';
 295
 296   // finally, the actual conversion loop
 297   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 298   {
 299     table_i = (int) sequence[seq_i];
 300     rev_comp += conversionTable[table_i];
 301   }
 302
 303   return rev_comp;
 304 }
 305
 306
 307 string
 308 Sequence::hdr()
 309 {
 310   return header;
 311 }
 312
 313 string
 314 Sequence::sp_name()
 315 {
 316   return species;
 317 }
 318
 319
 320 void
 321 Sequence::set_seq(string a_seq)
 322 {
 323   sequence = a_seq;
 324   length = sequence.length();
 325 }
 326
 327
 328 /*
 329 string
 330 Sequence::species()
 331 {
 332   return species;
 333 }
 334 */
 335
 336 void
 337 Sequence::clear()
 338 {
 339   sequence = "";
 340   length = 0;
 341   header = "";
 342   species = "";
 343   species_num = -4;
 344   annots.clear();
 345 }
 346
 347 void
 348 Sequence::save(fstream &save_file)
 349                //string save_file_path)
 350 {
 351   //fstream save_file;
 352   list<annot>::iterator annots_i;
 353   int i;
 354
 355   // not sure why, or if i'm doing something wrong, but can't seem to pass
 356   // file pointers down to this method from the mussa control class
 357   // so each call to save a sequence appends to the file started by mussa_class
 358   //save_file.open(save_file_path.c_str(), ios::app);
 359
 360   save_file << "<Sequence>" << endl;
 361   save_file << sequence << endl;
 362   save_file << "</Sequence>" << endl;
 363
 364   save_file << "<Annotations>" << endl;
 365   save_file << species << endl;
 366   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 367   {
 368     save_file << annots_i->start << " " << annots_i->end << " " ;
 369     save_file << annots_i->name << " " << annots_i->type << endl;
 370   }
 371   save_file << "</Annotations>" << endl;
 372   //save_file.close();
 373 }
 374
 375 void
 376 Sequence::load_museq(string load_file_path, int seq_num)
 377 {
 378   fstream load_file;
 379   string file_data_line;
 380   int seq_counter;
 381   annot an_annot;
 382   int space_split_i;
 383   string annot_value;
 384
 385   annots.clear();
 386   load_file.open(load_file_path.c_str(), ios::in);
 387
 388   seq_counter = 0;
 389   // search for the seq_num-th sequence
 390   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 391   {
 392     getline(load_file,file_data_line);
 393     if (file_data_line == "<Sequence>")
 394       seq_counter++;
 395   }
 396   getline(load_file, file_data_line);
 397   sequence = file_data_line;
 398   length = sequence.length();
 399   getline(load_file, file_data_line);
 400   getline(load_file, file_data_line);
 401   if (file_data_line == "<Annotations>")
 402   {
 403     getline(load_file, file_data_line);
 404     species = file_data_line;
 405     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 406     {
 407       getline(load_file,file_data_line);
 408       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 409       {
 410         // need to get 4 values...almost same code 4 times...
 411         // get annot start index
 412         space_split_i = file_data_line.find(" ");
 413         annot_value = file_data_line.substr(0,space_split_i);
 414         an_annot.start = atoi (annot_value.c_str());
 415         file_data_line = file_data_line.substr(space_split_i+1);
 416         // get annot end index
 417         space_split_i = file_data_line.find(" ");
 418         annot_value = file_data_line.substr(0,space_split_i);
 419         an_annot.end = atoi (annot_value.c_str());
 420
 421         if (space_split_i == string::npos)  // no entry for type or name
 422         {
 423           cout << "seq, annots - no type or name\n";
 424           an_annot.type = "";
 425           an_annot.name = "";
 426         }
 427         else   // else get annot type
 428         {
 429           file_data_line = file_data_line.substr(space_split_i+1);
 430           space_split_i = file_data_line.find(" ");
 431           annot_value = file_data_line.substr(0,space_split_i);
 432           an_annot.type = annot_value;
 433           if (space_split_i == string::npos)  // no entry for name
 434           {
 435             cout << "seq, annots - no name\n";
 436             an_annot.name = "";
 437           }
 438           else          // get annot name
 439           {
 440             file_data_line = file_data_line.substr(space_split_i+1);
 441             space_split_i = file_data_line.find(" ");
 442             annot_value = file_data_line.substr(0,space_split_i);
 443             an_annot.type = annot_value;
 444           }
 445         }
 446         annots.push_back(an_annot);  // don't forget to actually add the annot
 447       }
 448       cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 449            << "-->" << an_annot.type << "::" << an_annot.name << endl;
 450     }
 451   }
 452   load_file.close();
 453 }
 454
 455
 456 string
 457 Sequence::rc_motif(string a_motif)
 458 {
 459   string rev_comp;
 460   char conversionTable[257];
 461   int seq_i, table_i, len;
 462
 463   len = a_motif.length();
 464   rev_comp.reserve(len);
 465
 466   for(table_i=0; table_i < 256; table_i++)
 467   {
 468     conversionTable[table_i] = '~';
 469   }
 470   // add end of string character for printing out table for testing purposes
 471   conversionTable[256] = '\0';
 472
 473   // add in the characters for the bases we want to convert (IUPAC)
 474   conversionTable['A'] = 'T';
 475   conversionTable['T'] = 'A';
 476   conversionTable['G'] = 'C';
 477   conversionTable['C'] = 'G';
 478   conversionTable['N'] = 'N';
 479   conversionTable['M'] = 'K';
 480   conversionTable['R'] = 'Y';
 481   conversionTable['W'] = 'W';
 482   conversionTable['S'] = 'S';
 483   conversionTable['Y'] = 'R';
 484   conversionTable['K'] = 'M';
 485   conversionTable['V'] = 'B';
 486   conversionTable['H'] = 'D';
 487   conversionTable['D'] = 'H';
 488   conversionTable['B'] = 'V';
 489
 490   // finally, the actual conversion loop
 491   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 492   {
 493     //cout << "** i = " << seq_i << " bp = " <<
 494     table_i = (int) a_motif[seq_i];
 495     rev_comp += conversionTable[table_i];
 496   }
 497
 498   cout << "seq: " << a_motif << endl;
 499   cout << "rc:  " << rev_comp << endl;
 500
 501   return rev_comp;
 502 }
 503
 504 string
 505 Sequence::motif_validate(string a_motif)
 506 {
 507   string valid_motif;
 508   int seq_i, len;
 509
 510
 511   len = a_motif.length();
 512   valid_motif.reserve(len);
 513
 514   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 515   // current nonIUPAC symbols are omitted, which is not reported atm
 516   for(seq_i = 0; seq_i < len; seq_i++)
 517   {
 518      if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 519       valid_motif += 'A';
 520     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 521       valid_motif += 'T';
 522     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 523       valid_motif += 'G';
 524     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 525       valid_motif += 'C';
 526     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 527       valid_motif += 'N';
 528     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 529       valid_motif += 'M';
 530     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 531       valid_motif += 'R';
 532     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 533       valid_motif += 'W';
 534     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 535       valid_motif += 'S';
 536     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 537       valid_motif += 'Y';
 538     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 539       valid_motif += 'G';
 540     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 541       valid_motif += 'V';
 542     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 543       valid_motif += 'H';
 544     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 545       valid_motif += 'D';
 546     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 547       valid_motif += 'B';
 548    }
 549
 550   cout << "valid_motif is: " << valid_motif << endl;
 551
 552   return valid_motif;
 553 }
 554
 555
 556 vector<int>
 557 Sequence::find_motif(string a_motif)
 558 {
 559   vector<int> motif_match_starts;
 560   string a_motif_rc;
 561
 562
 563   motif_match_starts.clear();
 564
 565   cout << "motif is: " << a_motif << endl;
 566   a_motif = motif_validate(a_motif);
 567   //cout << "motif is: " << a_motif << endl;
 568
 569
 570   if (a_motif != "")
 571   {
 572     cout << "Sequence: none blank motif\n";
 573     motif_scan(a_motif, &motif_match_starts);
 574
 575     a_motif_rc = rc_motif(a_motif);
 576     // make sure not to do search again if it is a palindrome
 577     if (a_motif_rc != a_motif)
 578       motif_scan(a_motif_rc, &motif_match_starts);
 579   }
 580   return motif_match_starts;
 581 }
 582
 583 void
 584 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 585 {
 586   char * seq_c;
 587   int seq_i, motif_i, motif_len;
 588
 589   // faster to loop thru the sequence as a old c string (ie char array)
 590   seq_c = (char*)sequence.c_str();
 591   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 592   motif_len = a_motif.length();
 593
 594   //cout << "motif_length: " << motif_len << endl;
 595   //cout << "RAAARRRRR\n";
 596
 597   motif_i = 0;
 598
 599   //cout << "motif: " << a_motif << endl;
 600
 601   //cout << "Sequence: motif, length= " << length << endl;
 602   seq_i = 0;
 603   while (seq_i < length)
 604   {
 605     cout << seq_c[seq_i];
 606     //if ((seq_i > 10885) && (seq_i < 10917))
 607     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 608     // this is pretty much a straight translation of Nora's python code
 609     // to match iupac letter codes
 610     if (a_motif[motif_i] =='N')
 611       motif_i++;
 612     else if (a_motif[motif_i] == seq_c[seq_i])
 613       motif_i++;
 614     else if ((a_motif[motif_i] =='M') &&
 615              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 616       motif_i++;
 617     else if ((a_motif[motif_i] =='R') &&
 618              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 619       motif_i++;
 620     else if ((a_motif[motif_i] =='W') &&
 621              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 622       motif_i++;
 623     else if ((a_motif[motif_i] =='S') &&
 624              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 625       motif_i++;
 626     else if ((a_motif[motif_i] =='Y') &&
 627              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 628       motif_i++;
 629     else if ((a_motif[motif_i] =='K') &&
 630              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 631       motif_i++;
 632     else if ((a_motif[motif_i] =='V') &&
 633              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 634               (seq_c[seq_i]=='G')))
 635       motif_i++;
 636     else if ((a_motif[seq_i] =='H') &&
 637              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 638               (seq_c[seq_i]=='T')))
 639       motif_i++;
 640     else if ((a_motif[motif_i] =='D') &&
 641              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 642               (seq_c[seq_i]=='T')))
 643       motif_i++;
 644     else if ((a_motif[motif_i] =='B') &&
 645              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 646               (seq_c[seq_i]=='T')))
 647       motif_i++;
 648
 649     else
 650     {
 651       seq_i -= motif_i;
 652       motif_i = 0;
 653     }
 654
 655     // end Nora stuff, now we see if a match is found this pass
 656     if (motif_i == motif_len)
 657     {
 658       //cout << "!!";
 659       motif_match_starts->push_back(seq_i - motif_len + 1);
 660       motif_i = 0;
 661     }
 662
 663     seq_i++;
 664   }
 665   cout << endl;
 666 }
 667
 668 /*
 669         // get annot start index
 670         space_split_i = file_data_line.find(" ");
 671         annot_value = file_data_line.substr(0,space_split_i);
 672         an_annot.name = annot_value;
 673         file_data_line = file_data_line.substr(space_split_i+1);
 674         // get annot start index
 675         space_split_i = file_data_line.find(" ");
 676         annot_value = file_data_line.substr(0,space_split_i);
 677         an_annot.type = annot_value;
 678 */