sequence.cxx

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "sequence.hh"
  26 #include <iostream>
  27
  28 using namespace std;
  29
  30 Sequence::Sequence()
  31 {
  32 }
  33
  34 //! load a fasta file into a sequence
  35 /*!
  36  * \param file_path the location of the fasta file in the filesystem
  37  * \param seq_num which sequence in the file to load
  38  * \param start_index starting position in the fasta sequence, 0 for beginning
  39  * \param end_index ending position in the fasta sequence, 0 for end
  40  * \return error message, empty string if no error. (gag!)
  41  */
  42 string
  43 Sequence::load_fasta(string file_path, int seq_num,
  44                      int start_index, int end_index)
  45 {
  46   fstream data_file;
  47   string file_data_line;
  48   int header_counter = 0;
  49   bool read_seq = true;
  50   string rev_comp;
  51   char conversionTable[257];
  52   int seq_i, table_i, len;
  53   string sequence_raw;
  54   char * seq_tmp;             // holds sequence during basic filtering
  55   string err_msg;
  56
  57   data_file.open(file_path.c_str(), ios::in);
  58
  59   if (!data_file)
  60   {
  61     err_msg = "Sequence File: " + file_path + " not found";
  62     return err_msg;
  63   }
  64   // if file opened okay, read it
  65   else
  66   {
  67     // search for the header of the fasta sequence we want
  68     while ( (!data_file.eof()) && (header_counter < seq_num) )
  69     {
  70       getline(data_file,file_data_line);
  71       if (file_data_line.substr(0,1) == ">")
  72         header_counter++;
  73     }
  74
  75     header = file_data_line.substr(1);
  76
  77     sequence_raw = "";
  78
  79     while ( !data_file.eof() && read_seq )
  80     {
  81       getline(data_file,file_data_line);
  82       if (file_data_line.substr(0,1) == ">")
  83         read_seq = false;
  84       else sequence_raw += file_data_line;
  85     }
  86
  87     data_file.close();
  88
  89     // sequence filtering for upcasing agctn and convert non AGCTN to N
  90
  91     len = sequence_raw.length();
  92     seq_tmp = (char*)sequence_raw.c_str();
  93
  94     sequence = "";
  95
  96     // Make a conversion table
  97
  98     // everything we don't specify below will become 'N'
  99     for(table_i=0; table_i < 256; table_i++)
 100     {
 101       conversionTable[table_i] = 'N';
 102     }
 103     // add end of string character for printing out table for testing purposes
 104     conversionTable[256] = '\0';
 105
 106     // we want these to map to themselves - ie not to change
 107     conversionTable['A'] = 'A';
 108     conversionTable['T'] = 'T';
 109     conversionTable['G'] = 'G';
 110     conversionTable['C'] = 'C';
 111     // this is to upcase
 112     conversionTable['a'] = 'A';
 113     conversionTable['t'] = 'T';
 114     conversionTable['g'] = 'G';
 115     conversionTable['c'] = 'C';
 116
 117     // finally, the actual conversion loop
 118     for(seq_i = 0; seq_i < len; seq_i++)
 119     {
 120       table_i = (int) seq_tmp[seq_i];
 121       sequence += conversionTable[table_i];
 122     }
 123
 124
 125     // Lastly, if subselection of the sequence was specified we keep cut out
 126     // and only keep that part
 127
 128     // end_index = 0 means no end was specified, so cut to the end
 129     if (end_index == 0)
 130       end_index = len;
 131
 132     // start_index = 0 if no start was specified
 133     sequence = sequence.substr(start_index, end_index - start_index);
 134
 135     return "";
 136   }
 137 }
 138
 139
 140   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 141   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 142
 143
 144 string
 145 Sequence::load_annot(string file_path, int start_index, int end_index)
 146 {
 147   fstream data_file;
 148   string file_data_line;
 149   annot an_annot;
 150   int space_split_i;
 151   string annot_value;
 152   list<annot>::iterator list_i;
 153   string err_msg;
 154
 155
 156   annots.clear();
 157   data_file.open(file_path.c_str(), ios::in);
 158
 159   if (!data_file)
 160   {
 161     err_msg = "Sequence File: " + file_path + " not found";
 162     return err_msg;
 163   }
 164   // if file opened okay, read it
 165   else
 166   {
 167     getline(data_file,file_data_line);
 168     species = file_data_line;
 169
 170     // end_index = 0 means no end was specified, so cut to the end
 171     if (end_index == 0)
 172       end_index = sequence.length();
 173
 174     //cout << "START: " << start_index << " END: " << end_index << endl;
 175
 176     while ( !data_file.eof() )
 177     {
 178       getline(data_file,file_data_line);
 179       if (file_data_line != "")
 180       {
 181         // need to get 4 values...almost same code 4 times...
 182         // get annot start index
 183         space_split_i = file_data_line.find(" ");
 184         annot_value = file_data_line.substr(0,space_split_i);
 185         an_annot.start = atoi (annot_value.c_str());
 186         file_data_line = file_data_line.substr(space_split_i+1);
 187         // get annot end index
 188         space_split_i = file_data_line.find(" ");
 189         annot_value = file_data_line.substr(0,space_split_i);
 190         an_annot.end = atoi (annot_value.c_str());
 191         file_data_line = file_data_line.substr(space_split_i+1);
 192
 193         cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 194              << endl;
 195
 196         // get annot name
 197         space_split_i = file_data_line.find(" ");
 198         if (space_split_i == string::npos)  // no entries for name & type
 199         {
 200           cout << "seq, annots - no name or type\n";
 201           an_annot.name = "";
 202           an_annot.type = "";
 203         }
 204         else
 205         {
 206           annot_value = file_data_line.substr(0,space_split_i);
 207           an_annot.name = annot_value;
 208           file_data_line = file_data_line.substr(space_split_i+1);
 209           // get annot type
 210           space_split_i = file_data_line.find(" ");
 211           if (space_split_i == string::npos)  // no entry for type
 212             an_annot.type = "";
 213           else
 214           {
 215             annot_value = file_data_line.substr(0,space_split_i);
 216             an_annot.type = annot_value;
 217           }
 218         }
 219
 220
 221         // add annot to list if it falls within the range of sequence specified
 222         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 223         {
 224           an_annot.start -= start_index;
 225           an_annot.end -= start_index;
 226           annots.push_back(an_annot);
 227         }
 228         else
 229           cout << "FAILED!!!!!!\n";
 230       }
 231     }
 232
 233     data_file.close();
 234     /*
 235     // debugging check
 236     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 237     {
 238       cout << (*list_i).start << "," << (*list_i).end << "\t";
 239       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 240     }
 241     */
 242     return "";
 243   }
 244 }
 245
 246
 247 int
 248 Sequence::len()
 249 {
 250   return sequence.length();
 251 }
 252
 253
 254 string
 255 Sequence::seq()
 256 {
 257   return sequence;
 258 }
 259
 260
 261 string
 262 Sequence::subseq(int start, int end)
 263 {
 264   return sequence.substr(start, end);
 265 }
 266
 267
 268 const char *
 269 Sequence::c_seq()
 270 {
 271   return sequence.c_str();
 272 }
 273
 274 string
 275 Sequence::rev_comp()
 276 {
 277   string rev_comp;
 278   char conversionTable[257];
 279   int seq_i, table_i, len;
 280
 281   len = sequence.length();
 282   rev_comp.reserve(len);
 283   // make a conversion table
 284   // init all parts of conversion table to '~' character
 285   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 286   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 287   // seven generations of their progeny) who decides to make it mean
 288   // something special!!!
 289   // PS - double the curse for any smartass non-biologist who tries it as well
 290   for(table_i=0; table_i < 256; table_i++)
 291   {
 292     conversionTable[table_i] = '~';
 293   }
 294   // add end of string character for printing out table for testing purposes
 295   conversionTable[256] = '\0';
 296
 297   // add in the characters for the bases we want to convert
 298   conversionTable['A'] = 'T';
 299   conversionTable['T'] = 'A';
 300   conversionTable['G'] = 'C';
 301   conversionTable['C'] = 'G';
 302   conversionTable['N'] = 'N';
 303
 304   // finally, the actual conversion loop
 305   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 306   {
 307     table_i = (int) sequence[seq_i];
 308     rev_comp += conversionTable[table_i];
 309   }
 310
 311   return rev_comp;
 312 }
 313
 314
 315 string
 316 Sequence::hdr()
 317 {
 318   return header;
 319 }
 320
 321 string
 322 Sequence::sp_name()
 323 {
 324   return species;
 325 }
 326
 327
 328 void
 329 Sequence::set_seq(string a_seq)
 330 {
 331   sequence = a_seq;
 332 }
 333
 334
 335 /*
 336 string
 337 Sequence::species()
 338 {
 339   return species;
 340 }
 341 */
 342
 343 void
 344 Sequence::clear()
 345 {
 346   sequence = "";
 347   header = "";
 348   species = "";
 349   species_num = -4;
 350   annots.clear();
 351 }
 352
 353 void
 354 Sequence::save(fstream &save_file)
 355                //string save_file_path)
 356 {
 357   //fstream save_file;
 358   list<annot>::iterator annots_i;
 359   int i;
 360
 361   // not sure why, or if i'm doing something wrong, but can't seem to pass
 362   // file pointers down to this method from the mussa control class
 363   // so each call to save a sequence appends to the file started by mussa_class
 364   //save_file.open(save_file_path.c_str(), ios::app);
 365
 366   save_file << "<Sequence>" << endl;
 367   save_file << sequence << endl;
 368   save_file << "</Sequence>" << endl;
 369
 370   save_file << "<Annotations>" << endl;
 371   save_file << species << endl;
 372   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 373   {
 374     save_file << annots_i->start << " " << annots_i->end << " " ;
 375     save_file << annots_i->name << " " << annots_i->type << endl;
 376   }
 377   save_file << "</Annotations>" << endl;
 378   //save_file.close();
 379 }
 380
 381 void
 382 Sequence::load_museq(string load_file_path, int seq_num)
 383 {
 384   fstream load_file;
 385   string file_data_line;
 386   int seq_counter;
 387   annot an_annot;
 388   int space_split_i;
 389   string annot_value;
 390
 391   annots.clear();
 392   load_file.open(load_file_path.c_str(), ios::in);
 393
 394   seq_counter = 0;
 395   // search for the seq_num-th sequence
 396   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 397   {
 398     getline(load_file,file_data_line);
 399     if (file_data_line == "<Sequence>")
 400       seq_counter++;
 401   }
 402   getline(load_file, file_data_line);
 403   sequence = file_data_line;
 404   getline(load_file, file_data_line);
 405   getline(load_file, file_data_line);
 406   if (file_data_line == "<Annotations>")
 407   {
 408     getline(load_file, file_data_line);
 409     species = file_data_line;
 410     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 411     {
 412       getline(load_file,file_data_line);
 413       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 414       {
 415         // need to get 4 values...almost same code 4 times...
 416         // get annot start index
 417         space_split_i = file_data_line.find(" ");
 418         annot_value = file_data_line.substr(0,space_split_i);
 419         an_annot.start = atoi (annot_value.c_str());
 420         file_data_line = file_data_line.substr(space_split_i+1);
 421         // get annot end index
 422         space_split_i = file_data_line.find(" ");
 423         annot_value = file_data_line.substr(0,space_split_i);
 424         an_annot.end = atoi (annot_value.c_str());
 425
 426         if (space_split_i == string::npos)  // no entry for type or name
 427         {
 428           cout << "seq, annots - no type or name\n";
 429           an_annot.type = "";
 430           an_annot.name = "";
 431         }
 432         else   // else get annot type
 433         {
 434           file_data_line = file_data_line.substr(space_split_i+1);
 435           space_split_i = file_data_line.find(" ");
 436           annot_value = file_data_line.substr(0,space_split_i);
 437           an_annot.type = annot_value;
 438           if (space_split_i == string::npos)  // no entry for name
 439           {
 440             cout << "seq, annots - no name\n";
 441             an_annot.name = "";
 442           }
 443           else          // get annot name
 444           {
 445             file_data_line = file_data_line.substr(space_split_i+1);
 446             space_split_i = file_data_line.find(" ");
 447             annot_value = file_data_line.substr(0,space_split_i);
 448             an_annot.type = annot_value;
 449           }
 450         }
 451         annots.push_back(an_annot);  // don't forget to actually add the annot
 452       }
 453       cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 454            << "-->" << an_annot.type << "::" << an_annot.name << endl;
 455     }
 456   }
 457   load_file.close();
 458 }
 459
 460
 461 string
 462 Sequence::rc_motif(string a_motif)
 463 {
 464   string rev_comp;
 465   char conversionTable[257];
 466   int seq_i, table_i, len;
 467
 468   len = a_motif.length();
 469   rev_comp.reserve(len);
 470
 471   for(table_i=0; table_i < 256; table_i++)
 472   {
 473     conversionTable[table_i] = '~';
 474   }
 475   // add end of string character for printing out table for testing purposes
 476   conversionTable[256] = '\0';
 477
 478   // add in the characters for the bases we want to convert (IUPAC)
 479   conversionTable['A'] = 'T';
 480   conversionTable['T'] = 'A';
 481   conversionTable['G'] = 'C';
 482   conversionTable['C'] = 'G';
 483   conversionTable['N'] = 'N';
 484   conversionTable['M'] = 'K';
 485   conversionTable['R'] = 'Y';
 486   conversionTable['W'] = 'W';
 487   conversionTable['S'] = 'S';
 488   conversionTable['Y'] = 'R';
 489   conversionTable['K'] = 'M';
 490   conversionTable['V'] = 'B';
 491   conversionTable['H'] = 'D';
 492   conversionTable['D'] = 'H';
 493   conversionTable['B'] = 'V';
 494
 495   // finally, the actual conversion loop
 496   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 497   {
 498     //cout << "** i = " << seq_i << " bp = " <<
 499     table_i = (int) a_motif[seq_i];
 500     rev_comp += conversionTable[table_i];
 501   }
 502
 503   cout << "seq: " << a_motif << endl;
 504   cout << "rc:  " << rev_comp << endl;
 505
 506   return rev_comp;
 507 }
 508
 509 string
 510 Sequence::motif_validate(string a_motif)
 511 {
 512   string valid_motif;
 513   int seq_i, len;
 514
 515   len = a_motif.length();
 516   valid_motif.reserve(len);
 517
 518   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 519   // current nonIUPAC symbols are omitted, which is not reported atm
 520   for(seq_i = 0; seq_i < len; seq_i++)
 521   {
 522      if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 523       valid_motif += 'A';
 524     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 525       valid_motif += 'T';
 526     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 527       valid_motif += 'G';
 528     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 529       valid_motif += 'C';
 530     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 531       valid_motif += 'N';
 532     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 533       valid_motif += 'M';
 534     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 535       valid_motif += 'R';
 536     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 537       valid_motif += 'W';
 538     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 539       valid_motif += 'S';
 540     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 541       valid_motif += 'Y';
 542     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 543       valid_motif += 'G';
 544     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 545       valid_motif += 'V';
 546     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 547       valid_motif += 'H';
 548     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 549       valid_motif += 'D';
 550     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 551       valid_motif += 'B';
 552    }
 553
 554   cout << "valid_motif is: " << valid_motif << endl;
 555
 556   return valid_motif;
 557 }
 558
 559
 560 vector<int>
 561 Sequence::find_motif(string a_motif)
 562 {
 563   vector<int> motif_match_starts;
 564   string a_motif_rc;
 565
 566
 567   motif_match_starts.clear();
 568
 569   cout << "motif is: " << a_motif << endl;
 570   a_motif = motif_validate(a_motif);
 571   //cout << "motif is: " << a_motif << endl;
 572
 573
 574   if (a_motif != "")
 575   {
 576     cout << "Sequence: none blank motif\n";
 577     motif_scan(a_motif, &motif_match_starts);
 578
 579     a_motif_rc = rc_motif(a_motif);
 580     // make sure not to do search again if it is a palindrome
 581     if (a_motif_rc != a_motif)
 582       motif_scan(a_motif_rc, &motif_match_starts);
 583   }
 584   return motif_match_starts;
 585 }
 586
 587 void
 588 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 589 {
 590   char * seq_c;
 591   int seq_i, motif_i, motif_len;
 592
 593   // faster to loop thru the sequence as a old c string (ie char array)
 594   seq_c = (char*)sequence.c_str();
 595   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 596   motif_len = a_motif.length();
 597
 598   //cout << "motif_length: " << motif_len << endl;
 599   //cout << "RAAARRRRR\n";
 600
 601   motif_i = 0;
 602
 603   //cout << "motif: " << a_motif << endl;
 604
 605   //cout << "Sequence: motif, length= " << length << endl;
 606   seq_i = 0;
 607   while (seq_i < sequence.length())
 608   {
 609     cout << seq_c[seq_i];
 610     //if ((seq_i > 10885) && (seq_i < 10917))
 611     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 612     // this is pretty much a straight translation of Nora's python code
 613     // to match iupac letter codes
 614     if (a_motif[motif_i] =='N')
 615       motif_i++;
 616     else if (a_motif[motif_i] == seq_c[seq_i])
 617       motif_i++;
 618     else if ((a_motif[motif_i] =='M') &&
 619              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 620       motif_i++;
 621     else if ((a_motif[motif_i] =='R') &&
 622              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 623       motif_i++;
 624     else if ((a_motif[motif_i] =='W') &&
 625              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 626       motif_i++;
 627     else if ((a_motif[motif_i] =='S') &&
 628              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 629       motif_i++;
 630     else if ((a_motif[motif_i] =='Y') &&
 631              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 632       motif_i++;
 633     else if ((a_motif[motif_i] =='K') &&
 634              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 635       motif_i++;
 636     else if ((a_motif[motif_i] =='V') &&
 637              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 638               (seq_c[seq_i]=='G')))
 639       motif_i++;
 640     else if ((a_motif[seq_i] =='H') &&
 641              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 642               (seq_c[seq_i]=='T')))
 643       motif_i++;
 644     else if ((a_motif[motif_i] =='D') &&
 645              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 646               (seq_c[seq_i]=='T')))
 647       motif_i++;
 648     else if ((a_motif[motif_i] =='B') &&
 649              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 650               (seq_c[seq_i]=='T')))
 651       motif_i++;
 652
 653     else
 654     {
 655       seq_i -= motif_i;
 656       motif_i = 0;
 657     }
 658
 659     // end Nora stuff, now we see if a match is found this pass
 660     if (motif_i == motif_len)
 661     {
 662       //cout << "!!";
 663       motif_match_starts->push_back(seq_i - motif_len + 1);
 664       motif_i = 0;
 665     }
 666
 667     seq_i++;
 668   }
 669   cout << endl;
 670 }
 671
 672 /*
 673         // get annot start index
 674         space_split_i = file_data_line.find(" ");
 675         annot_value = file_data_line.substr(0,space_split_i);
 676         an_annot.name = annot_value;
 677         file_data_line = file_data_line.substr(space_split_i+1);
 678         // get annot start index
 679         space_split_i = file_data_line.find(" ");
 680         annot_value = file_data_line.substr(0,space_split_i);
 681         an_annot.type = annot_value;
 682 */