sequence.cc

   1 //  This file is part of the Mussa source distribution.
   2 //  http://mussa.caltech.edu/
   3 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
   4
   5 // This program and all associated source code files are Copyright (C) 2005
   6 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
   7 // under the GNU Public License; please see the included LICENSE.txt
   8 // file for more information, or contact Tristan directly.
   9
  10
  11 //  This file is part of the Mussa source distribution.
  12 //  http://mussa.caltech.edu/
  13 //  Contact author: Tristan  De Buysscher, tristan@caltech.edu
  14
  15 // This program and all associated source code files are Copyright (C) 2005
  16 // the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
  17 // under the GNU Public License; please see the included LICENSE.txt
  18 // file for more information, or contact Tristan directly.
  19
  20
  21 //                        ----------------------------------------
  22 //                           ---------- sequence.cc -----------
  23 //                        ----------------------------------------
  24
  25 #include "sequence.hh"
  26 #include <iostream>
  27
  28 using namespace std;
  29
  30 Sequence::Sequence()
  31 {
  32 }
  33
  34
  35 string
  36 Sequence::load_fasta(string file_path, int seq_num,
  37                      int start_index, int end_index)
  38 {
  39   fstream data_file;
  40   string file_data_line;
  41   int header_counter = 0;
  42   bool read_seq = true;
  43   string rev_comp;
  44   char conversionTable[257];
  45   int seq_i, table_i, len;
  46   string sequence_raw;
  47   char * seq_tmp;             // holds sequence during basic filtering
  48   string err_msg;
  49
  50   data_file.open(file_path.c_str(), ios::in);
  51
  52   if (!data_file)
  53   {
  54     err_msg = "Sequence File: " + file_path + " not found";
  55     return err_msg;
  56   }
  57   // if file opened okay, read it
  58   else
  59   {
  60     // search for the header of the fasta sequence we want
  61     while ( (!data_file.eof()) && (header_counter < seq_num) )
  62     {
  63       getline(data_file,file_data_line);
  64       if (file_data_line.substr(0,1) == ">")
  65         header_counter++;
  66     }
  67
  68     header = file_data_line.substr(1);
  69
  70     sequence_raw = "";
  71
  72     while ( !data_file.eof() && read_seq )
  73     {
  74       getline(data_file,file_data_line);
  75       if (file_data_line.substr(0,1) == ">")
  76         read_seq = false;
  77       else sequence_raw += file_data_line;
  78     }
  79
  80     data_file.close();
  81
  82     // sequence filtering for upcasing agctn and convert non AGCTN to N
  83
  84     len = sequence_raw.length();
  85     seq_tmp = (char*)sequence_raw.c_str();
  86
  87     sequence = "";
  88
  89     // Make a conversion table
  90
  91     // everything we don't specify below will become 'N'
  92     for(table_i=0; table_i < 256; table_i++)
  93     {
  94       conversionTable[table_i] = 'N';
  95     }
  96     // add end of string character for printing out table for testing purposes
  97     conversionTable[256] = '\0';
  98
  99     // we want these to map to themselves - ie not to change
 100     conversionTable['A'] = 'A';
 101     conversionTable['T'] = 'T';
 102     conversionTable['G'] = 'G';
 103     conversionTable['C'] = 'C';
 104     // this is to upcase
 105     conversionTable['a'] = 'A';
 106     conversionTable['t'] = 'T';
 107     conversionTable['g'] = 'G';
 108     conversionTable['c'] = 'C';
 109
 110     // finally, the actual conversion loop
 111     for(seq_i = 0; seq_i < len; seq_i++)
 112     {
 113       table_i = (int) seq_tmp[seq_i];
 114       sequence += conversionTable[table_i];
 115     }
 116
 117
 118     // Lastly, if subselection of the sequence was specified we keep cut out
 119     // and only keep that part
 120
 121     // end_index = 0 means no end was specified, so cut to the end
 122     if (end_index == 0)
 123       end_index = len;
 124
 125     // start_index = 0 if no start was specified
 126     sequence = sequence.substr(start_index, end_index - start_index);
 127
 128     return "";
 129   }
 130 }
 131
 132
 133   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 134   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 135
 136
 137 string
 138 Sequence::load_annot(string file_path, int start_index, int end_index)
 139 {
 140   fstream data_file;
 141   string file_data_line;
 142   annot an_annot;
 143   int space_split_i;
 144   string annot_value;
 145   list<annot>::iterator list_i;
 146   string err_msg;
 147
 148
 149   annots.clear();
 150   data_file.open(file_path.c_str(), ios::in);
 151
 152   if (!data_file)
 153   {
 154     err_msg = "Sequence File: " + file_path + " not found";
 155     return err_msg;
 156   }
 157   // if file opened okay, read it
 158   else
 159   {
 160     getline(data_file,file_data_line);
 161     species = file_data_line;
 162
 163     // end_index = 0 means no end was specified, so cut to the end
 164     if (end_index == 0)
 165       end_index = sequence.length();
 166
 167     //cout << "START: " << start_index << " END: " << end_index << endl;
 168
 169     while ( !data_file.eof() )
 170     {
 171       getline(data_file,file_data_line);
 172       if (file_data_line != "")
 173       {
 174         // need to get 4 values...almost same code 4 times...
 175         // get annot start index
 176         space_split_i = file_data_line.find(" ");
 177         annot_value = file_data_line.substr(0,space_split_i);
 178         an_annot.start = atoi (annot_value.c_str());
 179         file_data_line = file_data_line.substr(space_split_i+1);
 180         // get annot end index
 181         space_split_i = file_data_line.find(" ");
 182         annot_value = file_data_line.substr(0,space_split_i);
 183         an_annot.end = atoi (annot_value.c_str());
 184         file_data_line = file_data_line.substr(space_split_i+1);
 185
 186         cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 187              << endl;
 188
 189         // get annot name
 190         space_split_i = file_data_line.find(" ");
 191         if (space_split_i == string::npos)  // no entries for name & type
 192         {
 193           cout << "seq, annots - no name or type\n";
 194           an_annot.name = "";
 195           an_annot.type = "";
 196         }
 197         else
 198         {
 199           annot_value = file_data_line.substr(0,space_split_i);
 200           an_annot.name = annot_value;
 201           file_data_line = file_data_line.substr(space_split_i+1);
 202           // get annot type
 203           space_split_i = file_data_line.find(" ");
 204           if (space_split_i == string::npos)  // no entry for type
 205             an_annot.type = "";
 206           else
 207           {
 208             annot_value = file_data_line.substr(0,space_split_i);
 209             an_annot.type = annot_value;
 210           }
 211         }
 212
 213
 214         // add annot to list if it falls within the range of sequence specified
 215         if ((start_index <= an_annot.start) && (end_index >= an_annot.end))
 216         {
 217           an_annot.start -= start_index;
 218           an_annot.end -= start_index;
 219           annots.push_back(an_annot);
 220         }
 221         else
 222           cout << "FAILED!!!!!!\n";
 223       }
 224     }
 225
 226     data_file.close();
 227     /*
 228     // debugging check
 229     for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 230     {
 231       cout << (*list_i).start << "," << (*list_i).end << "\t";
 232       cout << (*list_i).name << "\t" << (*list_i).type << endl;
 233     }
 234     */
 235     return "";
 236   }
 237 }
 238
 239
 240 int
 241 Sequence::len()
 242 {
 243   return sequence.length();
 244 }
 245
 246
 247 string
 248 Sequence::seq()
 249 {
 250   return sequence;
 251 }
 252
 253
 254 string
 255 Sequence::subseq(int start, int end)
 256 {
 257   return sequence.substr(start, end);
 258 }
 259
 260
 261 const char *
 262 Sequence::c_seq()
 263 {
 264   return sequence.c_str();
 265 }
 266
 267 string
 268 Sequence::rev_comp()
 269 {
 270   string rev_comp;
 271   char conversionTable[257];
 272   int seq_i, table_i, len;
 273
 274   len = sequence.length();
 275   rev_comp.reserve(len);
 276   // make a conversion table
 277   // init all parts of conversion table to '~' character
 278   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 279   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 280   // seven generations of their progeny) who decides to make it mean
 281   // something special!!!
 282   // PS - double the curse for any smartass non-biologist who tries it as well
 283   for(table_i=0; table_i < 256; table_i++)
 284   {
 285     conversionTable[table_i] = '~';
 286   }
 287   // add end of string character for printing out table for testing purposes
 288   conversionTable[256] = '\0';
 289
 290   // add in the characters for the bases we want to convert
 291   conversionTable['A'] = 'T';
 292   conversionTable['T'] = 'A';
 293   conversionTable['G'] = 'C';
 294   conversionTable['C'] = 'G';
 295   conversionTable['N'] = 'N';
 296
 297   // finally, the actual conversion loop
 298   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 299   {
 300     table_i = (int) sequence[seq_i];
 301     rev_comp += conversionTable[table_i];
 302   }
 303
 304   return rev_comp;
 305 }
 306
 307
 308 string
 309 Sequence::hdr()
 310 {
 311   return header;
 312 }
 313
 314 string
 315 Sequence::sp_name()
 316 {
 317   return species;
 318 }
 319
 320
 321 void
 322 Sequence::set_seq(string a_seq)
 323 {
 324   sequence = a_seq;
 325 }
 326
 327
 328 /*
 329 string
 330 Sequence::species()
 331 {
 332   return species;
 333 }
 334 */
 335
 336 void
 337 Sequence::clear()
 338 {
 339   sequence = "";
 340   header = "";
 341   species = "";
 342   species_num = -4;
 343   annots.clear();
 344 }
 345
 346 void
 347 Sequence::save(fstream &save_file)
 348                //string save_file_path)
 349 {
 350   //fstream save_file;
 351   list<annot>::iterator annots_i;
 352   int i;
 353
 354   // not sure why, or if i'm doing something wrong, but can't seem to pass
 355   // file pointers down to this method from the mussa control class
 356   // so each call to save a sequence appends to the file started by mussa_class
 357   //save_file.open(save_file_path.c_str(), ios::app);
 358
 359   save_file << "<Sequence>" << endl;
 360   save_file << sequence << endl;
 361   save_file << "</Sequence>" << endl;
 362
 363   save_file << "<Annotations>" << endl;
 364   save_file << species << endl;
 365   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 366   {
 367     save_file << annots_i->start << " " << annots_i->end << " " ;
 368     save_file << annots_i->name << " " << annots_i->type << endl;
 369   }
 370   save_file << "</Annotations>" << endl;
 371   //save_file.close();
 372 }
 373
 374 void
 375 Sequence::load_museq(string load_file_path, int seq_num)
 376 {
 377   fstream load_file;
 378   string file_data_line;
 379   int seq_counter;
 380   annot an_annot;
 381   int space_split_i;
 382   string annot_value;
 383
 384   annots.clear();
 385   load_file.open(load_file_path.c_str(), ios::in);
 386
 387   seq_counter = 0;
 388   // search for the seq_num-th sequence
 389   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 390   {
 391     getline(load_file,file_data_line);
 392     if (file_data_line == "<Sequence>")
 393       seq_counter++;
 394   }
 395   getline(load_file, file_data_line);
 396   sequence = file_data_line;
 397   getline(load_file, file_data_line);
 398   getline(load_file, file_data_line);
 399   if (file_data_line == "<Annotations>")
 400   {
 401     getline(load_file, file_data_line);
 402     species = file_data_line;
 403     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 404     {
 405       getline(load_file,file_data_line);
 406       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 407       {
 408         // need to get 4 values...almost same code 4 times...
 409         // get annot start index
 410         space_split_i = file_data_line.find(" ");
 411         annot_value = file_data_line.substr(0,space_split_i);
 412         an_annot.start = atoi (annot_value.c_str());
 413         file_data_line = file_data_line.substr(space_split_i+1);
 414         // get annot end index
 415         space_split_i = file_data_line.find(" ");
 416         annot_value = file_data_line.substr(0,space_split_i);
 417         an_annot.end = atoi (annot_value.c_str());
 418
 419         if (space_split_i == string::npos)  // no entry for type or name
 420         {
 421           cout << "seq, annots - no type or name\n";
 422           an_annot.type = "";
 423           an_annot.name = "";
 424         }
 425         else   // else get annot type
 426         {
 427           file_data_line = file_data_line.substr(space_split_i+1);
 428           space_split_i = file_data_line.find(" ");
 429           annot_value = file_data_line.substr(0,space_split_i);
 430           an_annot.type = annot_value;
 431           if (space_split_i == string::npos)  // no entry for name
 432           {
 433             cout << "seq, annots - no name\n";
 434             an_annot.name = "";
 435           }
 436           else          // get annot name
 437           {
 438             file_data_line = file_data_line.substr(space_split_i+1);
 439             space_split_i = file_data_line.find(" ");
 440             annot_value = file_data_line.substr(0,space_split_i);
 441             an_annot.type = annot_value;
 442           }
 443         }
 444         annots.push_back(an_annot);  // don't forget to actually add the annot
 445       }
 446       cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
 447            << "-->" << an_annot.type << "::" << an_annot.name << endl;
 448     }
 449   }
 450   load_file.close();
 451 }
 452
 453
 454 string
 455 Sequence::rc_motif(string a_motif)
 456 {
 457   string rev_comp;
 458   char conversionTable[257];
 459   int seq_i, table_i, len;
 460
 461   len = a_motif.length();
 462   rev_comp.reserve(len);
 463
 464   for(table_i=0; table_i < 256; table_i++)
 465   {
 466     conversionTable[table_i] = '~';
 467   }
 468   // add end of string character for printing out table for testing purposes
 469   conversionTable[256] = '\0';
 470
 471   // add in the characters for the bases we want to convert (IUPAC)
 472   conversionTable['A'] = 'T';
 473   conversionTable['T'] = 'A';
 474   conversionTable['G'] = 'C';
 475   conversionTable['C'] = 'G';
 476   conversionTable['N'] = 'N';
 477   conversionTable['M'] = 'K';
 478   conversionTable['R'] = 'Y';
 479   conversionTable['W'] = 'W';
 480   conversionTable['S'] = 'S';
 481   conversionTable['Y'] = 'R';
 482   conversionTable['K'] = 'M';
 483   conversionTable['V'] = 'B';
 484   conversionTable['H'] = 'D';
 485   conversionTable['D'] = 'H';
 486   conversionTable['B'] = 'V';
 487
 488   // finally, the actual conversion loop
 489   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 490   {
 491     //cout << "** i = " << seq_i << " bp = " <<
 492     table_i = (int) a_motif[seq_i];
 493     rev_comp += conversionTable[table_i];
 494   }
 495
 496   cout << "seq: " << a_motif << endl;
 497   cout << "rc:  " << rev_comp << endl;
 498
 499   return rev_comp;
 500 }
 501
 502 string
 503 Sequence::motif_validate(string a_motif)
 504 {
 505   string valid_motif;
 506   int seq_i, len;
 507
 508   len = a_motif.length();
 509   valid_motif.reserve(len);
 510
 511   // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
 512   // current nonIUPAC symbols are omitted, which is not reported atm
 513   for(seq_i = 0; seq_i < len; seq_i++)
 514   {
 515      if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
 516       valid_motif += 'A';
 517     else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
 518       valid_motif += 'T';
 519     else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
 520       valid_motif += 'G';
 521     else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
 522       valid_motif += 'C';
 523     else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
 524       valid_motif += 'N';
 525     else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
 526       valid_motif += 'M';
 527     else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
 528       valid_motif += 'R';
 529     else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
 530       valid_motif += 'W';
 531     else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
 532       valid_motif += 'S';
 533     else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
 534       valid_motif += 'Y';
 535     else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
 536       valid_motif += 'G';
 537     else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
 538       valid_motif += 'V';
 539     else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
 540       valid_motif += 'H';
 541     else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
 542       valid_motif += 'D';
 543     else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
 544       valid_motif += 'B';
 545    }
 546
 547   cout << "valid_motif is: " << valid_motif << endl;
 548
 549   return valid_motif;
 550 }
 551
 552
 553 vector<int>
 554 Sequence::find_motif(string a_motif)
 555 {
 556   vector<int> motif_match_starts;
 557   string a_motif_rc;
 558
 559
 560   motif_match_starts.clear();
 561
 562   cout << "motif is: " << a_motif << endl;
 563   a_motif = motif_validate(a_motif);
 564   //cout << "motif is: " << a_motif << endl;
 565
 566
 567   if (a_motif != "")
 568   {
 569     cout << "Sequence: none blank motif\n";
 570     motif_scan(a_motif, &motif_match_starts);
 571
 572     a_motif_rc = rc_motif(a_motif);
 573     // make sure not to do search again if it is a palindrome
 574     if (a_motif_rc != a_motif)
 575       motif_scan(a_motif_rc, &motif_match_starts);
 576   }
 577   return motif_match_starts;
 578 }
 579
 580 void
 581 Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
 582 {
 583   char * seq_c;
 584   int seq_i, motif_i, motif_len;
 585
 586   // faster to loop thru the sequence as a old c string (ie char array)
 587   seq_c = (char*)sequence.c_str();
 588   //cout << "Sequence: motif, seq len = " << sequence.length() << endl;
 589   motif_len = a_motif.length();
 590
 591   //cout << "motif_length: " << motif_len << endl;
 592   //cout << "RAAARRRRR\n";
 593
 594   motif_i = 0;
 595
 596   //cout << "motif: " << a_motif << endl;
 597
 598   //cout << "Sequence: motif, length= " << length << endl;
 599   seq_i = 0;
 600   while (seq_i < sequence.length())
 601   {
 602     cout << seq_c[seq_i];
 603     //if ((seq_i > 10885) && (seq_i < 10917))
 604     //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
 605     // this is pretty much a straight translation of Nora's python code
 606     // to match iupac letter codes
 607     if (a_motif[motif_i] =='N')
 608       motif_i++;
 609     else if (a_motif[motif_i] == seq_c[seq_i])
 610       motif_i++;
 611     else if ((a_motif[motif_i] =='M') &&
 612              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 613       motif_i++;
 614     else if ((a_motif[motif_i] =='R') &&
 615              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 616       motif_i++;
 617     else if ((a_motif[motif_i] =='W') &&
 618              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 619       motif_i++;
 620     else if ((a_motif[motif_i] =='S') &&
 621              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 622       motif_i++;
 623     else if ((a_motif[motif_i] =='Y') &&
 624              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 625       motif_i++;
 626     else if ((a_motif[motif_i] =='K') &&
 627              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 628       motif_i++;
 629     else if ((a_motif[motif_i] =='V') &&
 630              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 631               (seq_c[seq_i]=='G')))
 632       motif_i++;
 633     else if ((a_motif[seq_i] =='H') &&
 634              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 635               (seq_c[seq_i]=='T')))
 636       motif_i++;
 637     else if ((a_motif[motif_i] =='D') &&
 638              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 639               (seq_c[seq_i]=='T')))
 640       motif_i++;
 641     else if ((a_motif[motif_i] =='B') &&
 642              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 643               (seq_c[seq_i]=='T')))
 644       motif_i++;
 645
 646     else
 647     {
 648       seq_i -= motif_i;
 649       motif_i = 0;
 650     }
 651
 652     // end Nora stuff, now we see if a match is found this pass
 653     if (motif_i == motif_len)
 654     {
 655       //cout << "!!";
 656       motif_match_starts->push_back(seq_i - motif_len + 1);
 657       motif_i = 0;
 658     }
 659
 660     seq_i++;
 661   }
 662   cout << endl;
 663 }
 664
 665 /*
 666         // get annot start index
 667         space_split_i = file_data_line.find(" ");
 668         annot_value = file_data_line.substr(0,space_split_i);
 669         an_annot.name = annot_value;
 670         file_data_line = file_data_line.substr(space_split_i+1);
 671         // get annot start index
 672         space_split_i = file_data_line.find(" ");
 673         annot_value = file_data_line.substr(0,space_split_i);
 674         an_annot.type = annot_value;
 675 */