sequence.cc

   1 //                        ----------------------------------------
   2 //                           ---------- sequence.cc -----------
   3 //                        ----------------------------------------
   4
   5 #include "sequence.hh"
   6
   7
   8 Sequence::Sequence()
   9 {
  10 }
  11
  12
  13 void
  14 Sequence::load_fasta(string file_path, int seq_num,
  15                      int start_index, int end_index)
  16 {
  17   fstream data_file;
  18   string file_data_line;
  19   int header_counter = 0;
  20   bool read_seq = true;
  21   string rev_comp;
  22   char conversionTable[257];
  23   int seq_i, table_i, len;
  24   string sequence_raw;
  25   char * seq_tmp;             // holds sequence during basic filtering
  26
  27
  28   data_file.open(file_path.c_str(), ios::in);
  29
  30
  31   // search for the header of the fasta sequence we want
  32   while ( (!data_file.eof()) && (header_counter < seq_num) )
  33   {
  34     getline(data_file,file_data_line);
  35     if (file_data_line.substr(0,1) == ">")
  36       header_counter++;
  37   }
  38
  39   header = file_data_line.substr(1);
  40
  41   sequence_raw = "";
  42
  43   while ( !data_file.eof() && read_seq )
  44   {
  45     getline(data_file,file_data_line);
  46     if (file_data_line.substr(0,1) == ">")
  47       read_seq = false;
  48     else sequence_raw += file_data_line;
  49   }
  50
  51   data_file.close();
  52
  53   // sequence filtering for upcasing agctn and convert non AGCTN to N
  54
  55   len = sequence_raw.length();
  56   seq_tmp = (char*)sequence_raw.c_str();
  57
  58   sequence = "";
  59
  60   // Make a conversion table
  61
  62   // everything we don't specify below will become 'N'
  63   for(table_i=0; table_i < 256; table_i++)
  64   {
  65     conversionTable[table_i] = 'N';
  66   }
  67   // add end of string character for printing out table for testing purposes
  68   conversionTable[256] = '\0';
  69
  70   // we want these to map to themselves - ie not to change
  71   conversionTable['A'] = 'A';
  72   conversionTable['T'] = 'T';
  73   conversionTable['G'] = 'G';
  74   conversionTable['C'] = 'C';
  75   // this is to upcase
  76   conversionTable['a'] = 'A';
  77   conversionTable['t'] = 'T';
  78   conversionTable['g'] = 'G';
  79   conversionTable['c'] = 'C';
  80
  81   // finally, the actual conversion loop
  82   for(seq_i = 0; seq_i < len; seq_i++)
  83   {
  84     table_i = (int) seq_tmp[seq_i];
  85     sequence += conversionTable[table_i];
  86   }
  87
  88
  89   // Lastly, if subselection of the sequence was specified we keep cut out
  90   // and only keep that part
  91
  92   // end_index = 0 means no end was specified, so cut to the end
  93   if (end_index == 0)
  94     end_index = len;
  95
  96   // start_index = 0 if no start was specified
  97   sequence = sequence.substr(start_index, end_index - start_index);
  98 }
  99
 100
 101   // this doesn't work properly under gcc 3.x ... it can't recognize toupper
 102   //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
 103
 104
 105 void
 106 Sequence::load_annot(string file_path)
 107 {
 108   fstream data_file;
 109   string file_data_line;
 110   annot an_annot;
 111   int space_split_i;
 112   string annot_value;
 113
 114   annots.clear();
 115   data_file.open(file_path.c_str(), ios::in);
 116
 117   getline(data_file,file_data_line);
 118   species = file_data_line;
 119
 120   while ( !data_file.eof() )
 121   {
 122     getline(data_file,file_data_line);
 123     if (file_data_line != "")
 124     {
 125       // need to get 4 values...almost same code 4 times...
 126       // get annot start index
 127       space_split_i = file_data_line.find(" ");
 128       annot_value = file_data_line.substr(0,space_split_i);
 129       an_annot.start = atoi (annot_value.c_str());
 130       file_data_line = file_data_line.substr(space_split_i+1);
 131       // get annot end index
 132       space_split_i = file_data_line.find(" ");
 133       annot_value = file_data_line.substr(0,space_split_i);
 134       an_annot.end = atoi (annot_value.c_str());
 135       file_data_line = file_data_line.substr(space_split_i+1);
 136       // get annot start index
 137       space_split_i = file_data_line.find(" ");
 138       annot_value = file_data_line.substr(0,space_split_i);
 139       an_annot.name = annot_value;
 140       file_data_line = file_data_line.substr(space_split_i+1);
 141       // get annot start index
 142       space_split_i = file_data_line.find(" ");
 143       annot_value = file_data_line.substr(0,space_split_i);
 144       an_annot.type = annot_value;
 145       annots.push_back(an_annot);
 146     }
 147   }
 148
 149   data_file.close();
 150
 151   list<annot>::iterator list_i;
 152   for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
 153   {
 154     cout << (*list_i).start << "," << (*list_i).end << "\t";
 155     cout << (*list_i).name << "\t" << (*list_i).type << endl;
 156   }
 157 }
 158
 159
 160 int
 161 Sequence::len()
 162 {
 163   return sequence.length();
 164 }
 165
 166
 167 string
 168 Sequence::seq()
 169 {
 170   return sequence;
 171 }
 172
 173 const char *
 174 Sequence::c_seq()
 175 {
 176   return sequence.c_str();
 177 }
 178
 179 string
 180 Sequence::rev_comp()
 181 {
 182   string rev_comp;
 183   char conversionTable[257];
 184   int seq_i, table_i, len;
 185
 186   len = sequence.length();
 187   rev_comp.reserve(len);
 188   // make a conversion table
 189   // init all parts of conversion table to '~' character
 190   // '~' I doubt will ever appear in a sequence file (jeez, I hope)
 191   // and may the fleas of 1000 camels infest the genitals of any biologist (and
 192   // seven generations of their progeny) who decides to make it mean
 193   // something special!!!
 194   // PS - double the curse for any smartass non-biologist who tries it as well
 195   for(table_i=0; table_i < 256; table_i++)
 196   {
 197     conversionTable[table_i] = '~';
 198   }
 199   // add end of string character for printing out table for testing purposes
 200   conversionTable[256] = '\0';
 201
 202   // add in the characters for the bases we want to convert
 203   conversionTable['A'] = 'T';
 204   conversionTable['T'] = 'A';
 205   conversionTable['G'] = 'C';
 206   conversionTable['C'] = 'G';
 207   conversionTable['N'] = 'N';
 208
 209   // finally, the actual conversion loop
 210   for(seq_i = len - 1; seq_i >= 0; seq_i--)
 211   {
 212     table_i = (int) sequence[seq_i];
 213     rev_comp += conversionTable[table_i];
 214   }
 215
 216   return rev_comp;
 217 }
 218
 219
 220 string
 221 Sequence::hdr()
 222 {
 223   return header;
 224 }
 225
 226
 227 void
 228 Sequence::set_seq(string a_seq)
 229 {
 230   sequence = a_seq;
 231 }
 232
 233
 234 /*
 235 string
 236 Sequence::species()
 237 {
 238   return species;
 239 }
 240 */
 241
 242 void
 243 Sequence::clear()
 244 {
 245   sequence = "";
 246   length = 0;
 247   header = "";
 248   species = "";
 249   species_num = -4;
 250   annots.clear();
 251 }
 252
 253 void
 254 Sequence::save(fstream &save_file)
 255                //string save_file_path)
 256 {
 257   //fstream save_file;
 258   list<annot>::iterator annots_i;
 259   int i;
 260
 261   // not sure why, or if i'm doing something wrong, but can't seem to pass
 262   // file pointers down to this method from the mussa control class
 263   // so each call to save a sequence appends to the file started by mussa_class
 264   //save_file.open(save_file_path.c_str(), ios::app);
 265
 266   save_file << "<Sequence>" << endl;
 267   save_file << sequence << endl;
 268   save_file << "</Sequence>" << endl;
 269
 270   save_file << "<Annotations>" << endl;
 271   for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
 272   {
 273     save_file << annots_i->start << " " << annots_i->end << " " ;
 274     save_file << annots_i->name << " " << annots_i->type << endl;
 275   }
 276   save_file << "</Annotations>" << endl;
 277   //save_file.close();
 278 }
 279
 280 void
 281 Sequence::load_museq(string load_file_path, int seq_num)
 282 {
 283   fstream load_file;
 284   string file_data_line;
 285   int seq_counter;
 286   annot an_annot;
 287   int space_split_i;
 288   string annot_value;
 289
 290   annots.clear();
 291   load_file.open(load_file_path.c_str(), ios::in);
 292
 293   seq_counter = 0;
 294   // search for the seq_num-th sequence
 295   while ( (!load_file.eof()) && (seq_counter < seq_num) )
 296   {
 297     getline(load_file,file_data_line);
 298     if (file_data_line == "<Sequence>")
 299       seq_counter++;
 300   }
 301   getline(load_file, file_data_line);
 302   //cout << "*fee\n";
 303   sequence = file_data_line;
 304   //cout << "*fie\n";
 305   getline(load_file, file_data_line);
 306   getline(load_file, file_data_line);
 307   if (file_data_line == "<Annotations>")
 308   {
 309     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
 310     {
 311       //cout << "*foe\n";
 312       getline(load_file,file_data_line);
 313       if ((file_data_line != "") && (file_data_line != "</Annotations>"))
 314       {
 315         //cout << "*fum\n";
 316         // need to get 4 values...almost same code 4 times...
 317         // get annot start index
 318         space_split_i = file_data_line.find(" ");
 319         annot_value = file_data_line.substr(0,space_split_i);
 320         an_annot.start = atoi (annot_value.c_str());
 321         file_data_line = file_data_line.substr(space_split_i+1);
 322         // get annot end index
 323         space_split_i = file_data_line.find(" ");
 324         annot_value = file_data_line.substr(0,space_split_i);
 325         an_annot.end = atoi (annot_value.c_str());
 326         file_data_line = file_data_line.substr(space_split_i+1);
 327         // get annot start index
 328         space_split_i = file_data_line.find(" ");
 329         annot_value = file_data_line.substr(0,space_split_i);
 330         an_annot.name = annot_value;
 331         file_data_line = file_data_line.substr(space_split_i+1);
 332         // get annot start index
 333         space_split_i = file_data_line.find(" ");
 334         annot_value = file_data_line.substr(0,space_split_i);
 335         an_annot.type = annot_value;
 336         annots.push_back(an_annot);
 337       }
 338     }
 339   }
 340   load_file.close();
 341 }
 342
 343
 344 list<int>
 345 Sequence::find_motif(string a_motif)
 346 {
 347   char * seq_c;
 348   int seq_i, motif_i, motif_len;
 349   list<int> motif_match_starts;
 350
 351   motif_match_starts.clear();
 352   // faster to loop thru the sequence as a old c string (ie char array)
 353   seq_c = (char*)sequence.c_str();
 354   motif_len = a_motif.length();
 355
 356
 357   for (seq_i = 0; seq_i < length; seq_i++)
 358   {
 359
 360     // this is pretty much a straight translation of Nora's python code
 361     // to match iupac letter codes
 362     if (a_motif[motif_i] == seq_c[seq_i])
 363       motif_i++;
 364     else if (seq_c[seq_i] =='N')
 365       motif_i++;
 366     else if ((a_motif[motif_i] =='M') &&
 367              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
 368       motif_i++;
 369     else if ((a_motif[motif_i] =='R') &&
 370              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G')))
 371       motif_i++;
 372     else if ((a_motif[motif_i] =='W') &&
 373              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T')))
 374       motif_i++;
 375     else if ((a_motif[motif_i] =='S') &&
 376              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G')))
 377       motif_i++;
 378     else if ((a_motif[motif_i] =='Y') &&
 379              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T')))
 380       motif_i++;
 381     else if ((a_motif[motif_i] =='K') &&
 382              ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T')))
 383       motif_i++;
 384     else if ((a_motif[motif_i] =='V') &&
 385              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 386               (seq_c[seq_i]=='G')))
 387       motif_i++;
 388     else if ((a_motif[seq_i] =='H') &&
 389              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') ||
 390               (seq_c[seq_i]=='T')))
 391       motif_i++;
 392     else if ((a_motif[motif_i] =='D') &&
 393              ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') ||
 394               (seq_c[seq_i]=='T')))
 395       motif_i++;
 396     else if ((a_motif[motif_i] =='B') &&
 397              ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
 398               (seq_c[seq_i]=='T')))
 399       motif_i++;
 400     else
 401       motif_i = 0;
 402
 403     // end Nora stuff, now we see if a match is found this pass
 404     if (motif_i == motif_len)
 405     {
 406       motif_match_starts.push_back(seq_i - motif_len + 1);
 407       motif_i = 0;
 408     }
 409   }
 410
 411   return motif_match_starts;
 412 }