[project @ 13]

[mussa.git] / sequence.cc
diff --git a/sequence.cc b/sequence.cc

index 3a25656c8ba099ccf93d3abcfdd9732fa1df4a4a..4351f8915bf528a9c350b11f0f343fe0ef30f51e 100644 (file)
--- a/sequence.cc
+++ b/sequence.cc
@@ -1,3 +1,23 @@
+//  This file is part of the Mussa source distribution.
+//  http://mussa.caltech.edu/
+//  Contact author: Tristan  De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
+//  This file is part of the Mussa source distribution.
+//  http://mussa.caltech.edu/
+//  Contact author: Tristan  De Buysscher, tristan@caltech.edu
+
+// This program and all associated source code files are Copyright (C) 2005
+// the California Institute of Technology, Pasadena, CA, 91125 USA.  It is
+// under the GNU Public License; please see the included LICENSE.txt
+// file for more information, or contact Tristan directly.
+
+
  //                        ----------------------------------------
  //                           ---------- sequence.cc -----------
  //                        ----------------------------------------
@@ -10,7 +30,7 @@ Sequence::Sequence()
  }
  
  
-void
+string
  Sequence::load_fasta(string file_path, int seq_num, 
                       int start_index, int end_index)
  {
@@ -23,78 +43,89 @@ Sequence::load_fasta(string file_path, int seq_num,
    int seq_i, table_i, len;
    string sequence_raw;
    char * seq_tmp;             // holds sequence during basic filtering
-
+  string err_msg;
  
    data_file.open(file_path.c_str(), ios::in);
  
-
-  // search for the header of the fasta sequence we want
-  while ( (!data_file.eof()) && (header_counter < seq_num) )
+  if (!data_file)
    {
-    getline(data_file,file_data_line);
-    if (file_data_line.substr(0,1) == ">")
-      header_counter++;
+    err_msg = "Sequence File: " + file_path + " not found"; 
+    return err_msg;
    }
+  // if file opened okay, read it
+  else
+  {
+    // search for the header of the fasta sequence we want
+    while ( (!data_file.eof()) && (header_counter < seq_num) )
+    {
+      getline(data_file,file_data_line);
+      if (file_data_line.substr(0,1) == ">")
+        header_counter++;
+    }
  
-  header = file_data_line.substr(1);
-
-  sequence_raw = "";
+    header = file_data_line.substr(1);
  
-  while ( !data_file.eof() && read_seq )
-  {
-    getline(data_file,file_data_line);
-    if (file_data_line.substr(0,1) == ">")
-      read_seq = false;
-    else sequence_raw += file_data_line;
-  }
+    sequence_raw = "";
  
-  data_file.close();
+    while ( !data_file.eof() && read_seq )
+    {
+      getline(data_file,file_data_line);
+      if (file_data_line.substr(0,1) == ">")
+        read_seq = false;
+      else sequence_raw += file_data_line;
+    }
  
-  // sequence filtering for upcasing agctn and convert non AGCTN to N
+    data_file.close();
  
-  len = sequence_raw.length();
-  seq_tmp = (char*)sequence_raw.c_str();
+    // sequence filtering for upcasing agctn and convert non AGCTN to N
  
-  sequence = "";
+    len = sequence_raw.length();
+    seq_tmp = (char*)sequence_raw.c_str();
  
-  // Make a conversion table
+    sequence = "";
  
-  // everything we don't specify below will become 'N'
-  for(table_i=0; table_i < 256; table_i++)
-  {
-    conversionTable[table_i] = 'N';
-  }
-  // add end of string character for printing out table for testing purposes
-  conversionTable[256] = '\0';
+    // Make a conversion table
  
-  // we want these to map to themselves - ie not to change
-  conversionTable['A'] = 'A';
-  conversionTable['T'] = 'T';
-  conversionTable['G'] = 'G';
-  conversionTable['C'] = 'C';
-  // this is to upcase
-  conversionTable['a'] = 'A';
-  conversionTable['t'] = 'T';
-  conversionTable['g'] = 'G';
-  conversionTable['c'] = 'C';
+    // everything we don't specify below will become 'N'
+    for(table_i=0; table_i < 256; table_i++)
+    {
+      conversionTable[table_i] = 'N';
+    }
+    // add end of string character for printing out table for testing purposes
+    conversionTable[256] = '\0';
+
+    // we want these to map to themselves - ie not to change
+    conversionTable['A'] = 'A';
+    conversionTable['T'] = 'T';
+    conversionTable['G'] = 'G';
+    conversionTable['C'] = 'C';
+    // this is to upcase
+    conversionTable['a'] = 'A';
+    conversionTable['t'] = 'T';
+    conversionTable['g'] = 'G';
+    conversionTable['c'] = 'C';
+
+    // finally, the actual conversion loop
+    for(seq_i = 0; seq_i < len; seq_i++)
+    {
+      table_i = (int) seq_tmp[seq_i];
+      sequence += conversionTable[table_i];
+    }
  
-  // finally, the actual conversion loop
-  for(seq_i = 0; seq_i < len; seq_i++)
-  {
-    table_i = (int) seq_tmp[seq_i];
-    sequence += conversionTable[table_i];
-  }
  
+    // Lastly, if subselection of the sequence was specified we keep cut out
+    // and only keep that part
  
-  // Lastly, if subselection of the sequence was specified we keep cut out
-  // and only keep that part
+    // end_index = 0 means no end was specified, so cut to the end 
+    if (end_index == 0)
+      end_index = len;
  
-  // end_index = 0 means no end was specified, so cut to the end 
-  if (end_index == 0)
-    end_index = len;
+    // start_index = 0 if no start was specified
+    sequence = sequence.substr(start_index, end_index - start_index);
+    length = sequence.length();
  
-  // start_index = 0 if no start was specified
-  sequence = sequence.substr(start_index, end_index - start_index);
+    return "";
+  }
  }
  
  
@@ -102,57 +133,105 @@ Sequence::load_fasta(string file_path, int seq_num,
    //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper);
  
  
-void
-Sequence::load_annot(string file_path)
+string
+Sequence::load_annot(string file_path, int start_index, int end_index)
  {
    fstream data_file;
    string file_data_line;
    annot an_annot;
    int space_split_i;
    string annot_value;
+  list<annot>::iterator list_i;
+  string err_msg;
+
  
    annots.clear();
    data_file.open(file_path.c_str(), ios::in);
  
-  getline(data_file,file_data_line);
-  species = file_data_line;
-
-  while ( !data_file.eof() )
+  if (!data_file)
+  {
+    err_msg = "Sequence File: " + file_path + " not found"; 
+    return err_msg;
+  }
+  // if file opened okay, read it
+  else
    {
      getline(data_file,file_data_line);
-    if (file_data_line != "")
+    species = file_data_line;
+
+    // end_index = 0 means no end was specified, so cut to the end 
+    if (end_index == 0)
+      end_index = length;
+
+    //cout << "START: " << start_index << " END: " << end_index << endl;
+
+    while ( !data_file.eof() )
      {
-      // need to get 4 values...almost same code 4 times...
-      // get annot start index
-      space_split_i = file_data_line.find(" ");
-      annot_value = file_data_line.substr(0,space_split_i);
-      an_annot.start = atoi (annot_value.c_str());
-      file_data_line = file_data_line.substr(space_split_i+1);
-      // get annot end index
-      space_split_i = file_data_line.find(" ");
-      annot_value = file_data_line.substr(0,space_split_i);
-      an_annot.end = atoi (annot_value.c_str());
-      file_data_line = file_data_line.substr(space_split_i+1);
-      // get annot start index
-      space_split_i = file_data_line.find(" ");
-      annot_value = file_data_line.substr(0,space_split_i);
-      an_annot.name = annot_value;
-      file_data_line = file_data_line.substr(space_split_i+1);
-      // get annot start index
-      space_split_i = file_data_line.find(" ");
-      annot_value = file_data_line.substr(0,space_split_i);
-      an_annot.type = annot_value;
-      annots.push_back(an_annot);
-    }
-  }
+      getline(data_file,file_data_line);
+      if (file_data_line != "")
+      {
+        // need to get 4 values...almost same code 4 times...
+        // get annot start index
+        space_split_i = file_data_line.find(" ");
+        annot_value = file_data_line.substr(0,space_split_i);
+        an_annot.start = atoi (annot_value.c_str());
+        file_data_line = file_data_line.substr(space_split_i+1);
+        // get annot end index
+        space_split_i = file_data_line.find(" ");
+        annot_value = file_data_line.substr(0,space_split_i);
+        an_annot.end = atoi (annot_value.c_str());
+        file_data_line = file_data_line.substr(space_split_i+1);
  
-  data_file.close();
+        cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+            << endl;
  
-  list<annot>::iterator list_i;
-  for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
-  {
-    cout << (*list_i).start << "," << (*list_i).end << "\t";
-    cout << (*list_i).name << "\t" << (*list_i).type << endl;
+        // get annot name
+        space_split_i = file_data_line.find(" ");
+       if (space_split_i == string::npos)  // no entries for name & type
+       {
+         cout << "seq, annots - no name or type\n";
+         an_annot.name = "";
+         an_annot.type = "";
+       }
+       else
+       {
+         annot_value = file_data_line.substr(0,space_split_i);
+         an_annot.name = annot_value;
+         file_data_line = file_data_line.substr(space_split_i+1);
+          // get annot type
+         space_split_i = file_data_line.find(" ");
+         if (space_split_i == string::npos)  // no entry for type
+           an_annot.type = "";
+         else
+         {
+           annot_value = file_data_line.substr(0,space_split_i);
+           an_annot.type = annot_value;
+         }
+       }
+
+
+        // add annot to list if it falls within the range of sequence specified
+        if ((start_index <= an_annot.start) && (end_index >= an_annot.end)) 
+        {
+          an_annot.start -= start_index;
+          an_annot.end -= start_index;
+          annots.push_back(an_annot);
+        }
+        else
+          cout << "FAILED!!!!!!\n";
+      }
+    }
+
+    data_file.close();
+    /*
+    // debugging check
+    for(list_i = annots.begin(); list_i != annots.end(); ++list_i)
+    {
+      cout << (*list_i).start << "," << (*list_i).end << "\t";
+      cout << (*list_i).name << "\t" << (*list_i).type << endl;
+    }
+    */
+    return "";
    }
  }
  
@@ -170,6 +249,14 @@ Sequence::seq()
    return sequence;
  }
  
+
+string
+Sequence::subseq(int start, int end)
+{
+  return sequence.substr(start, end);
+}
+
+
  const char *
  Sequence::c_seq()
  {
@@ -223,11 +310,18 @@ Sequence::hdr()
    return header;
  }
  
+string 
+Sequence::sp_name()
+{
+  return species;
+}
+
  
  void
  Sequence::set_seq(string a_seq)
  {
    sequence = a_seq;
+  length = sequence.length();
  }
  
  
@@ -268,6 +362,7 @@ Sequence::save(fstream &save_file)
    save_file << "</Sequence>" << endl;
  
    save_file << "<Annotations>" << endl;
+  save_file << species << endl;
    for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i)
    {
      save_file << annots_i->start << " " << annots_i->end << " " ;
@@ -299,20 +394,19 @@ Sequence::load_museq(string load_file_path, int seq_num)
        seq_counter++;
    }
    getline(load_file, file_data_line);
-  //cout << "*fee\n";
    sequence = file_data_line;
-  //cout << "*fie\n";
+  length = sequence.length();
    getline(load_file, file_data_line);
    getline(load_file, file_data_line);
    if (file_data_line == "<Annotations>")
    {
+    getline(load_file, file_data_line);
+    species = file_data_line;
      while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
      {
-      //cout << "*foe\n";
        getline(load_file,file_data_line);
        if ((file_data_line != "") && (file_data_line != "</Annotations>"))  
        {
-        //cout << "*fum\n";
          // need to get 4 values...almost same code 4 times...
          // get annot start index
          space_split_i = file_data_line.find(" ");
@@ -323,45 +417,199 @@ Sequence::load_museq(string load_file_path, int seq_num)
          space_split_i = file_data_line.find(" ");
          annot_value = file_data_line.substr(0,space_split_i);
          an_annot.end = atoi (annot_value.c_str());
-        file_data_line = file_data_line.substr(space_split_i+1);
-        // get annot start index
-        space_split_i = file_data_line.find(" ");
-        annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.name = annot_value;
-        file_data_line = file_data_line.substr(space_split_i+1);
-        // get annot start index
-        space_split_i = file_data_line.find(" ");
-        annot_value = file_data_line.substr(0,space_split_i);
-        an_annot.type = annot_value;
-        annots.push_back(an_annot);
+
+       if (space_split_i == string::npos)  // no entry for type or name
+       {
+         cout << "seq, annots - no type or name\n";
+         an_annot.type = "";
+         an_annot.name = "";
+       }
+       else   // else get annot type
+       {
+         file_data_line = file_data_line.substr(space_split_i+1);
+         space_split_i = file_data_line.find(" ");
+         annot_value = file_data_line.substr(0,space_split_i);
+         an_annot.type = annot_value;
+         if (space_split_i == string::npos)  // no entry for name
+         {
+           cout << "seq, annots - no name\n";
+           an_annot.name = "";
+         }
+         else          // get annot name
+         {
+           file_data_line = file_data_line.substr(space_split_i+1);
+           space_split_i = file_data_line.find(" ");
+           annot_value = file_data_line.substr(0,space_split_i);
+           an_annot.type = annot_value;
+         }
+       }
+       annots.push_back(an_annot);  // don't forget to actually add the annot
        }
+      cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+          << "-->" << an_annot.type << "::" << an_annot.name << endl;
      }
    }
    load_file.close();
  }
  
  
-list<int>
+string
+Sequence::rc_motif(string a_motif)
+{
+  string rev_comp;
+  char conversionTable[257];
+  int seq_i, table_i, len;
+
+  len = a_motif.length();
+  rev_comp.reserve(len);
+
+  for(table_i=0; table_i < 256; table_i++)
+  {
+    conversionTable[table_i] = '~';
+  }
+  // add end of string character for printing out table for testing purposes
+  conversionTable[256] = '\0';
+
+  // add in the characters for the bases we want to convert (IUPAC)
+  conversionTable['A'] = 'T';
+  conversionTable['T'] = 'A';
+  conversionTable['G'] = 'C';
+  conversionTable['C'] = 'G';
+  conversionTable['N'] = 'N';
+  conversionTable['M'] = 'K';
+  conversionTable['R'] = 'Y';
+  conversionTable['W'] = 'W';
+  conversionTable['S'] = 'S';
+  conversionTable['Y'] = 'R';
+  conversionTable['K'] = 'M';
+  conversionTable['V'] = 'B';
+  conversionTable['H'] = 'D';
+  conversionTable['D'] = 'H';
+  conversionTable['B'] = 'V';
+
+  // finally, the actual conversion loop
+  for(seq_i = len - 1; seq_i >= 0; seq_i--)
+  {
+    //cout << "** i = " << seq_i << " bp = " << 
+    table_i = (int) a_motif[seq_i];
+    rev_comp += conversionTable[table_i];
+  }
+
+  cout << "seq: " << a_motif << endl;
+  cout << "rc:  " << rev_comp << endl;
+
+  return rev_comp;
+}
+
+string
+Sequence::motif_validate(string a_motif)
+{
+  string valid_motif;
+  int seq_i, len;
+
+
+  len = a_motif.length();
+  valid_motif.reserve(len);
+
+  // this just upcases IUPAC symbols.  Eventually should return an error if non IUPAC is present.
+  // current nonIUPAC symbols are omitted, which is not reported atm
+  for(seq_i = 0; seq_i < len; seq_i++)
+  {
+     if ((a_motif[seq_i] == 'a') || (a_motif[seq_i] == 'A'))
+      valid_motif += 'A';
+    else if ((a_motif[seq_i] == 't') || (a_motif[seq_i] == 'T'))
+      valid_motif += 'T';
+    else if ((a_motif[seq_i] == 'g') || (a_motif[seq_i] == 'G'))
+      valid_motif += 'G';
+    else if ((a_motif[seq_i] == 'c') || (a_motif[seq_i] == 'C'))
+      valid_motif += 'C';
+    else if ((a_motif[seq_i] == 'n') || (a_motif[seq_i] == 'N'))
+      valid_motif += 'N';
+    else if ((a_motif[seq_i] == 'm') || (a_motif[seq_i] == 'M'))
+      valid_motif += 'M';
+    else if ((a_motif[seq_i] == 'r') || (a_motif[seq_i] == 'R'))
+      valid_motif += 'R';
+    else if ((a_motif[seq_i] == 'w') || (a_motif[seq_i] == 'W'))
+      valid_motif += 'W';
+    else if ((a_motif[seq_i] == 's') || (a_motif[seq_i] == 'S'))
+      valid_motif += 'S';
+    else if ((a_motif[seq_i] == 'y') || (a_motif[seq_i] == 'Y'))
+      valid_motif += 'Y';
+    else if ((a_motif[seq_i] == 'k') || (a_motif[seq_i] == 'K'))
+      valid_motif += 'G';
+    else if ((a_motif[seq_i] == 'v') || (a_motif[seq_i] == 'V'))
+      valid_motif += 'V';
+    else if ((a_motif[seq_i] == 'h') || (a_motif[seq_i] == 'H'))
+      valid_motif += 'H';
+    else if ((a_motif[seq_i] == 'd') || (a_motif[seq_i] == 'D'))
+      valid_motif += 'D';
+    else if ((a_motif[seq_i] == 'b') || (a_motif[seq_i] == 'B'))
+      valid_motif += 'B';
+   }
+
+  cout << "valid_motif is: " << valid_motif << endl;
+ 
+  return valid_motif;
+}
+
+
+vector<int>
  Sequence::find_motif(string a_motif)
+{
+  vector<int> motif_match_starts;
+  string a_motif_rc;
+
+
+  motif_match_starts.clear();
+
+  cout << "motif is: " << a_motif << endl;
+  a_motif = motif_validate(a_motif);
+  //cout << "motif is: " << a_motif << endl;
+
+
+  if (a_motif != "")
+  {
+    cout << "Sequence: none blank motif\n";
+    motif_scan(a_motif, &motif_match_starts);
+
+    a_motif_rc = rc_motif(a_motif);
+    // make sure not to do search again if it is a palindrome
+    if (a_motif_rc != a_motif)
+      motif_scan(a_motif_rc, &motif_match_starts);
+  }
+  return motif_match_starts;
+}
+
+void
+Sequence::motif_scan(string a_motif, vector<int> * motif_match_starts)
  {
    char * seq_c;
    int seq_i, motif_i, motif_len;
-  list<int> motif_match_starts;
  
-  motif_match_starts.clear();
    // faster to loop thru the sequence as a old c string (ie char array)
    seq_c = (char*)sequence.c_str();
+  //cout << "Sequence: motif, seq len = " << sequence.length() << endl; 
    motif_len = a_motif.length();
  
-  
-  for (seq_i = 0; seq_i < length; seq_i++)
-  {
+  //cout << "motif_length: " << motif_len << endl;
+  //cout << "RAAARRRRR\n";
+
+  motif_i = 0;
  
+  //cout << "motif: " << a_motif << endl;
+
+  //cout << "Sequence: motif, length= " << length << endl;
+  seq_i = 0;
+  while (seq_i < length)
+  {
+    cout << seq_c[seq_i];
+    //if ((seq_i > 10885) && (seq_i < 10917))
+    //cout << seq_c[seq_i] << "?" << a_motif[motif_i] << ":" << motif_i << " ";
      // this is pretty much a straight translation of Nora's python code
      // to match iupac letter codes
-    if (a_motif[motif_i] == seq_c[seq_i])
+    if (a_motif[motif_i] =='N')
        motif_i++;
-    else if (seq_c[seq_i] =='N')
+    else if (a_motif[motif_i] == seq_c[seq_i])
        motif_i++;
      else if ((a_motif[motif_i] =='M') && 
               ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C')))
@@ -397,16 +645,34 @@ Sequence::find_motif(string a_motif)
               ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') ||
                (seq_c[seq_i]=='T')))
        motif_i++;
+
      else
+    {
+      seq_i -= motif_i;
        motif_i = 0;
+    }
  
      // end Nora stuff, now we see if a match is found this pass
      if (motif_i == motif_len)
      {
-      motif_match_starts.push_back(seq_i - motif_len + 1);
+      //cout << "!!";
+      motif_match_starts->push_back(seq_i - motif_len + 1);
        motif_i = 0;
      }
-  }
  
-  return motif_match_starts;
+    seq_i++;
+  }
+  cout << endl;
  }
+
+/*
+        // get annot start index
+        space_split_i = file_data_line.find(" ");
+        annot_value = file_data_line.substr(0,space_split_i);
+        an_annot.name = annot_value;
+        file_data_line = file_data_line.substr(space_split_i+1);
+        // get annot start index
+        space_split_i = file_data_line.find(" ");
+        annot_value = file_data_line.substr(0,space_split_i);
+        an_annot.type = annot_value;
+*/