Load N sequences in Mussa::load
author Brandon King <kingb@caltech.edu>
Wed, 18 Apr 2007 00:01:36 +0000 (00:01 +0000)
committer Brandon King <kingb@caltech.edu>
Wed, 18 Apr 2007 00:01:36 +0000 (00:01 +0000)
Fix for ticket:240.

Mussa would not load the sequences because the .muway file would
declare that 0 sequences existed. Since the Mussa::load code needed to
know the number of sequences in order to load them, it failed.

The new version of the load sequence code reads the sequences in
until there are no longer any sequences to read. At that point, the
number of sequences is the_seqs.size() and therefore is no longer
dependent on the muway file.

alg/mussa.cpp
alg/sequence.cpp
alg/sequence.hpp

index f75290c58705c68ea8e7ccb152da253fde976cbb..82b1d4a6585e49c1a946e9093413b0be357ab570 100644 (file)
@@ -639,6 +639,10 @@ Mussa::load(fs::path ana_file)
   vector<FLPs> empty_FLP_vector;
   FLPs dummy_comp;
 
+
+  //--------------------------------------------------------
+  // Load Muway
+  //--------------------------------------------------------
   analysis_path = ana_file;
   analysis_name = ana_path.leaf();
   fs::path muway(analysis_name+".muway", fs::native);
@@ -650,24 +654,63 @@ Mussa::load(fs::path ana_file)
   threshold = the_paths.get_threshold();
   soft_thres = threshold;
 
-  int seq_num = the_paths.sequence_count();
+
+  //--------------------------------------------------------
+  // Load Sequence
+  //--------------------------------------------------------
+  //int seq_num = the_paths.sequence_count();
 
   fs::path museq(analysis_name + ".museq", fs::native);
   a_file_path = analysis_path / museq;
 
   // this is a bit of a hack due to C++ not acting like it should with files
-  for (i = 1; i <= seq_num; i++)
+  /*for (i = 1; i <= seq_num; i++)
   {
     boost::shared_ptr<Sequence> tmp_seq(new Sequence);
     tmp_seq->load_museq(a_file_path, i);
     the_seqs.push_back(tmp_seq);
+  }*/
+  
+  i = 1;
+  //int seq_num = 0;
+  boost::filesystem::fstream load_museq_fs;
+  load_museq_fs.open(a_file_path, std::ios::in);
+  boost::shared_ptr<Sequence> tmp_seq;
+  while (1)
+  {
+    tmp_seq = Sequence::load_museq(load_museq_fs);
+    
+    if (tmp_seq)
+    {
+      the_seqs.push_back(tmp_seq);
+    }
+    else
+    {
+      break;
+    }
+    
+    
+    //safeguard in case of an infinite loop.
+    //FIXME: If mussa can handle a comparison of 10000 sequences
+    // in the future, then this code should be fixed.
+    if (i == 10000)
+    {
+      throw mussa_load_error(" Run away sequence load!");
+    }
+    i++;
   }
+  load_museq_fs.close();
   
+  //--------------------------------------------------------
+  // Load Motifs
+  //--------------------------------------------------------
   fs::path mtl(analysis_name + ".mtl", fs::native);
   fs::path motif_file = analysis_path / mtl;
   if (fs::exists(motif_file)) {
     load_motifs(motif_file);
   }
+  
+  vector<Sequence>::size_type seq_num = the_seqs.size();
   empty_FLP_vector.clear();
   for(i = 0; i < seq_num; i++)
   {
@@ -676,6 +719,7 @@ Mussa::load(fs::path ana_file)
       all_comps[i].push_back(dummy_comp);
   }
   
+  
   for(i = 0; i < seq_num; i++)
   {
     for(i2 = i+1; i2 < seq_num; i2++)
index 05ec0a928dfc63390421f87927444b4179332063..2e845c67bdfaa84d5843e0b05a5e3a766e363eaf 100644 (file)
@@ -710,10 +710,97 @@ Sequence::save(fs::fstream &save_file)
   //save_file.close();
 }
 
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+//  fs::fstream load_file;
+//  std::string file_data_line;
+//  int seq_counter;
+//  //annot an_annot;
+//  int annot_begin;
+//  int annot_end;
+//  std::string annot_name;
+//  std::string annot_type;
+//  
+//  std::string::size_type space_split_i;
+//  std::string annot_value;
+//
+//  annotation_list.reset(new SeqSpanRefList);
+//  
+//  load_file.open(load_file_path, std::ios::in);
+//
+//  seq_counter = 0;
+//  // search for the seq_num-th sequence 
+//  while ( (!load_file.eof()) && (seq_counter < seq_num) )
+//  {
+//    getline(load_file,file_data_line);
+//    if (file_data_line == "<Sequence>")
+//      seq_counter++;
+//  }
+//  getline(load_file, file_data_line);
+//  // looks like the sequence is written as a single line
+//  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+//  getline(load_file, file_data_line);
+//  getline(load_file, file_data_line);
+//  if (file_data_line == "<Annotations>")
+//  {
+//    getline(load_file, file_data_line);
+//    species = file_data_line;
+//    while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
+//    {
+//      getline(load_file,file_data_line);
+//      if ((file_data_line != "") && (file_data_line != "</Annotations>"))  
+//      {
+//        // need to get 4 values...almost same code 4 times...
+//        // get annot start index
+//        space_split_i = file_data_line.find(" ");
+//        annot_value = file_data_line.substr(0,space_split_i);
+//        annot_begin = atoi (annot_value.c_str());
+//        file_data_line = file_data_line.substr(space_split_i+1);
+//        // get annot end index
+//        space_split_i = file_data_line.find(" ");
+//        annot_value = file_data_line.substr(0,space_split_i);
+//        annot_end = atoi (annot_value.c_str());
+//
+//        if (space_split_i == std::string::npos)  // no entry for type or name
+//        {
+//          std::cout << "seq, annots - no type or name\n";
+//          annot_name = "";
+//          annot_type = "";
+//        }
+//        else   // else get annot type
+//        {
+//          file_data_line = file_data_line.substr(space_split_i+1);
+//          space_split_i = file_data_line.find(" ");
+//          annot_value = file_data_line.substr(0,space_split_i);
+//          //an_annot.type = annot_value;
+//          annot_type = annot_value;
+//          if (space_split_i == std::string::npos)  // no entry for name
+//          {
+//            std::cout << "seq, annots - no name\n";
+//            annot_name = "";
+//          }
+//          else          // get annot name
+//          {
+//            file_data_line = file_data_line.substr(space_split_i+1);
+//            space_split_i = file_data_line.find(" ");
+//            annot_value = file_data_line.substr(0,space_split_i);
+//            // this seems like its wrong?
+//            annot_type = annot_value;
+//          }
+//        }
+//        add_annotation(annot_name, annot_type, annot_begin, annot_end);
+//      }
+//      //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+//      //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+//    }
+//  }
+//  load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
 {
-  fs::fstream load_file;
+  boost::shared_ptr<Sequence> seq(new Sequence);
   std::string file_data_line;
   int seq_counter;
   //annot an_annot;
@@ -725,27 +812,34 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
   std::string::size_type space_split_i;
   std::string annot_value;
 
-  annotation_list.reset(new SeqSpanRefList);
-  
-  load_file.open(load_file_path, std::ios::in);
+  //seq->annotation_list.reset(new SeqSpanRefList);
 
   seq_counter = 0;
-  // search for the seq_num-th sequence 
+  // search for the next sequence
+  int seq_num = 1;
   while ( (!load_file.eof()) && (seq_counter < seq_num) )
   {
     getline(load_file,file_data_line);
     if (file_data_line == "<Sequence>")
       seq_counter++;
   }
+  
+  // Could not find next sequence
+  if (load_file.eof())
+  {
+    seq.reset();
+    return seq;
+  }
+  
   getline(load_file, file_data_line);
   // looks like the sequence is written as a single line
-  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+  seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
   getline(load_file, file_data_line);
   getline(load_file, file_data_line);
   if (file_data_line == "<Annotations>")
   {
     getline(load_file, file_data_line);
-    species = file_data_line;
+    seq->set_species(file_data_line);
     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
     {
       getline(load_file,file_data_line);
@@ -789,13 +883,14 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
             annot_type = annot_value;
           }
         }
-        add_annotation(annot_name, annot_type, annot_begin, annot_end);
+        seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
       }
       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
     }
   }
-  load_file.close();
+  //load_file.close();
+  return seq;
 }
 
 
index 17149415c622c14899c2a6b8f8959c01db36906d..1a4c9eb7a1e6d93079d6a98179efa24469fe6450 100644 (file)
@@ -231,7 +231,8 @@ public:
   SeqSpanRef seqspan() { return seq; }
   
   void save(boost::filesystem::fstream &save_file);
-  void load_museq(boost::filesystem::path load_file_path, int seq_num);
+  //void load_museq(boost::filesystem::path load_file_path, int seq_num);
+  static SequenceRef load_museq(boost::filesystem::fstream& load_file);
   
 protected:  
   SeqSpanRef seq;