test annotating a sequence with fasta records in a stream
[mussa.git] / alg / sequence.cpp
index 05ec0a928dfc63390421f87927444b4179332063..521496d3497b2b96d4786819438d170f6ac50ce4 100644 (file)
@@ -32,6 +32,7 @@ namespace fs = boost::filesystem;
 namespace spirit = boost::spirit;
 
 #include "alg/sequence.hpp"
+#include "io.hpp"
 #include "mussa_exceptions.hpp"
 
 #include <string>
@@ -162,22 +163,6 @@ Sequence &Sequence::operator=(const Sequence& s)
   return *this;
 }
 
-static void multiplatform_getline(std::istream& in, std::string& line)
-{
-  line.clear();
-  char c;
-  in.get(c);
-  while(in.good() and !(c == '\012' or c == '\015') ) {
-    line.push_back(c);
-    in.get(c);
-  }
-  // if we have cr-lf eat it
-  c = in.peek();
-  if (c=='\012' or c == '\015') {
-    in.get();
-  }
-}
-
 void Sequence::load_fasta(fs::path file_path, int seq_num, int start_index, int end_index)
 {
   load_fasta(file_path, reduced_nucleic_alphabet, seq_num, start_index, end_index);
@@ -339,6 +324,21 @@ Sequence::load_annot(fs::path file_path, int start_index, int end_index)
     throw mussa_load_error("Error loading annotation file " + file_path.string());
   }
 
+  try {  
+    load_annot(data_stream, start_index, end_index);
+  }  catch(annotation_load_error e) {
+    std::ostringstream msg;
+    msg << file_path.native_file_string()
+        << " "
+        << e.what();
+    throw annotation_load_error(msg.str());
+  }
+  data_stream.close();
+}
+
+void
+Sequence::load_annot(std::istream& data_stream, int start_index, int end_index)
+{
   // so i should probably be passing the parse function some iterators
   // but the annotations files are (currently) small, so i think i can 
   // get away with loading the whole file into memory
@@ -348,17 +348,8 @@ Sequence::load_annot(fs::path file_path, int start_index, int end_index)
     data_stream.get(c);
     data.push_back(c);
   }
-  data_stream.close();
 
-  try {  
-    parse_annot(data, start_index, end_index);
-  } catch(annotation_load_error e) {
-    std::ostringstream msg;
-    msg << file_path.native_file_string()
-        << " "
-        << e.what();
-    throw annotation_load_error(msg.str());
-  }
+  parse_annot(data, start_index, end_index);
 }
 
 /* If this works, yikes, this is some brain hurting code.
@@ -710,10 +701,97 @@ Sequence::save(fs::fstream &save_file)
   //save_file.close();
 }
 
-void
-Sequence::load_museq(fs::path load_file_path, int seq_num)
+//void
+//Sequence::load_museq(fs::path load_file_path, int seq_num)
+//{
+//  fs::fstream load_file;
+//  std::string file_data_line;
+//  int seq_counter;
+//  //annot an_annot;
+//  int annot_begin;
+//  int annot_end;
+//  std::string annot_name;
+//  std::string annot_type;
+//  
+//  std::string::size_type space_split_i;
+//  std::string annot_value;
+//
+//  annotation_list.reset(new SeqSpanRefList);
+//  
+//  load_file.open(load_file_path, std::ios::in);
+//
+//  seq_counter = 0;
+//  // search for the seq_num-th sequence 
+//  while ( (!load_file.eof()) && (seq_counter < seq_num) )
+//  {
+//    getline(load_file,file_data_line);
+//    if (file_data_line == "<Sequence>")
+//      seq_counter++;
+//  }
+//  getline(load_file, file_data_line);
+//  // looks like the sequence is written as a single line
+//  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+//  getline(load_file, file_data_line);
+//  getline(load_file, file_data_line);
+//  if (file_data_line == "<Annotations>")
+//  {
+//    getline(load_file, file_data_line);
+//    species = file_data_line;
+//    while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
+//    {
+//      getline(load_file,file_data_line);
+//      if ((file_data_line != "") && (file_data_line != "</Annotations>"))  
+//      {
+//        // need to get 4 values...almost same code 4 times...
+//        // get annot start index
+//        space_split_i = file_data_line.find(" ");
+//        annot_value = file_data_line.substr(0,space_split_i);
+//        annot_begin = atoi (annot_value.c_str());
+//        file_data_line = file_data_line.substr(space_split_i+1);
+//        // get annot end index
+//        space_split_i = file_data_line.find(" ");
+//        annot_value = file_data_line.substr(0,space_split_i);
+//        annot_end = atoi (annot_value.c_str());
+//
+//        if (space_split_i == std::string::npos)  // no entry for type or name
+//        {
+//          std::cout << "seq, annots - no type or name\n";
+//          annot_name = "";
+//          annot_type = "";
+//        }
+//        else   // else get annot type
+//        {
+//          file_data_line = file_data_line.substr(space_split_i+1);
+//          space_split_i = file_data_line.find(" ");
+//          annot_value = file_data_line.substr(0,space_split_i);
+//          //an_annot.type = annot_value;
+//          annot_type = annot_value;
+//          if (space_split_i == std::string::npos)  // no entry for name
+//          {
+//            std::cout << "seq, annots - no name\n";
+//            annot_name = "";
+//          }
+//          else          // get annot name
+//          {
+//            file_data_line = file_data_line.substr(space_split_i+1);
+//            space_split_i = file_data_line.find(" ");
+//            annot_value = file_data_line.substr(0,space_split_i);
+//            // this seems like it's wrong? the name value is stored in annot_type and annot_name is never assigned in this branch
+//            annot_type = annot_value;
+//          }
+//        }
+//        add_annotation(annot_name, annot_type, annot_begin, annot_end);
+//      }
+//      //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
+//      //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
+//    }
+//  }
+//  load_file.close();
+//}
+
+SequenceRef Sequence::load_museq(boost::filesystem::fstream& load_file)
 {
-  fs::fstream load_file;
+  boost::shared_ptr<Sequence> seq(new Sequence);
   std::string file_data_line;
   int seq_counter;
   //annot an_annot;
@@ -725,27 +803,34 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
   std::string::size_type space_split_i;
   std::string annot_value;
 
-  annotation_list.reset(new SeqSpanRefList);
-  
-  load_file.open(load_file_path, std::ios::in);
+  //seq->annotation_list.reset(new SeqSpanRefList);
 
   seq_counter = 0;
-  // search for the seq_num-th sequence 
+  // search for the next sequence
+  int seq_num = 1;
   while ( (!load_file.eof()) && (seq_counter < seq_num) )
   {
     getline(load_file,file_data_line);
     if (file_data_line == "<Sequence>")
       seq_counter++;
   }
+  
+  // Could not find next sequence
+  if (load_file.eof())
+  {
+    seq.reset();
+    return seq;
+  }
+  
   getline(load_file, file_data_line);
   // looks like the sequence is written as a single line
-  set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
+  seq->set_filtered_sequence(file_data_line, reduced_dna_alphabet, 0, file_data_line.size(), SeqSpan::PlusStrand);
   getline(load_file, file_data_line);
   getline(load_file, file_data_line);
   if (file_data_line == "<Annotations>")
   {
     getline(load_file, file_data_line);
-    species = file_data_line;
+    seq->set_species(file_data_line);
     while ( (!load_file.eof())  && (file_data_line != "</Annotations>") )
     {
       getline(load_file,file_data_line);
@@ -789,13 +874,14 @@ Sequence::load_museq(fs::path load_file_path, int seq_num)
             annot_type = annot_value;
           }
         }
-        add_annotation(annot_name, annot_type, annot_begin, annot_end);
+        seq->add_annotation(annot_name, annot_type, annot_begin, annot_end);
       }
       //std::cout << "seq, annots: " << an_annot.start << ", " << an_annot.end
       //     << "-->" << an_annot.type << "::" << an_annot.name << std::endl;
     }
   }
-  load_file.close();
+  //load_file.close();  // not closed here: the stream is passed in by reference from the caller
+  return seq;
 }