improve fasta parsing
author Diane Trout <diane@caltech.edu>
Thu, 8 Jun 2006 22:23:09 +0000 (22:23 +0000)
committer Diane Trout <diane@caltech.edu>
Thu, 8 Jun 2006 22:23:09 +0000 (22:23 +0000)
This extends the parser to accept both upper- and lower-case IUPAC symbols
in our annotation file. (I'm also trying to simplify the grammar.)
Additionally, this patch throws an error when the result of reading a
FASTA sequence is empty.

alg/sequence.cpp
alg/sequence.hpp
alg/test/test_sequence.cpp

index 903d6c7dcb516b5d41bb8ed64051c01b84bbc6f1..3ff96fc08b9cba1e73a2c4aa810a359e2512c853 100644 (file)
@@ -27,6 +27,7 @@ namespace fs = boost::filesystem;
 #include <boost/spirit/core.hpp>
 #include <boost/spirit/actor/push_back_actor.hpp>
 #include <boost/spirit/iterator/file_iterator.hpp>
+#include <boost/spirit/utility/chset.hpp>
 namespace spirit = boost::spirit;
 
 #include "alg/sequence.hpp"
@@ -36,6 +37,15 @@ namespace spirit = boost::spirit;
 #include <iostream>
 #include <sstream>
 
+// some standard dna alphabets 
+// \012 = nl
+// \015 = cr
+// this should make our sequence parsing end-of-line convention 
+// independent
+static const char* dna_alphabet = "AaCcGgTtNn\012\015";
+static const char* rna_alphabet = "AaCcGgNnUu\012\015";
+static const char* iupac_alphabet = "AaCcGgTtUuRrYyMmKkSsWwBbDdHhVvNn\012\015";
+
 annot::annot() 
  : start(0),
    end(0),
@@ -57,9 +67,9 @@ motif::motif(int start, std::string motif)
    sequence(motif)
 {
 }
-  
+
 Sequence::Sequence()
 : sequence(""),
 sequence(""),
     header(""),
     species("")
 {
@@ -67,7 +77,9 @@ Sequence::Sequence()
   motif_list.clear();
 }
 
-Sequence::Sequence(std::string seq)
+Sequence::Sequence(std::string seq) 
+ :  header(""),
+    species("")
 {
   set_filtered_sequence(seq);
 }
@@ -159,6 +171,14 @@ Sequence::load_fasta(fs::path file_path, int seq_num,
         end_index = sequence_raw.size();
 
       // sequence filtering for upcasing agctn and convert non AGCTN to N
+      if (end_index-start_index <= 0) {
+        // there doesn't appear to be any sequence
+        std::stringstream msg;
+        msg << "The selected sequence in " 
+            << file_path.native_file_string()
+            << " appears to be empty"; 
+        throw mussa_load_error(msg.str());
+      }
       set_filtered_sequence(sequence_raw, start_index, end_index-start_index);
     } else {
       std::stringstream errormsg;
@@ -303,7 +323,7 @@ struct push_back_seq {
         seq_i != seq.end();
         ++seq_i)
     {
-      if (*seq_i != '\n') new_seq += *seq_i;
+      if (*seq_i != '\015' && *seq_i != '\012') new_seq += *seq_i;
     }
     //std::cout << "adding seq: " << name << " " << new_seq << std::endl;
     
@@ -345,14 +365,9 @@ Sequence::parse_annot(std::string data, int start_index, int end_index)
                        )[push_back_annot(annots, start, end, type, name)]
                      |
                       (spirit::ch_p('>') >> 
-                       (*(~spirit::chlit<char>('\n')))[spirit::assign_a(name)] >>
-                        +spirit::space_p >>
-                       (+(spirit::ch_p('A')|
-                          spirit::ch_p('G')|
-                          spirit::ch_p('C')|
-                          spirit::ch_p('T')|
-                          spirit::ch_p('N')|
-                          spirit::ch_p('\n')))[spirit::assign_a(seq)]
+                         (*(spirit::print_p))[spirit::assign_a(name)] >>
+                         spirit::eol_p >> 
+                         (+(spirit::chset<>(iupac_alphabet)))[spirit::assign_a(seq)]
                        )[push_back_seq(query_seqs, name, seq)]
                       ) >>
                       *spirit::space_p
index be1f70f5ed3c10dce4a86b0f05fa7ecd12caad26..f67ca834c98d81732e6cab5e3019fe9ed8d1dbb8 100644 (file)
@@ -139,4 +139,5 @@ class Sequence
     void save(boost::filesystem::fstream &save_file);
     void load_museq(boost::filesystem::path load_file_path, int seq_num); 
 };
+
 #endif
index 00369241fd58fa3ac1d11a578964d9b328afd8bf..eaf6e5c45bd77bfc61aab2f948caedade2096cc3 100644 (file)
@@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE( annotation_load )
                       "60 50 backward\n"
                       ">ident3 asdf\n"
                       "GCT\n"
-                      "GCTN\n"
+                      "gCTn\n"
                       "75\t90\tname2\ttype2\n"
                       ;
   string s(100, 'A');