Commit patch to not break on spaces.
[bowtie.git] / sequence_io.h
1 #ifndef SEQUENCE_IO_H_
2 #define SEQUENCE_IO_H_
3
4 #include <string>
5 #include <vector>
6 #include <stdexcept>
7 #include <fstream>
8 #include <stdio.h>
9 #include <seqan/sequence.h>
10 #include <seqan/file.h>
11 #include "assert_helpers.h"
12 #include "pat.h"
13
14 /**
15  * Read a sequence file of the given format and alphabet type.  Store
16  * all of the extracted sequences in vector ss.  Note that SeqAn's
17  * policy for when it encounters characters not from the specified
18  * alphabet is to convert them to the lexicographically smallest
19  * character in the alphabet.
20  */
21 template <typename TStr, typename TFile>
22 static void readSequenceFile(const std::string& infile,
23                              std::vector<TStr>& ss,
24                              int64_t& baseCutoff, // limit for total bases
25                              int seqCutoff = -1,  // limit for sequences
26                              bool reverse = false)
27 {
28         typedef typename Value<TStr>::Type TVal;
29         static char buf[256 * 1024]; // fairly large input buffer
30         if(baseCutoff <= 0) return;
31         FILE *in = fopen(infile.c_str(), "r");
32         if(in == NULL) {
33                 cerr << "Could not open sequence file" << endl;
34                 throw 1;
35         }
36         // Associate large input buffer with FILE *in
37         if(setvbuf(in, buf, _IOFBF, 256 * 1024) != 0) {
38                 cerr << "Could not create input buffer for sequence file" << endl;
39                 throw 1;
40         }
41         // Read entries using SeqAn
42         int cnt = 0;
43         while(!feof(in)) {
44                 while(true) {
45                         ss.push_back(TStr()); // add a new empty string to the end
46                         // Fill the new empty string with the next sequence from
47                         // the file.  SeqAn allocates just enough mem for it (at
48                         // the expense of lots of file seeks, which can add up)
49                         seqan::read(in, ss.back(), TFile()); 
50                         if(seqan::empty(ss.back())) {
51                                 ss.pop_back();
52                                 break;
53                         }
54                         // Enforce the base cutoff
55                         if((int64_t)length(ss.back()) > baseCutoff) {
56                                 resize(ss.back(), baseCutoff);
57                                 baseCutoff = 0;
58                         } else {
59                                 baseCutoff -= length(ss.back());
60                         }
61                         // Reverse the newly-read sequence in-place if desired
62                         if(reverse) {
63                                 size_t len = length(ss.back());
64                                 for(size_t i = 0; i < len/2; i++) {
65                                         TVal t = ss.back()[i];
66                                         ss.back()[i] = ss.back()[len-i-1];
67                                         ss.back()[len-i-1] = t;
68                                 }
69                         }
70                         #ifndef NDEBUG
71                         // Sanity check that all (int) values are in range
72                         for(size_t i = 0; i < length(ss.back()); i++) {
73                                 assert_lt(ss.back()[i], (int)(ValueSize<TVal>::VALUE));
74                                 assert_geq(ss.back()[i], 0);
75                         }
76                         #endif
77                         cnt++;
78                         // Enforce the sequence cutoff
79                         if(seqCutoff != -1 && cnt >= seqCutoff) {
80                                 fclose(in);
81                                 return;
82                         }
83                 }
84         }
85         fclose(in);
86 }
87
88 /**
89  * Read a set of sequence files of the given format and alphabet type.
90  * Store all of the extracted sequences in vector ss.
91  */
92 template <typename TStr, typename TFile>
93 static void readSequenceFiles(const std::vector<std::string>& infiles,
94                               std::vector<TStr>& ss,
95                               int64_t& baseCutoff,
96                               int seqCutoff = -1,
97                               bool reverse = false)
98 {
99         for(size_t i = 0; i < infiles.size() && baseCutoff > 0; i++) {
100                 readSequenceFile<TStr,TFile>(infiles[i], ss, baseCutoff, seqCutoff, reverse);
101                 if(baseCutoff <= 0) break;
102         }
103 }
104
105 /**
106  * Read a set of sequence files of the given format and alphabet type.
107  * Store all of the extracted sequences in vector ss.
108  */
109 template <typename TStr, typename TFile>
110 static void readSequenceFiles(const std::vector<std::string>& infiles,
111                               std::vector<TStr>& ss,
112                               int seqCutoff = -1,
113                               bool reverse = false)
114 {
115         int64_t i = 0xffffffffll;
116         readSequenceFiles<TStr,TFile>(infiles, ss, i, seqCutoff, reverse);
117 }
118
119 /**
120  * Parse a comma-delimited list of strings of type T into a vector. 
121  */
122 template <typename T>
123 void readSequenceString(const std::string& s,
124                         std::vector<T>& ss,
125                         int64_t& baseCutoff,
126                         int seqCutoff = -1,
127                         bool reverse = false)
128 {
129         // Split string s using comma as a delimiter.  Borrowed from C++
130         // Programming HOWTO 7.3
131         std::string::size_type lastPos = s.find_first_not_of(",", 0);
132         std::string::size_type pos = s.find_first_of(",", lastPos);
133         while (baseCutoff > 0 && (std::string::npos != pos || std::string::npos != lastPos)) {
134                 string stmp = s.substr(lastPos, pos - lastPos);
135                 if((int64_t)stmp.length() < baseCutoff) {
136                         baseCutoff -= stmp.length();
137                 } else {
138                         stmp = stmp.substr(0, baseCutoff);
139                         baseCutoff = 0;
140                 }
141                 if(reverse) {
142                         size_t len = stmp.length();
143                         for(size_t i = 0; i < len/2; i++) {
144                                 char tmp = stmp[i];
145                                 stmp[i] = stmp[len-i-1];
146                                 stmp[len-i-1] = tmp;
147                         }
148                         ss.push_back(T(stmp.c_str()));
149                 } else {
150                         ss.push_back(T(stmp.c_str()));
151                 }
152                 if(seqCutoff != -1 && ss.size() >= (size_t)seqCutoff) {
153                         return;
154                 }
155             lastPos = s.find_first_not_of(",", pos);
156             pos = s.find_first_of(",", lastPos);
157         }
158 }
159
160 /**
161  * Parse a comma-delimited list of strings of type T into a vector. 
162  * Doesn't require callee to supply a baseCutoff.
163  */
164 template <typename T>
165 void readSequenceString(const std::string& s,
166                         std::vector<T>& ss,
167                         int seqCutoff = -1,
168                         bool reverse = false)
169 {
170         int64_t i = 0xffffffffll;
171         readSequenceString(s, ss, i, seqCutoff, reverse);
172 }
173
174 #endif /*SEQUENCE_IO_H_*/