9 #include <seqan/sequence.h>
10 #include <seqan/file.h>
11 #include "assert_helpers.h"
15 * Read a sequence file of the given format and alphabet type. Store
16 * all of the extracted sequences in vector ss. Note that SeqAn's
17 * policy for when it encounters characters not from the specified
18 * alphabet is to convert them to the lexicographically smallest
19 * character in the alphabet.
21 template <typename TStr, typename TFile>
22 static void readSequenceFile(const std::string& infile,
23 std::vector<TStr>& ss,
24 int64_t& baseCutoff, // limit for total bases
25 int seqCutoff = -1, // limit for sequences
28 typedef typename Value<TStr>::Type TVal;
29 static char buf[256 * 1024]; // fairly large input buffer
30 if(baseCutoff <= 0) return;
31 FILE *in = fopen(infile.c_str(), "r");
33 cerr << "Could not open sequence file" << endl;
36 // Associate large input buffer with FILE *in
37 if(setvbuf(in, buf, _IOFBF, 256 * 1024) != 0) {
38 cerr << "Could not create input buffer for sequence file" << endl;
41 // Read entries using SeqAn
45 ss.push_back(TStr()); // add a new empty string to the end
46 // Fill the new empty string with the next sequence from
47 // the file. SeqAn allocates just enough mem for it (at
48 // the expense of lots of file seeks, which can add up)
49 seqan::read(in, ss.back(), TFile());
50 if(seqan::empty(ss.back())) {
54 // Enforce the base cutoff
55 if((int64_t)length(ss.back()) > baseCutoff) {
56 resize(ss.back(), baseCutoff);
59 baseCutoff -= length(ss.back());
61 // Reverse the newly-read sequence in-place if desired
63 size_t len = length(ss.back());
64 for(size_t i = 0; i < len/2; i++) {
65 TVal t = ss.back()[i];
66 ss.back()[i] = ss.back()[len-i-1];
67 ss.back()[len-i-1] = t;
71 // Sanity check that all (int) values are in range
72 for(size_t i = 0; i < length(ss.back()); i++) {
73 assert_lt(ss.back()[i], (int)(ValueSize<TVal>::VALUE));
74 assert_geq(ss.back()[i], 0);
78 // Enforce the sequence cutoff
79 if(seqCutoff != -1 && cnt >= seqCutoff) {
89 * Read a set of sequence files of the given format and alphabet type.
90 * Store all of the extracted sequences in vector ss.
92 template <typename TStr, typename TFile>
93 static void readSequenceFiles(const std::vector<std::string>& infiles,
94 std::vector<TStr>& ss,
99 for(size_t i = 0; i < infiles.size() && baseCutoff > 0; i++) {
100 readSequenceFile<TStr,TFile>(infiles[i], ss, baseCutoff, seqCutoff, reverse);
101 if(baseCutoff <= 0) break;
106 * Read a set of sequence files of the given format and alphabet type.
107 * Store all of the extracted sequences in vector ss.
109 template <typename TStr, typename TFile>
110 static void readSequenceFiles(const std::vector<std::string>& infiles,
111 std::vector<TStr>& ss,
113 bool reverse = false)
115 int64_t i = 0xffffffffll;
116 readSequenceFiles<TStr,TFile>(infiles, ss, i, seqCutoff, reverse);
120 * Parse a comma-delimited list of strings of type T into a vector.
122 template <typename T>
123 void readSequenceString(const std::string& s,
127 bool reverse = false)
129 // Split string s using comma as a delimiter. Borrowed from C++
130 // Programming HOWTO 7.3
131 std::string::size_type lastPos = s.find_first_not_of(",", 0);
132 std::string::size_type pos = s.find_first_of(",", lastPos);
133 while (baseCutoff > 0 && (std::string::npos != pos || std::string::npos != lastPos)) {
134 string stmp = s.substr(lastPos, pos - lastPos);
135 if((int64_t)stmp.length() < baseCutoff) {
136 baseCutoff -= stmp.length();
138 stmp = stmp.substr(0, baseCutoff);
142 size_t len = stmp.length();
143 for(size_t i = 0; i < len/2; i++) {
145 stmp[i] = stmp[len-i-1];
148 ss.push_back(T(stmp.c_str()));
150 ss.push_back(T(stmp.c_str()));
152 if(seqCutoff != -1 && ss.size() >= (size_t)seqCutoff) {
155 lastPos = s.find_first_not_of(",", pos);
156 pos = s.find_first_of(",", lastPos);
161 * Parse a comma-delimited list of strings of type T into a vector.
162 * Doesn't require callee to supply a baseCutoff.
164 template <typename T>
165 void readSequenceString(const std::string& s,
168 bool reverse = false)
170 int64_t i = 0xffffffffll;
171 readSequenceString(s, ss, i, seqCutoff, reverse);
174 #endif /*SEQUENCE_IO_H_*/