1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
4 ============================================================================
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 ============================================================================
18 $Id: file_format_fasta.h,v 1.1 2008/08/25 16:20:03 langmead Exp $
19 ==========================================================================*/
21 #ifndef SEQAN_HEADER_FILE_FASTA_H
22 #define SEQAN_HEADER_FILE_FASTA_H
24 namespace SEQAN_NAMESPACE_MAIN
27 //////////////////////////////////////////////////////////////////////////////
28 // File Formats - Fasta
29 //////////////////////////////////////////////////////////////////////////////
32 .Tag.File Format.tag.Fasta:
33 FASTA file format for sequences.
// Format tag for FASTA: a const-qualified Tag<TagFasta_>. In this file it selects
// the FileReader<Fasta, ...> iterator specializations and is passed as `Fasta()`
// to pick the FASTA-specific overloads.
36 typedef Tag<TagFasta_> const Fasta;
38 //////////////////////////////////////////////////////////////////////////////
40 //////////////////////////////////////////////////////////////////////////////
// Initializes the reading state of a FASTA FileReader iterator.
//   it        - iterator to set up; its underlying stream is reached via host(it)
//   skip_meta - if true and the current character is '>', that line is skipped
//               with _stream_skipLine
// After the optional skip, '\n' and '\r' characters are consumed. If the next
// character is a '>', it is put back onto the stream with _streamUnget.
// On the path that reaches the end of the function, the iterator's
// data_file_pos and data_eof members are updated from the stream.
42 template <typename TFile, typename TFile2, typename TSpec>
44 goBegin(Iter<TFile, FileReader<Fasta, TFile2, TSpec> > & it, bool skip_meta)
// the stream is checked for end-of-file before anything is read
46 if (_streamEOF(host(it)))
// the current character starts a '>' line and the caller asked for it to be skipped
52 if (skip_meta && (it.data_char == '>'))
55 _stream_skipLine(host(it), it.data_char);
58 //eliminate linebreaks
59 while ((it.data_char == '\n') || (it.data_char == '\r'))
// the stream may end while line breaks are still being consumed
61 if (_streamEOF(host(it)))
66 it.data_char = _streamGet(host(it));
// a '>' found here is pushed back onto the stream
69 if (it.data_char == '>')
72 _streamUnget(host(it));
// stored position is one less than the stream's get position
// NOTE(review): presumably because data_char has already been consumed - confirm
76 it.data_file_pos = _streamTellG(host(it)) - 1;
77 it.data_eof = _streamEOF(host(it));
// Overload of goBegin for FASTA FileReader iterators that takes no skip_meta flag.
// NOTE(review): presumably forwards to goBegin(it, skip_meta) with a fixed flag
// value - confirm which value is used.
79 template <typename TFile, typename TFile2, typename TSpec>
81 goBegin(Iter<TFile, FileReader<Fasta, TFile2, TSpec> > & it)
// Advances a FASTA FileReader iterator to the next character.
//   it - iterator to advance; its underlying stream is reached via host(it)
// The next character of the stream is read into it.data_char. If it is a line
// break ('\n' or '\r'), reading continues until a character that is not a line
// break is found. If that character is '>', it is put back onto the stream with
// _streamUnget. End-of-file is checked before and after each read.
87 template <typename TFile, typename TFile2, typename TSpec>
89 goNext(Iter<TFile, FileReader<Fasta, TFile2, TSpec> > & it)
// nothing can be read if the stream is already at end-of-file
92 if (_streamEOF(host(it)))
98 it.data_char = _streamGet(host(it));
// the read above may have hit end-of-file
101 if (_streamEOF(host(it)))
107 if ((it.data_char == '\n') || (it.data_char == '\r'))
108 {//linebreak detected: find begin of next line
111 it.data_char = _streamGet(host(it));
112 if (_streamEOF(host(it)))
118 } while ((it.data_char == '\n') || (it.data_char == '\r'));
// the new line starts with '>': push that character back onto the stream
120 if (it.data_char == '>')
122 _streamUnget(host(it));
128 //////////////////////////////////////////////////////////////////////////////
129 // FileFormat Interface
130 //////////////////////////////////////////////////////////////////////////////
134 /////////////////////////////////////////////////////////////////////////
135 //count_valid: counts the characters that are not line breaks (input/output)
136 //count_all: counts all characters including line breaks (input/output)
137 //returns: the last character read = the first one after the line break, or eof
138 //the last read char is not counted!
139 //count_valid and count_all are not reset but incremented
140 template <typename TFile, typename TSize>
141 inline typename Value<TFile>::Type
142 _fasta_scan_line(TFile & file,
// precondition: the stream must not already be at end-of-file
147 SEQAN_ASSERT(!_streamEOF(file))
153 typename Value<TFile>::Type c = _streamGet(file);
// end-of-file while scanning: add what was counted so far to count_valid
155 if (_streamEOF(file))
157 count_valid += count;
// line break reached: consume the whole run of '\n' / '\r' characters
162 if ((c == '\n') || (c == '\r'))
167 c = _streamGet(file);
168 } while ((c == '\n') || (c == '\r'));
// add the number of characters counted on this line to count_valid
170 count_valid += count;
183 /////////////////////////////////////////////////////////////////////////
// Helper that iterates `count` times over `file`.
//   file  - stream to operate on
//   count - number of iterations
// It is called after _streamSeekG to move the file pointer forward to the next
// entry.
// NOTE(review): presumably each iteration reads and discards one character -
// confirm against the loop body.
184 template <typename TFile, typename TSize>
186 _read_n_chars_from_file(TFile & file, TSize count)
189 for (TSize i = 0; i < count; ++i)
196 //////////////////////////////////////////////////////////////////////////////
198 //////////////////////////////////////////////////////////////////////////////
// Reads the sequence data of one FASTA entry from `file` into `data`.
// The visible code works in two passes over the stream:
//   1. Scan: starting at begin_pos, lines are scanned with _fasta_scan_line,
//      accumulating count_valid (characters without line breaks) and count_all
//      (all characters) until end-of-file or the next entry is found.
//   2. Copy: the stream is moved back to begin_pos with _streamSeekG and the
//      characters are read again, skipping '\n' and '\r', until `count`
//      characters have been handled.
// Afterwards the file pointer is moved forward to the next entry.
// NOTE(review): the third template parameter TSize presumably belongs to a
// caller-supplied upper limit on the number of characters - confirm.
200 template <typename TFile, typename TData, typename TSize>
// precondition: the stream must not already be at end-of-file
209 SEQAN_ASSERT(!_streamEOF(file))
212 //determine begin position
213 typename Value<TFile>::Type c_first = _streamGet(file);
214 SEQAN_ASSERT(!_streamEOF(file))
216 typename Position<TFile>::Type begin_pos = _streamTellG(file);
// both counters start at 1 because c_first has already been read
217 typename Size<TData>::Type count_valid = 1; //"valid" characters read (without line breaks)
218 typename Size<TData>::Type count_all = 1; //all characters read (with line breaks)
220 if (_streamEOF(file))
226 {//there is an id line: skip it
227 c_first = _fasta_scan_line(file, count_valid, count_all);
230 if ((c_first == '>') || _streamEOF(file))
231 {//another id line = empty entry
// go back to the start and advance by count_all characters
232 _streamSeekG(file, begin_pos);
233 _read_n_chars_from_file(file, count_all);
// from here on begin_pos is the current get position of the stream
237 begin_pos = _streamTellG(file);
241 typename Value<TFile>::Type c;
242 bool eof_reached = false;
// scan one more line; c is the first character after its line break
246 c = _fasta_scan_line(file, count_valid, count_all);
247 if (_streamEOF(file))
248 {//end of file: stop searching
253 {//next entry found: stop searching
256 if ((c != '\n') && (c != '\r'))
258 ++count_valid; //count c
264 typename Size<TData>::Type count = count_valid;
// clamp count to the length that `data` actually has
270 if (length(data) < count)
272 count = length(data);
// second pass: rewind to the start of the sequence data
276 _streamSeekG(file, begin_pos);
278 typename Position<TData>::Type pos = 0;
// line breaks are not part of the sequence
282 if ((c != '\n') && (c != '\r'))
287 if (pos >= count) break;
289 c = _streamGet(file);
293 //move file ptr to next entry
294 _read_n_chars_from_file(file, count_all - 1);
299 //____________________________________________________________________________
// Convenience overload of read: forwards to the four-argument read, passing
// supremumValue<TSize>() of Size<TData>::Type as the third argument.
// NOTE(review): the third argument is presumably the maximum number of
// characters to read, so this overload reads without a practical limit - confirm.
301 template <typename TFile, typename TData>
308 typedef typename Size<TData>::Type TSize;
309 read(file, data, supremumValue<TSize>(), tag);
313 //////////////////////////////////////////////////////////////////////////////
315 //////////////////////////////////////////////////////////////////////////////
317 //the ID is the complete first line (without the leading '>'-sign)
// Reads the ID of the FASTA entry at the current stream position into `id`.
// The first line is scanned once with _fasta_scan_line to determine its length,
// `id` is resized accordingly, and the stream is moved back to start_pos to copy
// the characters. At the end the stream is moved back to start_pos again, so on
// this path the get position is the same as on entry.
319 template <typename TFile, typename TString>
// precondition: the stream must not already be at end-of-file
326 SEQAN_ASSERT(!_streamEOF(file))
// remember where the entry starts so the stream can be rewound later
328 typename Position<TFile>::Type start_pos = _streamTellG(file);
330 typename Value<TFile>::Type c = _streamGet(file);
337 typename Size<TString>::Type count_valid = 0;
338 typename Size<TString>::Type count_all = 0;
// count the characters of the line; the returned character is not needed here
339 _fasta_scan_line(file, count_valid, count_all);
347 resize(id, count_valid);
// if `id` ended up shorter than requested, copy only as many characters as fit
348 if (length(id) < count_valid)
350 count_valid = length(id);
353 _streamSeekG(file, start_pos);
354 c = _streamGet(file); //pop the '>' character
// loop runs count_valid times, counting count_valid down to zero
355 for (typename Position<TString>::Type pos = 0; count_valid; --count_valid)
357 id[pos] = _streamGet(file);
// restore the stream position that was saved at the start
362 _streamSeekG(file, start_pos);
365 //////////////////////////////////////////////////////////////////////////////
367 //////////////////////////////////////////////////////////////////////////////
369 //Fasta file records have no meta data
// readMeta for FASTA therefore forwards to readID with the Fasta tag, so `meta`
// receives whatever readID stores for the current entry.
//   file - stream to read from
//   meta - output passed through to readID
371 template <typename TFile, typename TMeta>
373 readMeta(TFile & file,
378 readID(file, meta, Fasta());
382 //////////////////////////////////////////////////////////////////////////////
384 //////////////////////////////////////////////////////////////////////////////
// Moves forward through `file` character by character.
// Each character is read with _streamGet; the function returns as soon as
// end-of-file is reached. When a line break ('\n' or '\r') is read, the
// whole run of consecutive line breaks is consumed.
// NOTE(review): found_data presumably records whether a non-header line has been
// seen, so that the routine can stop at the next '>' line - confirm.
386 template <typename TFile>
// precondition: the stream must not already be at end-of-file
392 SEQAN_ASSERT(!_streamEOF(file))
394 bool found_data = false;
397 typename Value<TFile>::Type c = _streamGet(file);
399 if (_streamEOF(file)) return;
// line break: consume all directly following line breaks as well
401 if (c == '\n' || c == '\r')
404 c = _streamGet(file);
405 if (_streamEOF(file)) return;
406 } while (c == '\n' || c == '\r');
423 //////////////////////////////////////////////////////////////////////////////
425 //////////////////////////////////////////////////////////////////////////////
// Writes one FASTA entry to `file`.
// Output layout produced by the visible statements:
//   '>' followed by `id` and a '\n'            (header line)
//   the elements of `data`, one _streamPut each (sequence)
//   a final '\n'
// Inside the element loop a '\n' is also written under a condition.
// NOTE(review): presumably this wraps the sequence after a fixed number of
// characters per line - confirm the line length.
428 template <typename TFile, typename TString, typename TData>
430 _write_impl(TFile & file,
// header line: '>' + id + line break
436 _streamPut(file, '>');
437 _streamWrite(file, id);
438 _streamPut(file, '\n');
440 //typename Iterator<TData, Standard>::Type it = begin(data, Standard());
441 //typename Iterator<TData, Standard>::Type it_end = end(data, Standard());
442 typename Iterator<TData>::Type it = begin(data);
443 typename Iterator<TData>::Type it_end = end(data);
// emit the sequence element by element
447 for (; it < it_end; ++it)
451 _streamPut(file, '\n');
456 _streamPut(file, *it);
// terminate the entry with a line break
458 _streamPut(file, '\n');
461 //____________________________________________________________________________
// Writes `data` as a FASTA entry with an empty ID: forwards to _write_impl with
// "" as the id argument and the Fasta tag.
463 template <typename TFile, typename TString, typename TData, typename TMeta>
470 _write_impl(file, data, "", Fasta());
473 //____________________________________________________________________________
// Writes `data` as a FASTA entry with the given `id`: forwards to _write_impl
// with the Fasta tag.
475 template <typename TFile, typename TString, typename TData>
483 _write_impl(file, data, id, Fasta());
487 //VisualC++ const array bug workaround
// Same forwarding to _write_impl as the other overloads, but templated on
// TDataValue instead of TData.
// NOTE(review): presumably `data` is taken as a pointer/array of TDataValue so
// that const arrays are accepted by VisualC++ - confirm against the signature.
488 template <typename TFile, typename TString, typename TDataValue>
496 _write_impl(file, data, id, Fasta());
500 //____________________________________________________________________________
// Overload with an additional TMeta template parameter: forwards `data` and `id`
// to _write_impl with the Fasta tag. No meta argument is passed on in the call.
502 template <typename TFile, typename TString, typename TData, typename TMeta>
511 _write_impl(file, data, id, Fasta());
515 //////////////////////////////////////////////////////////////////////////////
516 } //namespace SEQAN_NAMESPACE_MAIN
518 //////////////////////////////////////////////////////////////////////////////
520 #endif //#ifndef SEQAN_HEADER_...