"""Select a fraction of rows from a file."""
4 # Copyright (2015) Diane Trout & California Institute of Technology
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
def main(cmdline=None):
    """Copy a random subset of the input file's lines to the output.

    The first ``--header`` lines are always copied; every later line is
    copied with probability ``--include``.

    cmdline is the list of argument strings to parse. When it is None,
    argparse falls back to the process command line (sys.argv[1:]).

    Output goes to the file named by ``--output`` when given, otherwise
    to standard output.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)

    # Only reseed when a seed was actually supplied, so that unseeded runs
    # keep the default (non-reproducible) random state.
    if args.seed is not None:
        random.seed(args.seed)

    if not args.filename:
        # parser.error() prints the usage message and exits.
        parser.error("Please specify input filename")

    # Context managers make sure both files are closed even if reading or
    # writing fails part way through.
    with open(args.filename[0], 'rt') as instream:
        selected = subset(instream, args.header, args.include)
        if args.output:
            with open(args.output, 'wt') as outstream:
                outstream.writelines(selected)
        else:
            # sys.stdout is not ours to close, so it stays outside "with".
            sys.stdout.writelines(selected)
def make_parser():
    """Build the command line parser for this script.

    Returns an argparse.ArgumentParser that accepts:

    filename       input file to read (a one element list, from nargs=1)
    -o, --output   output filename; None when not given
    -i, --include  probability of keeping a non-header line, default 0.10
    --header       number of leading lines that are always kept, default 1
    -s, --seed     optional seed for the random number generator
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", nargs=1,
                        help="filename to read from")
    parser.add_argument("-o", "--output",
                        help="output filename")
    parser.add_argument("-i", "--include", default=0.10, type=float,
                        help="probability to include a line [0..1]")
    parser.add_argument("--header", default=1, type=int,
                        help="number of header lines to include")
    # No type= here: the seed is handed over as the string the user typed.
    # NOTE(review): confirm this matches the original declaration.
    parser.add_argument("-s", "--seed",
                        help="seed for the random number generator")
    return parser
def subset(instream, header_lines, include_fraction):
    """Subset lines from a file

    Always include the first specified number of 'header_lines'
    then after that include lines if they meet the random threshold

    instream is any iterable of lines (an open text file, a list, ...).
    header_lines is the count of leading lines yielded unconditionally;
    zero or a negative value means no header.
    include_fraction is the probability of yielding each later line: a
    line is kept when random.random() < include_fraction, so 0.0 keeps
    none of them and 1.0 keeps all of them.

    This is a generator; lines are yielded in input order.
    """
    # iter() lets callers pass a plain sequence as well as a file object.
    lines = iter(instream)

    while header_lines > 0:
        try:
            line = next(lines)
        except StopIteration:
            # Input is shorter than the requested header. Finish quietly;
            # letting StopIteration escape a generator body would surface
            # as RuntimeError (PEP 479).
            return
        yield line
        header_lines -= 1

    for line in lines:
        if random.random() < include_fraction:
            yield line
# Run the command line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()