+#!/usr/bin/python3
+"""Select a fraction of rows from a file
+"""
+import argparse
+import random
+import os
+import sys
+
def main(cmdline=None):
    """Run the command-line tool: copy a random subset of lines to the output.

    The header lines are always copied; every later line is copied with the
    probability given by ``--include``.

    Args:
        cmdline: Optional list of argument strings handed to
            ``parser.parse_args``.  ``None`` lets argparse read the
            process's own command line.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)

    # Compare against None rather than truthiness so that any seed the user
    # explicitly supplied (even an empty string) is honoured.
    if args.seed is not None:
        random.seed(args.seed)

    if not args.filename:
        parser.error("Please specify input filename")

    # The input is opened before the output so that a missing or unreadable
    # input file never truncates an existing output file.  The ``with`` blocks
    # guarantee both handles are closed even if reading or writing fails.
    with open(args.filename[0], 'rt') as instream:
        selected = subset(instream, args.header, args.include)
        if args.output:
            with open(args.output, 'wt') as outstream:
                outstream.writelines(selected)
        else:
            # stdout belongs to the interpreter; write to it but never close it.
            sys.stdout.writelines(selected)
+
def make_parser():
    """Build the argument parser for the line-subsetting script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts:

        * ``filename`` -- required positional; stored as a one-element list
          because it is declared with ``nargs=1``.
        * ``-o`` / ``--output`` -- output filename (default ``None``).
        * ``-i`` / ``--include`` -- float inclusion probability
          (default ``0.10``).
        * ``--header`` -- integer count of header lines (default ``1``).
        * ``-s`` / ``--seed`` -- seed, kept as the raw string
          (default ``None``).
    """
    cli = argparse.ArgumentParser()

    # (names, keyword options) pairs, registered in the order shown by --help.
    option_table = (
        (("filename",),
         {"nargs": 1, "help": "filename to read from"}),
        (("-o", "--output"),
         {"help": "output filename"}),
        (("-i", "--include"),
         {"default": 0.10, "type": float,
          "help": "probability to include a line [0..1]"}),
        (("--header",),
         {"default": 1, "type": int,
          "help": "number of header lines to include"}),
        (("-s", "--seed"),
         {"help": "specify seed"}),
    )
    for names, options in option_table:
        cli.add_argument(*names, **options)

    return cli
+
+
def subset(instream, header_lines, include_fraction):
    """Yield the header lines of a stream plus a random sample of the rest.

    The first ``header_lines`` lines are always yielded.  Each later line is
    yielded only when a fresh ``random.random()`` draw is below
    ``include_fraction``.

    Args:
        instream: Iterable of lines (for example an open text file).
        header_lines: Number of leading lines to pass through
            unconditionally; zero or a negative value means no header.
        include_fraction: Probability of keeping each non-header line.

    Yields:
        The selected lines, unchanged and in their original order.
    """
    remaining_header = header_lines

    # A single loop replaces the former ``next(instream)`` calls: when the
    # input has fewer lines than ``header_lines`` the loop simply ends,
    # instead of leaking StopIteration out of the generator (which Python 3.7+
    # turns into RuntimeError, per PEP 479).
    for line in instream:
        if remaining_header > 0:
            remaining_header -= 1
            yield line
        elif random.random() < include_fraction:
            # The random draw happens only for non-header lines, so a given
            # seed selects the same lines as before.
            yield line
+
# Run the command-line interface only when this file is executed as a script,
# so that importing the module has no side effects.
if __name__ == "__main__":
    main()
+