#!/usr/bin/python3
# From: Diane Trout
# Date: Mon, 6 Apr 2015 23:29:48 +0000 (-0700)
# Subject: write matrix subsampler
# X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=f93b5ea7ba7953e84fbeb026363cb56a3faee201
"""Select a fraction of rows from a file
"""
import argparse
import itertools
import random
import os
import sys


def main(cmdline=None):
    """Run the command-line subsampler.

    Parses ``cmdline`` (or ``sys.argv[1:]`` when it is None), optionally
    seeds the module-level random generator, then copies the header lines
    and a random subset of the remaining lines from the input file to the
    output file, or to standard output when no output file is given.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)

    # Compare against None so that any explicitly supplied seed is honoured.
    # The seed is deliberately kept as the raw string from the command line.
    if args.seed is not None:
        random.seed(args.seed)

    # "filename" is a required positional with nargs=1, so argparse has
    # already rejected a missing filename and args.filename is a 1-item list.
    # The input is opened before the output, so a missing input file does
    # not create or truncate the output file.
    with open(args.filename[0], 'rt') as instream:
        selected = subset(instream, args.header, args.include)
        if args.output:
            with open(args.output, 'wt') as outstream:
                outstream.writelines(selected)
        else:
            sys.stdout.writelines(selected)


def make_parser():
    """Build the argparse parser for the command-line interface.

    Returns an ``argparse.ArgumentParser`` accepting one positional input
    filename plus the ``--output``, ``--include``, ``--header`` and
    ``--seed`` options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", nargs=1,
                        help="filename to read from")
    parser.add_argument("-o", "--output",
                        help="output filename")
    parser.add_argument("-i", "--include", default=0.10, type=float,
                        help="probability to include a line [0..1]")
    parser.add_argument("--header", default=1, type=int,
                        help="number of header lines to include")
    parser.add_argument("-s", "--seed",
                        help="specify seed")
    return parser


def subset(instream, header_lines, include_fraction):
    """Subset lines from a file

    Always include the first specified number of 'header_lines'
    then after that include lines if they meet the random threshold

    instream is any iterable of lines, header_lines is the number of
    leading lines to pass through unconditionally (values below one mean
    no header), and include_fraction is the probability with which each
    remaining line is yielded.

    If the input holds fewer lines than header_lines, the lines that do
    exist are yielded and the generator simply ends.
    """
    # Take a single iterator up front so the header and the body consume
    # the same stream even when instream is a re-iterable such as a list.
    lines = iter(instream)

    # islice stops quietly at end of input; calling next() here would leak
    # StopIteration out of the generator, which Python 3.7+ turns into
    # RuntimeError.
    for line in itertools.islice(lines, max(header_lines, 0)):
        yield line

    for line in lines:
        if random.random() < include_fraction:
            yield line


if __name__ == "__main__":
    main()