write matrix subsampler
author Diane Trout <diane@ghic.org>
Mon, 6 Apr 2015 23:29:48 +0000 (16:29 -0700)
committer Diane Trout <diane@ghic.org>
Mon, 6 Apr 2015 23:29:48 +0000 (16:29 -0700)
subset_matrix.py [new file with mode: 0644]

diff --git a/subset_matrix.py b/subset_matrix.py
new file mode 100644 (file)
index 0000000..156692d
--- /dev/null
@@ -0,0 +1,64 @@
+#!/usr/bin/python3
+"""Select a fraction of rows from a file
+"""
+import argparse
+import random
+import os
+import sys
+
+def main(cmdline=None):
+    """Command line entry point: write a random subset of lines.
+
+    Parses 'cmdline' (or sys.argv[1:] when it is None), optionally seeds
+    the random module, then copies the lines chosen by subset() from the
+    input file to the --output file, or to sys.stdout when no output
+    filename was given.
+    """
+    parser = make_parser()
+    args = parser.parse_args(cmdline)
+
+    # Test against None rather than truthiness so that a falsy seed
+    # supplied on the command line (e.g. an empty string) is still used.
+    if args.seed is not None:
+        random.seed(args.seed)
+
+    # The positional filename is declared with nargs=1, so argparse
+    # normally rejects a missing filename itself; this is a defensive
+    # guard that keeps the original error message.
+    if not args.filename:
+        parser.error("Please specify input filename")
+
+    # The with-block guarantees the input file is closed, even when
+    # reading or writing fails part way through.
+    with open(args.filename[0], 'rt') as instream:
+        if args.output:
+            outstream = open(args.output, 'wt')
+        else:
+            outstream = sys.stdout
+
+        try:
+            for line in subset(instream, args.header, args.include):
+                outstream.write(line)
+        finally:
+            # Only close a file opened here; sys.stdout must stay open.
+            if args.output:
+                outstream.close()
+
+def make_parser():
+    """Build the argparse parser used by the command line interface.
+
+    The parser takes one positional input filename and the optional
+    -o/--output, -i/--include, --header and -s/--seed settings.
+    """
+    # Each entry is (option strings, keyword settings for add_argument).
+    option_table = (
+        (("filename",),
+         {"nargs": 1, "help": "filename to read from"}),
+        (("-o", "--output"),
+         {"help": "output filename"}),
+        (("-i", "--include"),
+         {"default": 0.10, "type": float,
+          "help": "probability to include a line [0..1]"}),
+        (("--header",),
+         {"default": 1, "type": int,
+          "help": "number of header lines to include"}),
+        (("-s", "--seed"),
+         {"help": "specify seed"}),
+    )
+
+    cli = argparse.ArgumentParser()
+    for flags, settings in option_table:
+        cli.add_argument(*flags, **settings)
+    return cli
+                        
+
+def subset(instream, header_lines, include_fraction):
+    """Subset lines from a file
+
+    Always include the first specified number of 'header_lines'
+    then after that include lines if they meet the random threshold
+
+    'instream' may be any iterable of lines (an open file, a list, ...).
+    Each remaining line is yielded when random.random() is below
+    'include_fraction'. If the input holds fewer lines than
+    'header_lines' the generator simply ends after the last line.
+    """
+    # Take an explicit iterator so plain iterables work and so the
+    # header loop and the sampling loop share one read position.
+    lines = iter(instream)
+
+    while header_lines > 0:
+        # A StopIteration escaping a generator body is turned into
+        # RuntimeError (PEP 479), so end cleanly on short input instead.
+        try:
+            line = next(lines)
+        except StopIteration:
+            return
+        yield line
+        header_lines -= 1
+
+    for line in lines:
+        if random.random() < include_fraction:
+            yield line
+
+# Run the command line interface only when executed as a script,
+# not when this module is imported.
+if __name__ == "__main__":
+    main()
+