156692db89289bb9d9adab135a9e81f7a8ae65f2
[helpful_scripts.git] / subset_matrix.py
1 #!/usr/bin/python3
2 """Select a fraction of rows from a file
3 """
4 import argparse
5 import random
6 import os
7 import sys
8
def main(cmdline=None):
    """Run the command-line row subsetter.

    Parse ``cmdline`` (argparse falls back to ``sys.argv[1:]`` when it is
    None), seed the random generator if a seed was supplied, then copy the
    header lines plus a random selection of the remaining lines from the
    input file to the output file, or to stdout when no output file was
    requested.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)

    # Compare against None so that an explicitly supplied but falsy seed
    # (e.g. an empty string) still makes the run reproducible.
    if args.seed is not None:
        random.seed(args.seed)

    # "filename" is a required positional declared with nargs=1, so argparse
    # has already rejected a missing value; it arrives as a one-element list.
    # The context managers guarantee both files are closed even when reading
    # or writing fails part-way through.
    with open(args.filename[0], 'rt') as instream:
        selected = subset(instream, args.header, args.include)
        if args.output:
            with open(args.output, 'wt') as outstream:
                outstream.writelines(selected)
        else:
            sys.stdout.writelines(selected)
31
def make_parser():
    """Build the argument parser for the row-subsetting command line.

    Accepts one positional input filename plus options for the output
    file, the per-line inclusion probability, the number of header lines
    that are always kept, and the random seed.
    """
    # Each entry is (flags, keyword settings) for one add_argument() call;
    # the order here is the order shown in the generated help text.
    option_specs = (
        (("filename",),
         {"nargs": 1, "help": "filename to read from"}),
        (("-o", "--output"),
         {"help": "output filename"}),
        (("-i", "--include"),
         {"default": 0.10, "type": float,
          "help": "probability to include a line [0..1]"}),
        (("--header",),
         {"default": 1, "type": int,
          "help": "number of header lines to include"}),
        (("-s", "--seed"),
         {"help": "specify seed"}),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli
45                         
46
def subset(instream, header_lines, include_fraction):
    """Yield the header lines, then a random subset of the remaining lines.

    The first ``header_lines`` items of ``instream`` are always yielded.
    Every later item is yielded only when ``random.random()`` returns a
    value below ``include_fraction``, so 0 keeps none of them and any
    value >= 1 keeps all of them.

    ``instream`` may be any iterable of lines (an open file, a list, a
    generator). If it holds fewer than ``header_lines`` items, whatever is
    available is yielded and the generator simply ends.
    """
    # Use one shared iterator so the body loop resumes where the header
    # loop stopped, and so plain iterables such as lists are accepted.
    lines = iter(instream)

    remaining = header_lines
    while remaining > 0:
        try:
            header = next(lines)
        except StopIteration:
            # Input ran out inside the header. Letting StopIteration escape
            # a generator turns into RuntimeError (PEP 479), so end cleanly.
            return
        yield header
        remaining -= 1

    for line in lines:
        if random.random() < include_fraction:
            yield line
61
# Run the command-line interface only when executed as a script, so that
# importing this module (e.g. to reuse subset()) has no side effects.
if __name__ == "__main__":
    main()
64