-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathrsample
executable file
·58 lines (40 loc) · 1.41 KB
/
rsample
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
"""
[Reservoir sampler](https://en.wikipedia.org/wiki/Reservoir_sampling) in Python.
Selects k items uniformly at random from an arbitrary-length stream.
Usage:
cat file | rsample [--seed SEED] [K]
where `K` is the number of samples desired (default: 10) and `SEED` is an
optional integer used to seed the random number generator.
"""
import argparse
import random
import sys
# Reservoir capacity used when the caller does not ask for a specific size.
DEFAULT_SIZE = 10


class ReservoirSampler:
    """Keep a bounded random sample of the items passed to :meth:`add`.

    Attributes:
        size: Maximum number of items held in the reservoir.
        samples: The items currently held, at most ``size`` of them.
        num_seen: How many items have been offered via :meth:`add` so far.
    """

    def __init__(self, size=DEFAULT_SIZE):
        """Create an empty reservoir that holds up to ``size`` items."""
        self.samples = []
        self.num_seen = 0
        self.size = size

    def add(self, item):
        """Offer one stream item to the reservoir.

        While the reservoir has free room the item is always stored.
        Afterwards the item is accepted with probability
        ``size / num_seen`` and, when accepted, overwrites a uniformly
        chosen slot.
        """
        self.num_seen += 1

        # Filling phase: there is still room, so keep the item as-is.
        if len(self.samples) < self.size:
            self.samples.append(item)
            return

        # Replacement phase: accept with probability size / num_seen.
        # float() forces true division regardless of the operand types.
        keep_probability = float(self.size) / self.num_seen
        if random.random() >= keep_probability:
            return

        # Accepted: evict a uniformly chosen resident (randint is inclusive).
        slot = random.randint(0, self.size - 1)
        self.samples[slot] = item
def main(args):
    """Sample lines from standard input and print the chosen ones.

    Every line read from ``sys.stdin`` is offered to a
    ``ReservoirSampler``; once the input is exhausted the retained lines
    are printed one per line, in reservoir-slot order.

    Args:
        args: Parsed command-line namespace. Only ``args.num_samples``
            (the reservoir capacity) is read here.
    """
    pool = ReservoirSampler(size=args.num_samples)
    for line in sys.stdin:
        # Drop only the line terminator. A bare rstrip() would also eat
        # trailing spaces/tabs that belong to the line's content, so the
        # printed sample would no longer match the input line.
        pool.add(line.rstrip("\n"))
    for sample in pool.samples:
        print(sample)
if __name__ == "__main__":
    # Command-line interface: an optional integer seed and an optional
    # positional sample count that falls back to DEFAULT_SIZE.
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", default=None, type=int)
    parser.add_argument(
        "num_samples",
        nargs="?",
        default=DEFAULT_SIZE,
        type=int,
    )
    args = parser.parse_args()

    # Seed the module-level generator before any sampling happens; a seed
    # of None leaves the choice of seed to the random module.
    random.seed(args.seed)
    main(args)