#!/usr/bin/env python3 # https://eax.me/approximate-percentile/ import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np from random import random import argparse def plot_results(fname, xs, ys, down_errs, up_errs): dpi = 80 fig = plt.figure(dpi = dpi, figsize = (512 / dpi, 384 / dpi) ) mpl.rcParams.update({'font.size': 10}) plt.errorbar(xs, ys, yerr = [down_errs, up_errs], fmt='-', color='#1f76b4', ecolor='#002036', elinewidth=1.5, capsize=3) plt.xlabel('Reservoir size (%)') plt.ylabel('Error (%)') plt.legend() plt.grid(True) fig.savefig(fname) class ReservoirSampler: def __init__(self, nsamples): self.nsamples = nsamples self.nseen = 0 self.result = [] def sample(self, item): self.nseen += 1 if self.nseen <= self.nsamples: self.result += [item] else: if random() < self.nsamples/self.nseen: i = int(random()*self.nsamples) self.result[i] = item parser = argparse.ArgumentParser(description='Reservoir sampling test') parser.add_argument('--percentile', type=int, default=50, help='percentile to calculate (default: 50)') parser.add_argument('--distribution', type=str, choices=['uniform', 'lognormal'], default='uniform', help='distribution type (default: uniform)') parser.add_argument('--total-samples', type=int, default=10_000, help='total number of samples (default: 10_000)') parser.add_argument('--tests-number', type=int, default=100, help='number of test iterations (default: 100)') parser.add_argument('--reservoir-start', type=int, default=5, help='starting percentage for reservoir size (default: 5)') parser.add_argument('--reservoir-stop', type=int, default=100, help='ending percentage for reservoir size (default: 100)') parser.add_argument('--reservoir-step', type=int, default=5, help='step size for reservoir size percentage (default: 5)') args = parser.parse_args() xs = [ x for x in range(args.reservoir_start, args.reservoir_stop+1, args.reservoir_step) ] ys = [] down_errs = [] up_errs = [] sigma = 0.8 mu = np.log(100) for percentage in xs: errors = [] for ntest in range(0, args.tests_number): sampler = ReservoirSampler(args.total_samples*percentage/100) all_vals = [] for i in range(0, args.total_samples+1): val = 0 if args.distribution == 'uniform': val = int(random()*5000) else: val = np.random.lognormal(mu, sigma) sampler.sample(val) all_vals += [val] sampler.result.sort() n = int(len(sampler.result)*args.percentile/100) approx_percentile = sampler.result[n] all_vals.sort() n = int(len(all_vals)*args.percentile/100) true_percentile = all_vals[n] errors += [ abs(true_percentile - approx_percentile)*100/true_percentile ] errors.sort() ys += [ errors[int(len(errors)*0.95)] ] down_errs += [ ys[-1] - errors[0] ] up_errs += [ errors[-1] - ys[-1] ] print("{} %: min err: {}, max err: {}, 95th percentile: {}".format( percentage, errors[0], errors[-1], errors[int(len(errors)*0.95)])) fname = 'experiment-{}-{}.png'.format(args.distribution, args.percentile) plot_results(fname, xs, ys, down_errs, up_errs)