#!/usr/bin/env python3

# https://eax.me/approximate-percentile/

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from random import random
import argparse

def plot_results(fname, xs, ys, down_errs, up_errs):
    dpi = 80
    fig = plt.figure(dpi = dpi, figsize = (512 / dpi, 384 / dpi) )
    mpl.rcParams.update({'font.size': 10})

    plt.errorbar(xs, ys, yerr = [down_errs, up_errs], fmt='-', color='#1f76b4',
                ecolor='#002036', elinewidth=1.5, capsize=3)
    plt.xlabel('Reservoir size (%)')
    plt.ylabel('Error (%)')
    plt.legend()
    plt.grid(True)
    fig.savefig(fname)

class ReservoirSampler:
    def __init__(self, nsamples):
        self.nsamples = nsamples
        self.nseen = 0
        self.result = []

    def sample(self, item):
        self.nseen += 1
        if self.nseen <= self.nsamples:
            self.result += [item]
        else:
            if random() < self.nsamples/self.nseen:
                i = int(random()*self.nsamples)
                self.result[i] = item

parser = argparse.ArgumentParser(description='Reservoir sampling test')
parser.add_argument('--percentile', type=int, default=50,
                    help='percentile to calculate (default: 50)')
parser.add_argument('--distribution', type=str, choices=['uniform', 'lognormal'],
                    default='uniform', help='distribution type (default: uniform)')
parser.add_argument('--total-samples', type=int, default=10_000,
                    help='total number of samples (default: 10_000)')
parser.add_argument('--tests-number', type=int, default=100,
                    help='number of test iterations (default: 100)')
parser.add_argument('--reservoir-start', type=int, default=5,
                    help='starting percentage for reservoir size (default: 5)')
parser.add_argument('--reservoir-stop', type=int, default=100,
                    help='ending percentage for reservoir size (default: 100)')
parser.add_argument('--reservoir-step', type=int, default=5,
                    help='step size for reservoir size percentage (default: 5)')

args = parser.parse_args()

xs = [ x for x in range(args.reservoir_start, args.reservoir_stop+1, args.reservoir_step) ]
ys = []
down_errs = []
up_errs = []

sigma = 0.8
mu = np.log(100)

for percentage in xs:
    errors = []
    for ntest in range(0, args.tests_number):
        sampler = ReservoirSampler(args.total_samples*percentage/100)
        all_vals = []
        for i in range(0, args.total_samples+1):
            val = 0
            if args.distribution == 'uniform':
                val = int(random()*5000)
            else:
                val = np.random.lognormal(mu, sigma)

            sampler.sample(val)
            all_vals += [val]

        sampler.result.sort()
        n = int(len(sampler.result)*args.percentile/100)
        approx_percentile = sampler.result[n]

        all_vals.sort()
        n = int(len(all_vals)*args.percentile/100)
        true_percentile = all_vals[n]

        errors += [ abs(true_percentile - approx_percentile)*100/true_percentile ]

    errors.sort()
    ys += [ errors[int(len(errors)*0.95)] ]
    down_errs += [ ys[-1] - errors[0] ]
    up_errs += [ errors[-1] - ys[-1] ]
    print("{} %: min err: {}, max err: {}, 95th percentile: {}".format(
        percentage, errors[0], errors[-1], errors[int(len(errors)*0.95)]))

fname = 'experiment-{}-{}.png'.format(args.distribution, args.percentile)
plot_results(fname, xs, ys, down_errs, up_errs)