diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py new file mode 100644 index 0000000..9d7b34f --- /dev/null +++ b/main.py @@ -0,0 +1,93 @@ +# Tool to fit discrete distributions to a set of data + + +import numpy as np +from scipy import stats +import matplotlib.pyplot as plt + + +def read_file(file_name: str) -> np.array: + return np.loadtxt(file_name, dtype=int) + + +def show_data(sample_data: np.array, ax: plt.Axes = None, title: str = None) -> plt.Axes: + r""" + Always a good idea to look at the data before doing anything to it + :param sample_data: + :param ax: + :param title: + :return: + """ + if ax is None: + _fig, ax = plt.subplots() + + # just count the unique integers and plot as bars + labels, counts = np.unique(sample_data, return_counts=True) + ax.bar(labels, counts, align='center', color='C1') + + if title: + ax.set_title(title) + + # set axes labels + ax.set_xlabel('data') + ax.set_ylabel('frequency') + + # non integer ticks are useless here + ax.xaxis.get_major_locator().set_params(integer=True) + ax.yaxis.get_major_locator().set_params(integer=True) + + return ax + + +def fit_uniform(sample_data: np.array): + """ + The discrete uniform distribution describes a random variable + that has equal probabilities of being any integers in the + half open range [a,b) + + :return: + """ + uniform_dist = stats.randint + + res = stats.fit(uniform_dist, sample_data, + bounds={'low': (0, 4), 'high': (0, 4)}) + res.plot() + plt.show() + print(res.params) + print(-np.sum(uniform_dist.logpmf(sample_data, *res.params))) + + return res + + +def fit_betabinom(sample_data: np.array): + + betabinom_dist = stats.betabinom + + res = stats.fit(betabinom_dist, sample_data, bounds={'n': (0, 4), 'a': (0, 4), 'b': (0, 4), 'loc': (0, 4)}) + + res.plot() + plt.show() + print(res.params) + print(-np.sum(betabinom_dist.logpmf(sample_data, *res.params))) + + +def fit_zipf(sample_data: np.array): + + zipf_dist = stats.zipf + + res = stats.fit(zipf_dist, sample_data, bounds={'a': (0, 4)}) + + res.plot() + plt.show() + print(res.params) + print(-np.sum(zipf_dist.logpmf(sample_data, *res.params))) + + +if __name__ == '__main__': + data = read_file('test_data.txt') + show_data(data) + plt.show() + + fit_uniform(data) + fit_betabinom(data) + fit_zipf(data) diff --git a/test_data.txt b/test_data.txt new file mode 100644 index 0000000..61e1013 --- /dev/null +++ b/test_data.txt @@ -0,0 +1,6 @@ +1 +1 +2 +1 +3 +2 \ No newline at end of file