-
Notifications
You must be signed in to change notification settings - Fork 0
/
fit_discrete.py
102 lines (75 loc) · 3.19 KB
/
fit_discrete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Tool to fit discrete distributions to a set of data
# This version fits the discrete uniform, beta binomial and Zipfian distributions
# Usage:
# >>> fit_discrete.py test_data.txt
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def read_file(file_name: str) -> np.ndarray:
    r"""
    Load integer data from a text file
    :param file_name: path of the text file to read
    :return: integer array with at least one dimension
    """
    # ndmin=1 keeps a file that holds a single value as a length-1 array
    # instead of a 0-d array, which cannot be iterated or indexed
    return np.loadtxt(file_name, dtype=int, ndmin=1)
def show_data(sample_data: np.array, ax: plt.Axes = None, title: str = None) -> plt.Axes:
    r"""
    Draw a bar chart of how often each distinct value occurs in the data
    :param sample_data: data to count and plot
    :param ax: axes to draw on; a new figure and axes are created when omitted
    :param title: optional title for the plot
    :return: the axes that were drawn on
    """
    if ax is None:
        ax = plt.subplots()[1]

    # one bar per distinct value, bar height = number of occurrences
    values, frequencies = np.unique(sample_data, return_counts=True)
    ax.bar(values, frequencies, align='center', color='C1')

    if title:
        ax.set_title(title)

    ax.set_xlabel('data')
    ax.set_ylabel('frequency')

    # both axes show counts of integers, so ask each locator for integer ticks
    for axis in (ax.xaxis, ax.yaxis):
        axis.get_major_locator().set_params(integer=True)

    return ax
def fit_distribution(sample_data: np.array, distribution: stats.rv_discrete, bounds):
    r"""
    Fit a scipy discrete distribution to the sample via scipy.stats.fit
    :param sample_data: 1D data array
    :param distribution: scipy discrete distribution to fit
    :param bounds: dict of parameter bounds; the keys depend on the distribution
    :return: the result object returned by scipy.stats.fit
    """
    return stats.fit(dist=distribution, data=sample_data, bounds=bounds)
def guess_bounds(sample_data: np.array):
    r"""
    Build rough parameter bounds for each supported distribution from the data range
    :param sample_data: 1D data array
    :return: dict keyed by distribution name, each value being the bounds dict for that distribution
    """
    smallest, largest = min(sample_data), max(sample_data)

    # ranges shared by several of the distributions
    loc_range = (smallest - 1, largest)
    shape_range = (0, largest * 10)

    return {
        'discrete uniform': {'low': smallest, 'high': largest + 1, 'loc': loc_range},
        'beta binomial': {'n': largest - smallest, 'a': shape_range, 'b': shape_range, 'loc': loc_range},
        'zipfian': {'a': (-1, largest * 10), 'loc': loc_range},
    }
def set_plot(rows: int = 2, cols: int = 2):
    r"""
    Create the figure holding a grid of subplots for the data and the fits
    :param rows: number of subplot rows
    :param cols: number of subplot columns
    :return: flat 1D array of the rows*cols axes
    """
    # squeeze=False always returns a 2D array of axes, so ravel() also works
    # for a 1x1 grid (where subplots would otherwise return a bare Axes)
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 12), squeeze=False)
    fig.subplots_adjust(hspace=0.5)
    fig.suptitle("Fitted discrete distributions", fontsize=18, y=0.95)
    return axes.ravel()
if __name__ == '__main__':
    import sys

    # the usage note documents a data file argument; honour it and fall back
    # to 'test_data.txt' when no argument is given
    data_file = sys.argv[1] if len(sys.argv) > 1 else 'test_data.txt'
    data = read_file(data_file)

    # default 2x2 grid: first panel for the raw data, one panel per fitted distribution
    axs = set_plot()
    show_data(data, ax=axs[0], title='Input data')

    distributions = {'discrete uniform': stats.randint,
                     'beta binomial': stats.betabinom,
                     'zipfian': stats.zipf}
    bounds = guess_bounds(data)

    for i, (key, dist) in enumerate(distributions.items()):
        result = fit_distribution(sample_data=data,
                                  distribution=dist,
                                  bounds=bounds[key])
        if result.success:
            # axs[0] holds the input data, so fits start at axs[1]
            result.plot(ax=axs[i + 1])
            print(f'Successfully fitted the {key} distribution:')
            print(f' the fit parameters are: {result.params}')
            print(f' the negative log likelihood is: {result.nllf()} \n')
        else:
            print(f'Failed to fit the {key} distribution. Check the bounds!')

    # show final plot
    plt.show()