-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoarse-workload.py
214 lines (201 loc) · 10.5 KB
/
coarse-workload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
from sigma import sigma
from omega import omega
from kmin_bravo import kmin_bravo
from kmin_minerva import kmin_minerva
from round_size_bravo import round_size_bravo
from round_size_minerva2 import round_size_minerva2
import numpy as np
from scipy.stats import binom
from tqdm import tqdm
from scipy.signal import convolve
#temp
import matplotlib.pyplot as plt
uhohcount = 0
# if each ballot has fixed cost, and rounds have no overhead, selection-ordered bravo is the most efficient RLA
# but since there is at least some overhead, which round schedule minimizes cost (workload)
ballotcost = 1
roundcost = 1000 # mostly due to opening boxes (but also entering sample, getting decision, preparing next sample)
# this workload minimization only makes sense in the context of a certain audit and its parameters
# we begin with the presidenital contest in pennsylvania in 2020
# audit parameters
biden = 3458229
trump = 3377674
margin = (biden - trump) / (trump + biden) # announced margin
p1 = biden / (trump + biden) # announced proportion of winner votes
p0 = .5 # tie
alpha = .1 # risk limit
# option 1: use fixed sprob and find which sprob minimizes workload
# option 2: we could use fixed marginal round sizes finding which minimizes workload
# both entertined in previous paper, we begin by trying option 2
marginal_round_size = 10000 #constant.
# since infinite rounds might take too long, let's have a max
MAX_ROUNDS = 5
# let's compute the cost of this round schedule [1*marginal_round_size, 2*marginal_round_size, ...]
# cost(round schedule) = (expec total num ballots) * (per ballot cost) + (expec num rounds) * (per ballot cost)
# kmins are computed in advance to make sprob calculations faster (can use lookup table not search)
divs = 1000
kmins = np.load('kmins-'+str(divs)+'riskdivs.npy')
def lookupkmin(risk):
half = (1 / divs) / 2
ith = int((risk + half) * divs)
if ith == 0:
idx = 0
else:
idx = ith - 1
if idx >= len(kmins) or idx < 0:
ValueError('lookupkmin - bad idx:',idx)
return kmins[idx]
# FIRST ROUND
roundnum = 1
exp_num_rounds, exp_num_ballots, prob_reach_this_round = 0, 0, 1
#get kmin from lookup table and use it to compute easily the sprob
kmin1 = lookupkmin(alpha)
sprob1 = binom.sf(kmin1-1, marginal_round_size, p1)
print('round',roundnum,'sprob',sprob1)
exp_num_rounds += roundnum * sprob1
exp_num_ballots += marginal_round_size * prob_reach_this_round
print('after round '+str(roundnum)+', expected ballots '+str(exp_num_ballots)+' and expected rounds '+str(exp_num_rounds))
nprev = 0
nprev += marginal_round_size
kprevs = np.arange(nprev+1,dtype=int)
pr_kprevs = binom.pmf(kprevs, nprev, p1)
# zero out those above kmin1 (those which stop in the first round)
pr_kprevs[int(kmin1):] = 0
# going into second round we know that kprevs are just a binomial but also as above can be convolution still
#pr_kprevs = binom.pmf(kprevs, nprev, p1)
# SECOND ROUND (and beyond by induction!)# already set for each time thru this new round loop: kprevs, nprev, pr_kprevs
# for rounds 2 and on, the audit has a different behavior for each kprev (cumulative sample of previous winner ballots)
# make sure that initial values for the preceeding round are set before proceeding onto rounds 2 and on
sprobprev = sprob1
sprobs = [sprob1]
while roundnum <= MAX_ROUNDS - 1:
roundnum += 1
print('round',roundnum)
n = nprev + marginal_round_size
prob_reach2 = 1 - sum(sprobs)
sprob2 = 0
# for each possible previous sample, we compute the sprobs (and update the expectations accordingly)
print('computing sprob...')
for kprev in tqdm(kprevs):
# to do this more quickly, we make the observation that:
# sigma(kprev,nprev) * tau_1(k',n') >= 1 / alpha
# is equivalent to
# tau_1(k',n') >= 1 / alpha' where alpha' = alpha * sigma(kprev,nprev)
# so we can, for a modified risk limit, find the minerva kmin
# we do so using a precomputed look-up table to save time
if pr_kprevs[kprev] == 0:
continue
sigmaprev = sigma(kprev, nprev, p1, p0)
alphaprime = alpha * sigmaprev
if alphaprime >= 1:
uhohcount += 1
#print('uhohcount, kprev, nprev, p1, p0, alphaprime, pr_kprevs[kprev], sigmaprev')
#print(uhohcount, kprev, nprev, p1, p0, alphaprime, pr_kprevs[kprev], sigmaprev)
# need to compute alphaprime from scratch...
kmin = kmin_minerva(marginal_round_size, p1, p0, alphaprime)
plt.plot(pr_kprevs)
else:
kmin = lookupkmin(alphaprime)
sprob = pr_kprevs[kprev] * binom.sf(kmin-1, marginal_round_size, p1)
sprob2 += sprob
# now print the sprobs for this round
#sprob2 = cond_sprob2 * prob_reach2 #sprob2 then is the 'absolute' probability that the audit stops in round 2
sprobs.append(sprob2) #keep track of this sprob so that we can know the cumulative stopping probability
print('round',roundnum,'sprob',sprob2)
print('round',roundnum,'cumulative sprob',sum(sprobs))
exp_num_rounds += roundnum * sprob2
exp_num_ballots += n * sprob2
print('after round '+str(roundnum)+', expected ballots '+str(exp_num_ballots)+' and expected rounds '+str(exp_num_rounds))
# it will be necessary in the next round to know probability of getting each possible kprev: so we compute that here now
# in minerva you can simply lop of the previous distributions tail and convolve with the marginal draw distribution,
# but in providence you cannot do so since kmins depend on the previous value of k, and so we manually compute the distribution
biggest_possible_k2 = nprev + marginal_round_size
k2s = np.arange(0,biggest_possible_k2+1,1,dtype=int) # 0 thru kmin-1 (so index is same as the kprev, nicely)
marginal_pr_ks = binom.pmf(range(marginal_round_size+1), marginal_round_size, p1)
pr_k2s = np.zeros_like(k2s, dtype=float)
k2sparsity = 50 # we skip (and then fill in approximately) a bunch of k2s in the distribution
kprevsparsity = 50 # we skip (and then fill in approximately) a bunch of kprevs when computing pr_k2 for a particular k2
last_pr_k2, second_to_last_pr_k2 = -1, -1
last_k2, second_to_last_k2 = -1, -1
print('computing probability distribution for next round...')
stopped = 0
for k2 in tqdm(k2s):
if not k2 % k2sparsity == 0:
continue
# for now, this continues (and later, when the next point is found, it will be filled in with an approximate value)
lastpr, secondlastpr = 0,0
for kprev in kprevs:
if not kprev % kprevsparsity == 0:
continue
# for now, this continues (and later, when the next point is found, it will be filled in with an approximate value)
stop = (omega(roundnum, k2 - kprev, marginal_round_size, kprev, nprev, p1, p0) >= 1/alpha)
#print(roundnum, k2 - kprev, marginal_round_size, kprev, nprev, p1, p0
#omega(j, dnew, mnew, kprev, nprev, p_1, p_0):
if stop:
# this audit 'path' is ignored when computing probability of each kprev for the next round
# (since this path did not proceed to the next round)
continue
if k2 - kprev < 0 or k2 - kprev >= len(marginal_pr_ks):
continue
# this path also doesn't contribute since it's not possible
# for example with round sizes 10,10 you can't get k2=15 if k1=2 since 15-2 = 13> 10=roundsize
curpr = pr_kprevs[kprev] * marginal_pr_ks[k2 - kprev] #nice indexes :)
#print('k2:',k2,'kprev:',kprev,'pr_kprevs[kprev]:',pr_kprevs[kprev])
#print('k2:',k2,'k2-kprev:',k2-kprev,'marginal_pr_ks[k2-kprev]:',marginal_pr_ks[k2-kprev])
pr_k2s[k2] += curpr
secondlastpr = lastpr
lastpr = curpr
if not(lastpr == -1 or secondlastpr == -1):
# add the number of items skipped times the approximate value for them each (the avg of the two, to get a linear approximation)
pr_k2s[k2] += kprevsparsity * ((lastpr + secondlastpr) / 2)
# so this is convolution-like but we are only adding the audit 'paths' which do in fact continue onto the next round
second_to_last_pr_k2 = last_pr_k2
second_to_last_k2 = last_k2
last_pr_k2 = pr_k2s[k2]
last_k2 = k2
if not(last_k2 == -1 or second_to_last_k2 == -1):
# fill in with an approximationg (average) between the second to last and last points found
pr_k2s[second_to_last_k2+1:last_k2] = (second_to_last_pr_k2 + last_pr_k2) / 2
""" prints for debugging
if k2 >= 10000:
print('filling in')
plt.subplot(411)
plt.plot(pr_k2s)#[9000:11000])
tit = 'absolute pr[k'+str(roundnum)+'and proceed to round '+str(roundnum+1)+']... sum:'+str(sum(pr_k2s))#,'max:'+str(max(pr_k2s))+'min:'+str(min(pr_k2s))
plt.title(tit)
plt.subplot(412)
plt.plot(pr_kprevs)
tit = 'kprevs... sum:',str(sum(pr_kprevs))#,'max:'+str(max(pr_kprevs))+'min:'+str(min(pr_kprevs))
plt.title(tit)
plt.subplot(413)
plt.plot(marginal_pr_ks)
tit = 'marginal ks... sum:',str(sum(marginal_pr_ks))#,'max:'+str(max(marginal_pr_ks))+'min:'+str(min(marginal_pr_ks))
plt.title(tit)
plt.subplot(414)
plt.plot(pr_k2s[9500:10500])
plt.title('pr_k2s[9500:10500]')
plt.show()
"""
""" prints an dplots for debugging
print('filling in')
plt.subplot(311)
plt.plot(pr_k2s)#[9000:11000])
tit = 'absolute pr[k'+str(roundnum)+'and proceed to round '+str(roundnum+1)+']... sum:'+str(sum(pr_k2s))#,'max:'+str(max(pr_k2s))+'min:'+str(min(pr_k2s))
plt.title(tit)
plt.subplot(312)
plt.plot(pr_kprevs)
tit = 'kprevs... sum:',str(sum(pr_kprevs))#,'max:'+str(max(pr_kprevs))+'min:'+str(min(pr_kprevs))
plt.title(tit)
plt.subplot(313)
plt.plot(marginal_pr_ks)
tit = 'marginal ks... sum:',str(sum(marginal_pr_ks))#,'max:'+str(max(marginal_pr_ks))+'min:'+str(min(marginal_pr_ks))
plt.title(tit)
plt.show()
print('sum of pdf over values of k in round',roundnum,':',sum(pr_k2s))
"""
# before going back thru this loop, we need to update kprevs, nprev, and pr_kprevs
kprevs = np.zeros(len(pr_k2s), dtype=int)
for i in range(len(kprevs)):
kprevs[i] = int(i)
nprev = n
pr_kprevs = pr_k2s