-
Notifications
You must be signed in to change notification settings - Fork 0
/
countryBiasESC.jl
267 lines (225 loc) · 9.96 KB
/
countryBiasESC.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#GOALS
#=
1 Get the range of the dataset: dataCountryYearsNum() -> #looks at the whole dataset of score files and reads the header to return a dictionary for the year->countrynumber, min year, max year
2 Check the parameters: paramCheck() -> all the variable values supplied are valid
3 Simulate the null (unbiased) score distribution: scoreSimDist -> Given the full range of years and interval/windowSize return the low2high score accumulation for each interval in a dictionary: windowDist[string(yr,"-",yr+windowSize)]
4 using the scoreSim dictionary of ranked list of sampled scores, return the threshold for each window as a dictionary -> windowConf[string(yr,"-",yr+windowSize)] = confalpha
5
=#
#need GraphViz to visualize the results
#using GraphViz
using Statistics
#csvDir2AdjList.jl is the file with functions to call upon the real data to get the aggregates
include("csvDir2AdjList.jl")
#include("biasGraphView.jl")
#include("biasAnalysis.jl")
#>>MAIN<<
# Entry point of the analysis.
#
# Runs the whole pipeline for the years startYr..endYr split into windows of
# windowSize years, at significance level alpha:
#   1. read the year -> number-of-countries table and the covered year range,
#   2. validate the parameters against that range,
#   3. simulate the null score distribution of every window,
#   4. derive the upper- and lower-tail thresholds from those distributions,
#   5. build the per-window aggregate dictionaries for each tail.
#
# Returns a two-element vector [upperDict, lowerDict]; each dictionary is also
# tagged with the keys "alpha", "windowSize" and "side".
#
# windowsDictThresholdsAdjList and dictTotalThresholdsAdjListWindowCount are not
# defined in this file (presumably they come from csvDir2AdjList.jl - confirm);
# per the original notes they aggregate the real CSV scores per window and then
# add the per-window threshold-surpassing counts.
function biasesESC(startYr = 1980, endYr = 1990, windowSize = 5, alpha = 0.05)
    # year -> country count, plus the first and last year present in the data
    countryYearsNum, yrMin, yrMax = dataCountryYearsNum()
    paramCheck(startYr, endYr, windowSize, yrMin, yrMax)

    # sorted (low to high) simulated averages, keyed by "start-end" window label
    windowDist = scoreSimDist(startYr, endYr, windowSize, countryYearsNum)

    # --- upper tail ---
    upperThresholds = windowConfValues(startYr, endYr, windowSize, windowDist, "upper", alpha)
    println("finished windowConf")
    winAggDictUpper = windowsDictThresholdsAdjList(upperThresholds, startYr, endYr, windowSize)
    println(winAggDictUpper)
    println("finished Agg Dict Upper")
    winAggDictUpper = dictTotalThresholdsAdjListWindowCount(
        winAggDictUpper, startYr, endYr, windowSize
    )
    println("finished upper")

    # --- lower tail: same steps, without the intermediate progress output ---
    lowerThresholds = windowConfValues(startYr, endYr, windowSize, windowDist, "lower", alpha)
    winAggDictLower = windowsDictThresholdsAdjList(lowerThresholds, startYr, endYr, windowSize)
    winAggDictLower = dictTotalThresholdsAdjListWindowCount(
        winAggDictLower, startYr, endYr, windowSize
    )

    # record the run parameters alongside the results
    for (aggDict, sideLabel) in ((winAggDictUpper, "Upper"), (winAggDictLower, "Lower"))
        aggDict["alpha"] = alpha
        aggDict["windowSize"] = windowSize
        aggDict["side"] = sideLabel
    end
    println("finished lower")

    return [winAggDictUpper, winAggDictLower]
end
# Pick the significance threshold of every year window from its simulated
# null distribution.
#
# windowDist maps a window label string(yr,"-",yr+windowSize) to a list of
# simulated averages sorted low to high. For tailSide "upper" (or "right") the
# value at rank floor((1 - alpha) * n) is taken, for any other tailSide the
# value at rank floor(alpha * n); the rank is never allowed below 1.
#
# Returns a dictionary holding one threshold per window label plus the key
# "tailSide" ("upper" or "lower").
function windowConfValues(startYr, endYr, windowSize, windowDist, tailSide, alpha)
    isUpperTail = tailSide in ("upper", "right")
    quantileLevel = isUpperTail ? 1 - alpha : alpha

    thresholds = Dict()
    thresholds["tailSide"] = isUpperTail ? "upper" : "lower"

    # windows are consecutive: each one starts where the previous one ended
    windowStart = startYr
    windowEnd = windowStart + windowSize
    while windowEnd <= endYr
        label = string(windowStart, "-", windowEnd)
        ranked = windowDist[label]
        cutIndex = max(1, floor(Int, quantileLevel * length(ranked)))
        thresholds[label] = ranked[cutIndex]
        windowStart = windowEnd
        windowEnd = windowStart + windowSize
    end
    return thresholds
end
# Build the simulated null distribution of every year window.
#
# Starting at startYr, consecutive windows of windowSize years are generated
# for as long as the window end does not pass endYr. Each window is simulated
# with scoreSim and stored under the label string(yr,"-",yr+windowSize).
#
# Returns the dictionary label -> simulated distribution (as returned by scoreSim).
function scoreSimDist(startYr, endYr, windowSize, countryYearsNum)
    nullByWindow = Dict()
    windowStart = startYr
    windowEnd = windowStart + windowSize
    while windowEnd <= endYr
        # every window gets its own distribution because the voting scheme
        # depends on the years it covers
        label = string(windowStart, "-", windowEnd)
        nullByWindow[label] = scoreSim(windowStart, windowEnd, countryYearsNum)
        windowStart = windowEnd
        windowEnd = windowStart + windowSize
    end
    return nullByWindow
end
# Simulate the null distribution of the average score over one year window.
#
# Arguments
#   startYr, endYr   - first and last year of the window (both included)
#   countryYearsNum  - lookup year -> number of countries in that year; every
#                      year of the window must be present, otherwise the lookup
#                      throws a KeyError
#   iterNum          - keyword, number of simulation runs (default 3000, the
#                      value that used to be hard-coded)
#
# For every run one random score is drawn per year with the scheme selected by
# the year (Allocated, Sequential or Rated) and the scores of the run are
# averaged.
#
# Returns the iterNum averages as a Vector{Float64} sorted low to high.
function scoreSim(startYr, endYr, countryYearsNum; iterNum = 3000)
    # concretely typed accumulators instead of untyped `[]` (Vector{Any})
    simulatedMeans = Float64[]
    for _ in 1:iterNum
        yearScores = Int[]
        for yr in startYr:endYr
            NUM = countryYearsNum[yr]
            if yr >= 1975 || yr == 1963 || yr == 1962
                score = Allocated(yr, NUM)
            elseif (1964 <= yr <= 1966) || yr == 1974 || (1967 <= yr <= 1970) || (1957 <= yr <= 1961)
                score = Sequential(yr, NUM)
            elseif 1971 <= yr <= 1973
                score = Rated(yr, NUM)
            else
                # any other year falls back to the default allocated scheme
                score = Allocated(-1, NUM)
            end
            # append! accepts a scalar as well as a one-element array
            append!(yearScores, score)
        end
        push!(simulatedMeans, mean(yearScores))
    end
    return sort!(simulatedMeans)  # low to high
end
# Sequential voting scheme: every point value on offer is handed out in its
# own independent draw, and the country collects a value whenever the draw
# lands on position 1 out of NUM equally likely positions.
#
#   1964-1966                        -> values 5, 3, 1
#   1957-1961, 1967-1970 and 1974    -> ten values of 1
#   any other year                   -> nothing on offer, the score is 0
#
# Returns the sum of the collected values.
function Sequential(yr, NUM)
    pointsOnOffer = if 1964 <= yr <= 1966
        [5, 3, 1]
    elseif yr == 1974 || (1967 <= yr <= 1970) || (1957 <= yr <= 1961)
        ones(Int, 1, 10)
    else
        Int[]
    end

    total = 0
    for pts in pointsOnOffer
        # one fresh draw per point value
        drawn = ceil.(rand(1, 1) * NUM)
        if Int(drawn[1]) == 1
            total += pts
        end
    end
    return total
end
# Allocated voting scheme: the country lands on one of NUM equally likely
# positions and receives the score attached to that position, or 0 when the
# position lies beyond the end of the score table.
#
#   yr == 1962      -> scores 3, 2, 1
#   yr == 1963      -> scores 5, 4, 3, 2, 1
#   any other year  -> scores 12, 10, 8, 7, 6, 5, 4, 3, 2, 1
#
# Arguments
#   yr   - contest year (only 1962 and 1963 select a special table)
#   NUM  - number of positions to draw from; must be an integer value >= 1
#
# Always returns a plain integer. The position is drawn with rand(1:NUM)
# rather than ceil(rand() * NUM): the latter yields position 0 when rand()
# returns exactly 0.0, which indexes outside the score table. Indexing with a
# scalar also keeps the result a scalar instead of a 1x1 matrix.
#
# Throws ArgumentError when NUM is smaller than 1.
function Allocated(yr, NUM)
    NUM >= 1 || throw(ArgumentError("NUM must be at least 1, got $NUM"))

    scoreTable = if yr == 1962
        (3, 2, 1)
    elseif yr == 1963
        (5, 4, 3, 2, 1)
    else
        (12, 10, 8, 7, 6, 5, 4, 3, 2, 1)
    end

    position = rand(1:Int(NUM))
    return position <= length(scoreTable) ? scoreTable[position] : 0
end
# Rated voting scheme, only defined for 1971-1973: two values are drawn
# independently and uniformly from 5, 4, 3, 2, 1 and added up.
#
# Returns the sum (2 to 10), or -1 for any year outside 1971-1973.
# NUM is accepted for signature parity with the other schemes and is not used.
function Rated(yr, NUM)
    if !(1971 <= yr <= 1973)
        return -1
    end
    ratings = [5, 4, 3, 2, 1]
    firstDraw = ratings[rand(1:length(ratings))]
    secondDraw = ratings[rand(1:length(ratings))]
    return firstDraw + secondDraw
end
# Scan the score tables in ./dataTables/ and report which years are covered
# and how many countries each year has.
#
# Every file is expected to be named "<year>.<extension>" and to start with a
# comma-separated header line; the number of countries is the number of header
# fields minus one.
#
# Returns (countryYearsNum, yrMin, yrMax):
#   countryYearsNum - dictionary year -> number of countries
#   yrMin, yrMax    - smallest and largest year found; they keep their initial
#                     values 100000 and -1 when no score table is found
#
# Entries that are not regular files, or whose name does not start with an
# integer year, are skipped instead of aborting the scan. Only the header line
# of each file is read, and the file is closed even when an error occurs.
#
# Throws an ErrorException when a score table is completely empty.
function dataCountryYearsNum()
    dataDir = "./dataTables/"
    # key/value types kept as in the original so the returned type is unchanged
    countryYearsNum = Dict{Integer,Integer}()
    yrMin = 100000
    yrMax = -1
    for rf in readdir(dataDir)
        path = string(dataDir, rf)
        isfile(path) || continue

        # the part of the file name before the first dot is the year
        yrTmp = tryparse(Int, first(split(rf, ".")))
        yrTmp === nothing && continue

        header = open(path) do io
            eof(io) && error("score table $(path) is empty; expected a comma-separated header line")
            readline(io)
        end

        countryYearsNum[yrTmp] = length(split(header, ",")) - 1
        yrMin = min(yrMin, yrTmp)
        yrMax = max(yrMax, yrTmp)
    end
    return countryYearsNum, yrMin, yrMax
end
# Validate the analysis parameters against the year range covered by the data.
#
# Arguments
#   startYr, endYr - first and last year of the requested analysis
#   windowSize     - length in years of one analysis window
#   yrMin, yrMax   - smallest and largest year available in the data
#
# Returns nothing when every check passes.
#
# Throws ArgumentError describing the first violated rule:
#   - startYr must be before endYr,
#   - endYr must be more than 4 years after startYr,
#   - startYr..endYr must lie inside yrMin..yrMax,
#   - windowSize must be positive (a non-positive size would make the window
#     loops never terminate),
#   - at least one full window must fit between startYr and endYr.
#
# An exception is raised instead of calling quit(), which is not available in
# Julia 1.x and would only surface as an UndefVarError.
function paramCheck(startYr, endYr, windowSize, yrMin, yrMax)
    if startYr >= endYr
        throw(ArgumentError("the start year needs to be before the end year"))
    end
    if (startYr + 4) >= endYr
        throw(ArgumentError("the start year needs to be more than 4 years prior to the end year"))
    end
    # sanity check input years against the data that is actually available
    if startYr < yrMin || endYr > yrMax
        throw(ArgumentError(string("year range improperly set, for the analysis end year must be greater than start and the smallest year is $(yrMin) and largest $(yrMax) with smallest first")))
    end
    if windowSize <= 0
        throw(ArgumentError("the window size needs to be a positive number of years"))
    end
    if (startYr + windowSize) > endYr
        throw(ArgumentError("not enough years between start and end for analysis due to window size"))
    end
    return nothing
end