-
Notifications
You must be signed in to change notification settings - Fork 0
/
ZhuQiangang_Week8.py
269 lines (240 loc) · 9.98 KB
/
ZhuQiangang_Week8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# Assessed exercises for week 8 -qq plots
# It is often the case that we wish to decide which distribution is the best fit
# to a single variable. For example, we might want to see whether the residuals
# of a linear regression are approximately normally distributed. QQ-plots are
# one of the best ways to do this. They are often superior to drawing histograms
# as it's easier to assess whether the tails of the distribution fit.
# In this assessed exercise we're going to create some QQ-plots. The steps to create
# a qqplot to compare a chosen probability distribution with a single variable x are:
# 1. Calculate the empirical cdf (ecdf) of x
# 2. Simulate a large number of observations from the chosen probability distribution
# 3. Find the quantiles of the distribution at the probabilities defined by the ecdf
# If the two data sets match, a plot of the quantiles and the original data should
# fall on a straight line. For more detail, see e.g. http://onlinestatbook.com/2/advanced_graphs/q-q_plots.html
# In this exercises we will use four data sets which come from four unknown probability
# distributions. One of them comes from a N(0,1) distribution, another a t_5 distribution
# another a Exp(1) distribution, and finally a Chi-squared(1) distribution. The files
# are labelled qq1 to qq4.txt and are all of different lengths. We're going to use
# QQ-plots to find which data set matches to which probability distribution
#
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# First you will need to load in the data sets
path = '/path/to/files/'
qq1 = pd.read_csv(path + 'qq1.txt', header=None)
qq2 = pd.read_csv(path + 'qq2.txt', header=None)
qq3 = pd.read_csv(path + 'qq3.txt', header=None)
qq4 = pd.read_csv(path + 'qq4.txt', header=None)
# Q1 For the first part of the task, we need to create the empirical cumulative distribution
# function (ecdf). This is defined as:
# Number of observations z less than or equal to z_i / Number of observations, for every z_i in z
# Write a function called which takes a set of observations z and produces the empirical cdf
# If you are unfamiliar with empirical cdfs, you may want to read the following article:
# https://towardsdatascience.com/what-why-and-how-to-read-empirical-cdf-123e2b922480
def exercise1(z):
x = np.sort(z)
y = np.arange(1, len(z) + 1) / len(z)
return x, y
x1, y1 = exercise1(qq1)
x2, y2 = exercise1(qq2)
x3, y3 = exercise1(qq3)
x4, y4 = exercise1(qq4)
plt.subplot(2, 2, 1)
plt.plot(x1, y1, c='r')
plt.title('ECDF_qq1')
plt.xlabel('X')
plt.ylabel('ECDF')
plt.subplot(2, 2, 2)
plt.plot(x2, y2, c='b')
plt.title('ECDF_qq2')
plt.xlabel('X')
plt.ylabel('ECDF')
plt.subplot(2, 2, 3)
plt.plot(x3, y3, c='g')
plt.title('ECDF_qq3')
plt.xlabel('X')
plt.ylabel('ECDF')
plt.subplot(2, 2, 4)
plt.plot(x4, y4, c='y')
plt.title('ECDF_qq4')
plt.xlabel('X')
plt.ylabel('ECDF')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('ECDF.png', dpi=100)
plt.show()
# Plot each of the variables qq1, qq2, etc. versus their ecdf, as subplots in a single figure window.
# Save your figure and include it in your submission.
# Q2 For the next part we need to create the quantiles of a chosen probability distribution
# Write a function which takes an ecdf created by your function in Q2
# and simulates 10,000 observations from a normal(0,1) distribution. Then calculate
# the quantiles of these simulations at the probabilities defined by the ecdf
def exercise2(ecdf):
data = np.random.normal(0, 1, 10000)
quantiles = [np.quantile(data, ecdf[i]) for i in range(len(ecdf))]
return quantiles
quantile_qq1 = exercise2(exercise1(qq1)[1])
quantile_qq2 = exercise2(exercise1(qq2)[1])
quantile_qq3 = exercise2(exercise1(qq3)[1])
quantile_qq4 = exercise2(exercise1(qq4)[1])
plt.subplot(2, 2, 1)
plt.scatter(quantile_qq1, qq1, c='r')
plt.title('qq plot_qq1')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq1')
plt.plot(quantile_qq1, quantile_qq1, linestyle='-')
plt.subplot(2, 2, 2)
plt.scatter(quantile_qq2, qq2, c='b')
plt.title('qq plot_qq2')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq2')
plt.plot(quantile_qq2, quantile_qq2, linestyle='-')
plt.subplot(2, 2, 3)
plt.scatter(quantile_qq3, qq3, c='g')
plt.title('qq plot_qq3')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq3')
plt.plot(quantile_qq3, quantile_qq3, linestyle='-')
plt.subplot(2, 2, 4)
plt.scatter(quantile_qq4, qq4, c='y')
plt.title('qq plot_qq4')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq4')
plt.plot(quantile_qq4, quantile_qq4, linestyle='-')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('qq_plot.png', dpi=100)
plt.show()
# Create a scatter plot of the theoretical quantiles from your new function (x-axis) against qq1 (y-axis). Repeat
# this for each dataset, creating each plot as a subplot on the same figure. Save your figure and include it in your
# submission. If the two distributions match, the points should lie on a straight line - this is a QQ-plot. Which of
# the datasets is normally distributed variable?
# Ans: qq4
# Q3 Create a new function that takes two arguments. The first argument should be your data Series and the second
# argument should be a set of simulations from some probability distribution. It should use these samples to calculate
# the theoretical quantiles. This function should the computed theoretical quantiles
def exercise3(y, d):
ecdf = exercise1(y)[1]
quantiles = [np.quantile(d, ecdf[i]) for i in range(len(ecdf))]
return quantiles
# Q4 Run your function for each of the datasets, with
# - d = Series(npr.randn(10000)) (normal distribution)
# - d = Series(npr.exponential(1,10000)) (exponential distribution)
# - d = Series(npr.standard_t(5,10000)) (t_5 distribution)
# - d = Series(npr.chisquare(1,10000)) (chi-squared distribution)
# Plot empirical data versus the theoretical quantiles returned by exercise3 to determine which
# data set matches to which probability distribution
# Complete the quiz 'W8 - Assessed exercises Q4' to submit your answer for this question.
# random distribution
exercise3(qq1, Series(np.random.exponential(1, 10000)))
plt.subplot(2, 2, 1)
plt.scatter(exercise3(qq1, Series(np.random.randn(10000))), qq1, c='r')
plt.title('normal distribution_qq1')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq1')
plt.plot(qq1, qq1, linestyle='-')
plt.subplot(2, 2, 2)
plt.scatter(exercise3(qq2, Series(np.random.randn(10000))), qq2, c='b')
plt.title('normal distribution_qq2')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq2')
plt.plot(qq2, qq2, linestyle='-')
plt.subplot(2, 2, 3)
plt.scatter(exercise3(qq3, Series(np.random.randn(10000))), qq3, c='g')
plt.title('normal distribution_qq3')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq3')
plt.plot(qq3, qq3, linestyle='-')
plt.subplot(2, 2, 4)
plt.scatter(exercise3(qq4, Series(np.random.randn(10000))), qq4, c='y')
plt.title('normal distribution_qq4')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq4')
plt.plot(qq4, qq4, linestyle='-')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('normal.png', dpi=100)
plt.show()
# exponential distribution
plt.subplot(2, 2, 1)
plt.scatter(exercise3(qq1, Series(np.random.exponential(1, 10000))), qq1, c='r')
plt.title('exponential distribution_qq1')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq1')
plt.plot(qq1, qq1, linestyle='-')
plt.subplot(2, 2, 2)
plt.scatter(exercise3(qq2, Series(np.random.exponential(1, 10000))), qq2, c='b')
plt.title('exponential distribution_qq2')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq2')
plt.plot(qq2, qq2, linestyle='-')
plt.subplot(2, 2, 3)
plt.scatter(exercise3(qq3, Series(np.random.exponential(1, 10000))), qq3, c='g')
plt.title('exponential distribution_qq3')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq3')
plt.plot(qq3, qq3, linestyle='-')
plt.subplot(2, 2, 4)
plt.scatter(exercise3(qq4, Series(np.random.exponential(1, 10000))), qq4, c='y')
plt.title('exponential distribution_qq4')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq4')
plt.plot(qq4, qq4, linestyle='-')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('exponential.png', dpi=100)
plt.show()
# t_5 distribution
plt.subplot(2, 2, 1)
plt.scatter(exercise3(qq1, Series(np.random.standard_t(5, 10000))), qq1, c='r')
plt.title('t_5 distribution_qq1')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq1')
plt.plot(qq1, qq1, linestyle='-')
plt.subplot(2, 2, 2)
plt.scatter(exercise3(qq2, Series(np.random.standard_t(5, 10000))), qq2, c='b')
plt.title('t_5 distribution_qq2')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq2')
plt.plot(qq2, qq2, linestyle='-')
plt.subplot(2, 2, 3)
plt.scatter(exercise3(qq3, Series(np.random.standard_t(5, 10000))), qq3, c='g')
plt.title('t_5 distribution_qq3')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq3')
plt.plot(qq3, qq3, linestyle='-')
plt.subplot(2, 2, 4)
plt.scatter(exercise3(qq4, Series(np.random.standard_t(5, 10000))), qq4, c='y')
plt.title('t_5 distribution_qq4')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq4')
plt.plot(qq4, qq4, linestyle='-')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('t_5.png', dpi=100)
plt.show()
# chi-squared distribution
plt.subplot(2, 2, 1)
plt.scatter(exercise3(qq1, Series(np.random.chisquare(1, 10000))), qq1, c='r')
plt.title('chisquare distribution_qq1')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq1')
plt.plot(qq1, qq1, linestyle='-')
plt.subplot(2, 2, 2)
plt.scatter(exercise3(qq2, Series(np.random.chisquare(1, 10000))), qq2, c='b')
plt.title('chisquare distribution_qq2')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq2')
plt.plot(qq2, qq2, linestyle='-')
plt.subplot(2, 2, 3)
plt.scatter(exercise3(qq3, Series(np.random.chisquare(1, 10000))), qq3, c='g')
plt.title('chisquare distribution_qq3')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq3')
plt.plot(qq3, qq3, linestyle='-')
plt.subplot(2, 2, 4)
plt.scatter(exercise3(qq4, Series(np.random.chisquare(1, 10000))), qq4, c='y')
plt.title('chisquare distribution_qq4')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('qq4')
plt.plot(qq4, qq4, linestyle='-')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('chisquare.png', dpi=100)
plt.show()