# computed with DiffOpt, we can perform a gradient descent on top of the inner model
# to minimize the test loss.
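# Concretely, we iterate α ← α - η ∂ℓ_test/∂α, where η is a fixed step size and the
# gradient of the test loss ℓ_test with respect to α is obtained by differentiating
# through the ridge solution.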

- using DiffOpt
- using Statistics
- using OSQP
- using JuMP
- using Plots
- import Random
- using LinearAlgebra
-
+ # This tutorial uses the following packages

- """
-     R2(y_true, y_pred)
-
- Return the coefficient of determination R2 of the prediction in `[0,1]`.
- """
- function R2(y_true, y_pred)
-     u = norm(y_pred - y_true)^2          # Regression sum of squares
-     v = norm(y_true .- mean(y_true))^2   # Total sum of squares
-     return 1 - u / v
- end
+ using JuMP            # The mathematical programming modelling language
+ import DiffOpt        # JuMP extension for differentiable optimization
+ import OSQP           # Optimization solver that handles quadratic programs
+ import Plots          # Graphing tool
+ import LinearAlgebra: norm, dot
+ import Random

# ## Generating a noisy regression dataset

- function create_problem(N, D, noise)
-     w = 10 * randn(D)
-     X = 10 * randn(N, D)
-     y = X * w + noise * randn(N)
-     l = N ÷ 2  # train/test split
-     return (X[1:l, :], X[l+1:N, :], y[1:l], y[l+1:N])
- end
-
Random.seed!(42)

- X_train, X_test, y_train, y_test = create_problem(1000, 200, 50);
+ N = 100
+ D = 20
+ noise = 5
+
+ w_real = 10 * randn(D)
+ X = 10 * randn(N, D)
+ y = X * w_real + noise * randn(N)
+ l = N ÷ 2  # train/test split
+
+ X_train = X[1:l, :]
+ X_test = X[l+1:N, :]
+ y_train = y[1:l]
+ y_test = y[l+1:N];
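
# A quick illustrative sanity check: both halves of the split should contain l = 50 rows.
@assert size(X_train, 1) == l && size(X_test, 1) == N - l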

# ## Defining the regression problem

# We implement the regularized regression problem as a function taking the model and
# the problem data, building the JuMP objective, and solving it.
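# Written out, the objective solved below is
# `min_w  ‖X w - y‖² / (2 N D) + α ‖w‖² / (2 D)`,
# a mean squared error term plus an ℓ2 penalty whose weight α we will tune.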

- function fit_ridge(X, y, α, model = Model(() -> diff_optimizer(OSQP.Optimizer)))
+ function fit_ridge(model, X, y, α)
    JuMP.empty!(model)
-     set_optimizer_attribute(model, MOI.Silent(), true)
+     set_silent(model)
    N, D = size(X)
    @variable(model, w[1:D])
    err_term = X * w - y
    @objective(
        model,
        Min,
-         1 / (2 * N * D) * dot(err_term, err_term) + 1 / (2 * D) * α * dot(w, w),
+         dot(err_term, err_term) / (2 * N * D) + α * dot(w, w) / (2 * D),
    )
    optimize!(model)
-     if termination_status(model) != MOI.OPTIMAL
-         error("Unexpected status: $(termination_status(model))")
-     end
-     regularized_loss_value = objective_value(model)
-     training_loss_value = 1 / (2 * N * D) * JuMP.value(dot(err_term, err_term))
-     return model, w, value.(w), training_loss_value, regularized_loss_value
+     @assert termination_status(model) == MOI.OPTIMAL
+     return w
end
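
# Note that `fit_ridge` returns the vector of JuMP variables `w` rather than only its
# value: the derivative computations below need the variable references to query DiffOpt.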

# We can solve the problem for several values of α
# to visualize the effect of regularization on the testing and training loss.

- αs = 0.0:0.01:0.5
- Rs = Float64[]
+ αs = 0.05:0.01:0.35
mse_test = Float64[]
mse_train = Float64[]
- model = JuMP.Model(() -> diff_optimizer(OSQP.Optimizer))
+ model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
(Ntest, D) = size(X_test)
+ (Ntrain, D) = size(X_train)
for α in αs
-     _, _, w_train, training_loss_value, _ = fit_ridge(X_train, y_train, α, model)
-     y_pred = X_test * w_train
-     push!(Rs, R2(y_test, y_pred))
-     push!(mse_test, dot(y_pred - y_test, y_pred - y_test) / (2 * Ntest * D))
-     push!(mse_train, training_loss_value)
+     w = fit_ridge(model, X_train, y_train, α)
+     ŵ = value.(w)
+     ŷ_test = X_test * ŵ
+     ŷ_train = X_train * ŵ
+     push!(mse_test, norm(ŷ_test - y_test)^2 / (2 * Ntest * D))
+     push!(mse_train, norm(ŷ_train - y_train)^2 / (2 * Ntrain * D))
end
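
# As a rough reference point, the grid value with the lowest test error; the gradient
# descent further below should land in the same neighborhood.
αs[argmin(mse_test)]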

- # Visualize the R2 correlation metric
-
- plot(αs, Rs, label = nothing, xaxis = "α", yaxis = "R2")
- title!("Test coefficient of determination R2")
-
# Visualize the Mean Squared Error metric

- plot(αs, mse_test ./ sum(mse_test), label = "MSE test", xaxis = "α", yaxis = "MSE", legend = (0.8, 0.2))
- plot!(αs, mse_train ./ sum(mse_train), label = "MSE train")
- title!("Normalized MSE on training and testing sets")
+ Plots.plot(
+     αs, mse_test ./ sum(mse_test),
+     label = "MSE test", xaxis = "α", yaxis = "MSE", legend = (0.8, 0.2)
+ )
+ Plots.plot!(
+     αs, mse_train ./ sum(mse_train),
+     label = "MSE train"
+ )
+ Plots.title!("Normalized MSE on training and testing sets")

# ## Leveraging differentiable optimization: computing the derivative of the solution

# Using DiffOpt, we can compute `∂w_i/∂α`, the derivative of the learned solution `ŵ`
# w.r.t. the regularization parameter.
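# Since α enters the problem only through the objective, we pass the objective's
# derivative with respect to α, here `‖w‖² / (2 D)`, to DiffOpt's forward mode and
# read back the induced change in the optimal `w`.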

- function compute_dw_dparam(model, w)
+ function compute_dw_dα(model, w)
    D = length(w)
    dw_dα = zeros(D)
    MOI.set(
        model,
        DiffOpt.ForwardInObjective(),
-         1 / 2 * dot(w, w) / D,
+         dot(w, w) / (2 * D),
    )
    DiffOpt.forward(model)
    for i in 1:D
@@ -133,13 +124,13 @@ function compute_dw_dparam(model, w)
    return dw_dα
end

- # Using `∂w_i/∂α` computed with `compute_dw_dparam`,
+ # Using `∂w_i/∂α` computed with `compute_dw_dα`,
# we can compute the derivative of the test loss w.r.t. the parameter α
# by composing derivatives.
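# With test residuals `e_i = x_iᵀ ŵ - y_i`, the chain rule gives
# `∂ℓ_test/∂α = Σ_i e_i (x_iᵀ ∂w/∂α)` up to the loss normalization constant,
# which is what `d_testloss_dα` accumulates below.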

function d_testloss_dα(model, X_test, y_test, w, ŵ)
    N, D = size(X_test)
-     dw_dα = compute_dw_dparam(model, w)
+     dw_dα = compute_dw_dα(model, w)
    err_term = X_test * ŵ - y_test
    return sum(eachindex(err_term)) do i
        dot(X_test[i, :], dw_dα) * err_term[i]
@@ -151,31 +142,34 @@ end

function descent(α0, max_iters = 100; fixed_step = 0.01, grad_tol = 1e-3)
    α_s = Float64[]
-     test_loss_values = Float64[]
+     test_loss = Float64[]
    α = α0
-     model = JuMP.Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
+     N, D = size(X_test)
+     model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
    for iter in 1:max_iters
-         push!(α_s, α)
-         _, w, ŵ, _, = fit_ridge(X_train, y_train, α, model)
+         w = fit_ridge(model, X_train, y_train, α)
+         ŵ = value.(w)
        err_term = X_test * ŵ - y_test
-         test_loss = norm(err_term)^2 / (2 * length(X_test))
-         push!(
-             test_loss_values,
-             test_loss,
-         )
+         push!(α_s, α)
+         push!(test_loss, norm(err_term)^2 / (2 * N * D))
        ∂α = d_testloss_dα(model, X_test, y_test, w, ŵ)
        α -= fixed_step * ∂α
        if abs(∂α) ≤ grad_tol
            break
        end
    end
-     return α_s, test_loss_values
+     return α_s, test_loss
end

- ᾱ, msē = descent(0.1, 500);
+ ᾱ_l, msē_l = descent(0.10, 500);
+ ᾱ_r, msē_r = descent(0.33, 500);
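
# A quick illustrative check: the last recorded α of each run, i.e. the learned regularizer.
ᾱ_l[end], ᾱ_r[end]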

# Visualize gradient descent and convergence

- plot(αs, mse_test, label = "MSE test", xaxis = ("α"), legend = :topleft)
- plot!(ᾱ, msē, label = "learned α", lw = 2)
- title!("Regularizer learning")
+ Plots.plot(
+     αs, mse_test,
+     label = "MSE test", xaxis = ("α"), legend = :topleft
+ )
+ Plots.plot!(ᾱ_l, msē_l, label = "learned α, start = 0.10", lw = 2)
+ Plots.plot!(ᾱ_r, msē_r, label = "learned α, start = 0.33", lw = 2)
+ Plots.title!("Regularizer learning")