
More examples #95


Merged: 28 commits on Dec 6, 2021

Commits
e5f59b9
add examples
akshay326 May 22, 2021
c219514
dont compile flux code
akshay326 May 23, 2021
d68deb6
remove imgs; fix camelcase
akshay326 Jun 15, 2021
9a85c0c
Update docs/src/autotuning-ridge.md
matbesancon Jun 16, 2021
2340c8c
Update docs/src/sensitivity-analysis-ridge.md
matbesancon Jun 16, 2021
81f2176
fix missing packages
akshay326 Jun 23, 2021
a626a73
Merge branch 'master' into more_examples
akshay326 Jun 26, 2021
8013b48
changes in doc
akshay326 Jun 26, 2021
644e688
add missing files
akshay326 Jun 26, 2021
0230bdf
fix typo
akshay326 Jun 26, 2021
9703d15
Merge branch 'more_examples' of github.com:jump-dev/DiffOpt.jl into m…
matbesancon Jul 1, 2021
23ea11b
Update docs/src/custom-relu.md
matbesancon Jul 4, 2021
cee668a
fix typo
akshay326 Jul 13, 2021
1d56f98
fix typo
akshay326 Jul 13, 2021
5ddee11
minor namechange
akshay326 Jul 14, 2021
cc661c1
Merge branch 'master' into more_examples
akshay326 Jul 14, 2021
2a08db4
fix examples; add them to test
akshay326 Jul 14, 2021
3104919
add missing dependency
akshay326 Jul 14, 2021
2695424
update docs
akshay326 Jul 14, 2021
10f4ac7
add missing dependencies
akshay326 Jul 14, 2021
7d1e3f5
minor changes
akshay326 Jul 14, 2021
c7c35f5
update Project.toml
akshay326 Jul 14, 2021
ca8ff66
add missing dependencies
akshay326 Jul 14, 2021
aef6ac0
fetch data from http
akshay326 Jul 14, 2021
928ad9d
missing pkg in docs
akshay326 Jul 14, 2021
8a19df4
minor fix for julia 1.0
akshay326 Jul 14, 2021
e14babb
Update sensitivity-analysis-svm.md
akshay326 Jul 15, 2021
40adc7f
Update sensitivity-SVM.jl
akshay326 Jul 15, 2021
9 changes: 7 additions & 2 deletions Project.toml
@@ -26,14 +26,19 @@ MathOptSetDistances = "0.1"
julia = "1"

[extras]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6"
Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "JuMP"]
test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "CSV", "Statistics", "DataFrames", "Flux", "Printf", "HTTP"]
5 changes: 3 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# DiffOpt.jl

[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://jump.dev/DiffOpt.jl/stable)
<!-- [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://jump.dev/DiffOpt.jl/stable) -->
[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://jump.dev/DiffOpt.jl/dev)
[![Build Status](https://github.com/jump-dev/DiffOpt.jl/workflows/CI/badge.svg?branch=master)](https://github.com/jump-dev/DiffOpt.jl/actions?query=workflow%3ACI)
[![Coverage](https://codecov.io/gh/jump-dev/DiffOpt.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/jump-dev/DiffOpt.jl)
@@ -65,7 +65,8 @@ then one can compute gradients by providing perturbations
grads = backward(diff, dA, db, dc)
```

<!--
## Note

- This is a [NumFOCUS Google Summer of Code (2020) project](https://summerofcode.withgoogle.com/organizations/4727917315096576/?sp-page=2#5232064888045568)
- Benchmarking with CVXPY or QPTH: Refer relevant examples as in [test/MOI_wrapper.jl](https://github.com/jump-dev/DiffOpt.jl/blob/master/test/MOI_wrapper.jl#L130)
- Benchmarking with CVXPY or QPTH: Refer relevant examples as in [test/MOI_wrapper.jl](https://github.com/jump-dev/DiffOpt.jl/blob/master/test/MOI_wrapper.jl#L130) -->
9 changes: 9 additions & 0 deletions docs/Project.toml
@@ -2,7 +2,16 @@
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
MatrixOptInterface = "2f4eb8e6-3e35-4ae4-8c7a-f3d7d9bf20ed"
OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Dualization = "191a621a-6537-11e9-281d-650236a99e60"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
14 changes: 10 additions & 4 deletions docs/make.jl
@@ -15,13 +15,19 @@ makedocs(;
"Usage" => "usage.md",
"Reference" => "reference.md",
"Examples" => [
"Solving an LP" => "solve-LP.md",
"Solving a QP" => "solve-QP.md",
"Solving conic with PSD and SOC constraints" => "solve-conic-1.md",
"Differentiating a simple QP by hand" => "matrix-inversion-manual.md",
"Sensitivity Analysis SVM" => "sensitivity-analysis-svm.md",
"Sensitivity Analysis" => [
"SVM" => "sensitivity-analysis-svm.md",
"Ridge Regression" => "sensitivity-analysis-ridge.md",
],
"Hyper-parameter optimization" => "autotuning-ridge.md",
"Custom Neural Network Layers" => [
"ReLU Layer" => "custom-relu.md",
"SVM as a Layer" => "custom-svm.md",
],
"ChainRules integration" => "chainrules_unit.md",
],
]
],
strict = true, # See https://github.com/JuliaOpt/JuMP.jl/issues/1576
repo="https://github.com/jump-dev/DiffOpt.jl",
226 changes: 226 additions & 0 deletions docs/src/autotuning-ridge.md
@@ -0,0 +1,226 @@
# Auto-tuning Hyperparameters

This example shows how to learn the regularization hyperparameter in Ridge Regression using a gradient descent routine. Let the problem be modelled as

```math
\begin{equation}
\min_{w} \quad \frac{1}{2n} \sum_{i=1}^{n} (y_{i} - w^T x_{i})^2 + \alpha \| w \|_2^2
\end{equation}
```

where
- `x`, `y` are the data points,
- `w` is the weight vector of the regression,
- `α` is the only hyperparameter: the regularization constant.
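
As a quick sanity check (an illustration only, not used in the rest of this example), the unconstrained version of this problem has a closed-form solution obtained by setting the gradient to zero. The hypothetical helper below computes it, ignoring the bound `w >= -10` that the JuMP model adds later.

```julia
using LinearAlgebra

# Hypothetical cross-check, not part of the DiffOpt example:
# for the unconstrained objective (1/2N)‖Y - Xw‖² + α‖w‖²,
# stationarity gives (X'X + 2αN·I) w = X'Y.
closed_form_ridge(X, Y, α) = (X'X + 2α * size(X, 1) * I) \ (X'Y)
```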



```@example 3
using DiffOpt
using Statistics
using OSQP
using JuMP
using Plots
import Random
using LinearAlgebra
nothing # hide
```


```@example 3
"""
Return the coefficient of determination R2 of the prediction.
Best possible score is 1.0, it can be negative because the model can be arbitrarily worse
"""
function R2(y_true, y_pred)
u = sum((y_pred - y_true).^2) # Regression sum of squares
v = sum((y_true .- mean(y_true)).^2) # Total sum of squares

return 1-(u/v)
end

function create_problem(N, D, noise)
w = rand(D)
X = rand(N, D)

# if noise=0, then there is no need of regularization
# and alpha=0 wi ll give the best R2 pred score
Y = X*w .+ noise*randn(N)

# test train split
l = N ÷ 2
return X[1:l, :], X[l+1:N, :], Y[1:l], Y[l+1:N]
end

X_train, X_test, Y_train, Y_test = create_problem(800, 30, 4);
```
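
For reference, the `R2` helper above implements the usual coefficient of determination,

```math
R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2},
```

where ``\hat{y}`` are the predictions and ``\bar{y}`` is the mean of the observed values.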


```@example 3
function fitRidge(X, Y, α)
    model = Model(() -> diff_optimizer(OSQP.Optimizer))

    N, D = size(X)

    # add variables
    @variable(model, w[1:D] >= -10)
    set_optimizer_attribute(model, MOI.Silent(), true)

    @objective(
        model,
        Min,
        sum((Y - X*w).*(Y - X*w))/(2.0*N) + α*(w'w),
    )

    optimize!(model)

    custom_loss = objective_value(model)
    return model, w, custom_loss, value.(w)
end
nothing # hide
```



```@example 3
αs = [0.0, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 7e-2, 2e-1, 3e-1, .5, .7, 1.0]
Rs = Float64[]
mse = Float64[]

for α in αs
    _, _, _, w_train = fitRidge(X_train, Y_train, α)
    Y_pred = X_test*w_train
    push!(Rs, R2(Y_test, Y_pred))
    push!(mse, sum((Y_pred - Y_test).^2))
end
nothing # hide
```


```@example 3
plot(log.(αs), Rs*10, label="R2 prediction score (×10)", xaxis = ("log(α)"))
```




```@example 3
plot(log.(αs), mse, label="MSE", xaxis = ("log(α)"))
```




# Plotting ∂l/∂α
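
The quantity assembled below combines the explicit dependence of the loss on `α` with the sensitivity of the optimal weights. A sketch of the underlying idea is the chain rule

```math
\frac{\mathrm{d} l}{\mathrm{d} \alpha}
= \frac{\partial l}{\partial \alpha}
+ \left( \frac{\partial l}{\partial w} \right)^{\top} \frac{\partial w}{\partial \alpha},
```

where ``\partial w / \partial \alpha`` is obtained from DiffOpt's forward differentiation.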


```@example 3
function ∇model(model, X_train, w, ŵ, α)
    N, D = size(X_train)
    dw = zeros(D)
    ∂w_∂α = zeros(D)

    for i in 1:D
        dw[i] = 1.0  # set the perturbation direction

        # perturbation of the objective with respect to α (the w[i]² term)
        MOI.set(
            model,
            DiffOpt.ForwardInObjective(),
            MOI.ScalarQuadraticFunction(
                [MOI.ScalarAffineTerm(0.0, w[i].index)],
                [MOI.ScalarQuadraticTerm(dw[i]*α, w[i].index, w[i].index)],
                0.0
            )
        )

        DiffOpt.forward(model)  # compute the forward-mode sensitivities

        ∂w_∂α[i] = MOI.get(
            model,
            DiffOpt.ForwardOutVariablePrimal(),
            w[i]
        )

        dw[i] = 0.0  # unset the perturbation direction
    end
    return sqrt(ŵ'ŵ) + 2α*(ŵ'∂w_∂α) - sum((X_train*∂w_∂α).*(Y_train - X_train*ŵ))/(2*N)
end
nothing # hide
```



```@example 3
∂l_∂αs = Float64[]
N, D = size(X_train)

for α in αs
    model, w, _, ŵ = fitRidge(X_train, Y_train, α)

    # testing optimality
    ∂l_∂w = [2*α*ŵ[i] - sum(X_train[:,i].*(Y_train - X_train*ŵ))/N for i in 1:D]
    @assert norm(∂l_∂w) < 1e-1

    push!(
        ∂l_∂αs,
        ∇model(model, X_train, w, ŵ, α)
    )
end

plot(αs, ∂l_∂αs, label="∂l/∂α", xaxis = ("α"))
```




# Gradient Descent
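
The routine below takes simple first-order steps on `α` using the hyperparameter gradient from `∇model`, and stops once the test MSE no longer decreases (within a small tolerance), as in

```math
\alpha_{k+1} = \alpha_k + \eta \, \frac{\mathrm{d} l}{\mathrm{d} \alpha}, \qquad \eta = 0.01.
```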

```@example 3
"""
start from initial value of regularization constant
do gradient descent on alpha
until the MSE keeps on decreasing
"""
function descent(α, max_iters=25)
prev_mse = 1e7
curr_mse = 1e6

α_s = Float64[]
mse = Float64[]

iter=0
while curr_mse-10 < prev_mse && iter < max_iters
iter += 1
model, w, _, ŵ = fitRidge(X_train, Y_train, α)

#update
∂α = ∇model(model, X_train, w, ŵ, α)

α += 0.01*∂α # make it grow real slow

push!(α_s, α)

Y_pred = X_test*ŵ

prev_mse = curr_mse
curr_mse = sum((Y_pred - Y_test).^2)

push!(mse, curr_mse)
end

return α_s, mse
end
nothing # hide
```


```@example 3
ᾱ, msē = descent(1.0);
nothing # hide
```


```@example 3
plot(log.(αs), mse, label="MSE", xaxis = ("log(α)"))
plot!(log.(ᾱ), msē, label="G.D. for α", lw = 2)
```