From 5c70ea4d8b3025993e30142e5834668f361d90e8 Mon Sep 17 00:00:00 2001
From: "michael d. catchen" <mdcatchen@gmail.com>
Date: Wed, 1 May 2024 14:02:38 -0400
Subject: [PATCH] moving cube sampling to testitems

---
 src/cubesampling.jl | 241 ++++++++++++++++++++++++--------------------
 1 file changed, 134 insertions(+), 107 deletions(-)

diff --git a/src/cubesampling.jl b/src/cubesampling.jl
index 20e7b82..5bf7904 100644
--- a/src/cubesampling.jl
+++ b/src/cubesampling.jl
@@ -14,23 +14,29 @@ A `BONRefiner` that uses Cube Sampling (Tillé 2011)
 **πₖ**, a Float Vector indicating the probabilities of inclusion for each candidate point; should sum to numpoints value.
 """
 
-Base.@kwdef mutable struct CubeSampling{I <: Integer, M <: Matrix, V <: Vector} <: BONRefiner
+Base.@kwdef struct CubeSampling{I <: Integer, M <: Matrix, V <: Vector} <: BONRefiner
     numpoints::I = 50
     fast::Bool = true
     x::M = rand(0:4, 3, 50)
-    πₖ::V = zeros(size(x,2))
-
+    πₖ::V = zeros(size(x, 2))
     function CubeSampling(numpoints, fast, x, πₖ)
-        if numpoints < one(numpoints)
-            throw(ArgumentError("You cannot have a CubeSampling with fewer than one point.",),)
-        end
+        cs = new{typeof(numpoints), typeof(x), typeof(πₖ)}(numpoints, fast, x, πₖ)
+        _check_arguments(cs)
         if numpoints > length(πₖ)
-            throw(ArgumentError("You cannot select more points than the number of candidate points.",),)
+            throw(
+                ArgumentError(
+                    "You cannot select more points than the number of candidate points.",
+                ),
+            )
         end
         if length(πₖ) != size(x, 2)
-            throw(DimensionMismatch("The number of inclusion probabilites does not match the dimensions of the auxillary variable matrix.",),)
+            throw(
+                DimensionMismatch(
+                    "The number of inclusion probabilites does not match the dimensions of the auxillary variable matrix.",
+                ),
+            )
         end
-        return new{typeof(numpoints), typeof(x), typeof(πₖ)}(numpoints, fast, x, πₖ)
+        return cs
     end
 end
 
@@ -38,51 +44,56 @@ function check_conditions(coords, pool, sampler)
     πₖ = sampler.πₖ
     if sum(sampler.πₖ) == 0
         @info "Probabilities of inclusion were not provided, so we assume equal probability design."
-        πₖ = fill(sampler.numpoints/length(pool), length(pool))
-    end 
+        πₖ = [sampler.numpoints / length(pool) for _ in eachindex(pool)]
+    end
     if round(Int, sum(πₖ)) != sampler.numpoints
         @warn "The inclusion probabilities sum to $(round(Int, sum(πₖ))), which will be your sample size instead of numpoints."
     end
     if length(pool) != length(πₖ)
-        throw(DimensionMismatch("The πₖ vector does not match the number of candidate points.",),)
+        throw(
+            DimensionMismatch(
+                "The πₖ vector does not match the number of candidate points.",
+            ),
+        )
     end
     if length(πₖ) != size(sampler.x, 2)
-        throw(DimensionMismatch("There is a mismatch in the number of inclusion probabilities and the points in the auxillary matrix.",),)
+        throw(
+            DimensionMismatch(
+                "There is a mismatch in the number of inclusion probabilities and the points in the auxillary matrix.",
+            ),
+        )
     end
-    πₖ
-
+    return πₖ
 end
 
-
 function _generate!(
-    coords::Vector{CartesianIndex}, 
+    coords::Vector{CartesianIndex},
     pool::Vector{CartesianIndex},
     sampler::CubeSampling,
-    uncertainty::Matrix{T}
-    ) where {T <: AbstractFloat}
-
+    uncertainty::Matrix{T},
+) where {T <: AbstractFloat}
     πₖ = check_conditions(coords, pool, sampler)
 
     # sort points by distance in auxillary variable space
     dist = mahalanobis(πₖ, sampler.x)
-    perm = sortperm(dist, rev=true)
-    pool, πₖ = pool[perm],  πₖ[perm]
+    perm = sortperm(dist; rev = true)
+    pool, πₖ = pool[perm], πₖ[perm]
+
+    x = sampler.x[:, perm]
 
-    x = sampler.x[:,perm]
- 
     # To keep the sample size enforced, add πₖ as an aux variable
     x = vcat(transpose(πₖ), x)
 
     # pick flight phase algorithm
     π_optimal_flight = sampler.fast ? cubefastflight(πₖ, x) : cubeflight(πₖ, x)
     # check if there are non-integer probabilities
-    non_int_ind = findall(u -> u .∉ Ref(Set([0,1])), π_optimal_flight)
+    non_int_ind = findall(u -> u .∉ Ref(Set([0, 1])), π_optimal_flight)
     # if so, perform landing phase to resolve them
     π_optimal = isempty(non_int_ind) ? π_optimal_flight : cubeland(π_optimal_flight, πₖ, x)
 
-    selected = pool[findall(z -> z == 1, π_optimal)]
+    selected = pool[findall(isequal(1), π_optimal)]
 
-    for i = 1:length(selected)
+    for i in eachindex(selected)
         coords[i] = pool[i]
     end
 
@@ -90,16 +101,15 @@ function _generate!(
 end
 
 function cubeflight(πₖ, x)
-    
     N = length(πₖ)
     rowdim = size(x)[1]
 
     j = 0
-    set_nullspace = zeros(1,2)
+    set_nullspace = zeros(1, 2)
     π_optimal = πₖ
     # check if there is a possible u to satisfy the conditions
     while size(set_nullspace)[2] != 0
-        j = j+1
+        j = j + 1
 
         ## STEP 1 ##
 
@@ -107,11 +117,11 @@ function cubeflight(πₖ, x)
         # A is the matrix of auxillary variables didvided by the inclusion probability
         # for the population unit
         A = similar(x, Float64)
-        for i = 1:N
-            if π_optimal[i] .∈ Ref(Set([0,1]))
-                A[:,i] = zeros(rowdim) 
-            else 
-                A[:,i] = x[:,i] ./ π_optimal[i]
+        for i in 1:N
+            if π_optimal[i] .∈ Ref(Set([0, 1]))
+                A[:, i] = zeros(rowdim)
+            else
+                A[:, i] = x[:, i] ./ π_optimal[i]
             end
         end
 
@@ -122,13 +132,13 @@ function cubeflight(πₖ, x)
         # let's make sure the rows that need it satisfy that condition
 
         # get index where π_optimal is 0 or 1
-        π_integer = findall(u -> u .∈ Ref(Set([0,1])), π_optimal)
+        π_integer = findall(u -> u .∈ Ref(Set([0, 1])), π_optimal)
 
         # if none of the π_optimal's are fixed yet (as 0 or 1) u can be a vector from the nullspace
         if length(π_integer) == 0
             u = kernel[:, rand(1:size(kernel)[2])]
-        
-        # if only one is fixed, can also pick a u vector but it shouldn't be the trivial indicator vector
+
+            # if only one is fixed, can also pick a u vector but it shouldn't be the trivial indicator vector
         elseif length(π_integer) == 1
             sums = sum(eachrow(kernel))
             # find indicator vector
@@ -138,12 +148,12 @@ function cubeflight(πₖ, x)
             ind = deleteat!(collect(1:size(kernel)[2]), ind)
             u = kernel[:, rand(ind)]
 
-        # otherwise, need to make sure u_k = 0 condition is satisfied for fixed pikstar's
+            # otherwise, need to make sure u_k = 0 condition is satisfied for fixed pikstar's
         else
             # get rows of A's nullspace corresponding to those pikstar's
             set_A = kernel[π_integer, :]
             # get the nullspace of that matrix
-            
+
             set_nullspace = nullspace(set_A)
 
             if size(set_nullspace)[2] == 0
@@ -174,38 +184,36 @@ function cubeflight(πₖ, x)
         #    λ₁ = -πₖ⁺/u
         #    λ₂ = (πₖ⁺ - 1)/u
 
-        λ₁_max(; u, πₖ) = @. ifelse(u > 0, (1 - πₖ) / u, - πₖ / u)
+        λ₁_max(; u, πₖ) = @. ifelse(u > 0, (1 - πₖ) / u, -πₖ / u)
         λ₂_max(; u, πₖ) = @. ifelse(u > 0, πₖ / u, (πₖ - 1) / u)
 
-        λ₁ = minimum(filter(x -> isfinite(x), λ₁_max(u = u, πₖ = πₖ)))
-        λ₂ = minimum(filter(x -> isfinite(x), λ₂_max(u = u, πₖ = πₖ)))
+        λ₁ = minimum(filter(isfinite, λ₁_max(; u = u, πₖ = πₖ)))
+        λ₂ = minimum(filter(isfinite, λ₂_max(; u = u, πₖ = πₖ)))
 
         ## STEP 3 ##
 
         # calculate the inequality expression for both lambdas
-        λ1_ineq = @. πₖ + ( λ₁ * u)
-        λ2_ineq = @. πₖ - ( λ₂ * u)
+        λ1_ineq = @. πₖ + (λ₁ * u)
+        λ2_ineq = @. πₖ - (λ₂ * u)
 
         ineq_mat = reduce(hcat, [λ1_ineq, λ2_ineq])
         # the new inclusion probability π is one of the lambda expressions with a given probability q1, q2
-        q₁ = λ₂/(λ₁ + λ₂)
+        q₁ = λ₂ / (λ₁ + λ₂)
         q₂ = 1 - q₁
 
-        π_optimal = map(r -> sample(r, Weights([q₁,q₂])), eachrow(ineq_mat))
-    end 
-    return(π_optimal)
+        π_optimal = map(r -> sample(r, Weights([q₁, q₂])), eachrow(ineq_mat))
+    end
+    return (π_optimal)
 end
 
 function cubefastflight(πₖ, x)
 
-    ## Initialization ##
-
     # number of auxillary variables
     num_aux = size(x)[1]
 
     # get all non-integer probabilities
     non_int_ind = findall(u -> u .∉ Ref(Set([0, 1])), πₖ)
-    
+
     π_nonint = πₖ[non_int_ind]
 
     Ψ = π_nonint[1:(num_aux + 1)]
@@ -216,11 +224,10 @@ function cubefastflight(πₖ, x)
 
     k = num_aux + 2
 
-
     while k <= length(π_nonint)
-        Ψ = update_psi(Ψ, B) 
+        Ψ = update_psi(Ψ, B)
 
-        if length(findall(z -> z .<0, Ψ)) > 0
+        if length(findall(z -> z .< 0, Ψ)) > 0
             throw(error("Negative inclusion probability"))
         end
 
@@ -244,45 +251,45 @@ function cubefastflight(πₖ, x)
     # now do the final iteration
     Ψ = update_psi(Ψ, B)
     π_nonint[r] = Ψ
-    return(π_nonint)
+    return (π_nonint)
 end
 
 function update_psi(Ψ, B)
-     # get vector u in the kernel of B
-     u = nullspace(B)[:, 1]
+    # get vector u in the kernel of B
+    u = nullspace(B)[:, 1]
 
-     # want max  λ₁, λ₂ such that 0 <= Ψ + λ₁*u <= 1 and 0 <= Ψ - λ₂*u <= 1
-     # solve the inequalities for λ and you get max values for u > 0 and u < 0
-     # for λ₁ : for u > 0, λ₁ = (1-Ψ)/u; for u < 0, λ₁ = -Ψ/u
-     # for λ₂ : for u > 0, λ₂ = Ψ/u; for u < 0, λ₂ = (Ψ - 1)/u
+    # want max  λ₁, λ₂ such that 0 <= Ψ + λ₁*u <= 1 and 0 <= Ψ - λ₂*u <= 1
+    # solve the inequalities for λ and you get max values for u > 0 and u < 0
+    # for λ₁ : for u > 0, λ₁ = (1-Ψ)/u; for u < 0, λ₁ = -Ψ/u
+    # for λ₂ : for u > 0, λ₂ = Ψ/u; for u < 0, λ₂ = (Ψ - 1)/u
 
-     λ₁_max(; u, πₖ) = @. ifelse(u > 0, (1 - πₖ) / u, -πₖ / u)
-     λ₂_max(; u, πₖ) = @. ifelse(u > 0, πₖ / u, (πₖ - 1) / u)
+    λ₁_max(; u, πₖ) = @. ifelse(u > 0, (1 - πₖ) / u, -πₖ / u)
+    λ₂_max(; u, πₖ) = @. ifelse(u > 0, πₖ / u, (πₖ - 1) / u)
 
-     λ₁_vec = filter(x -> isfinite(x), λ₁_max(; u = u, πₖ = Ψ))
-     λ₂_vec = filter(x -> isfinite(x), λ₂_max(; u = u, πₖ = Ψ))
+    λ₁_vec = filter(x -> isfinite(x), λ₁_max(; u = u, πₖ = Ψ))
+    λ₂_vec = filter(x -> isfinite(x), λ₂_max(; u = u, πₖ = Ψ))
 
-     deleteat!(λ₁_vec, λ₁_vec .<= 0)
-     deleteat!(λ₂_vec, λ₂_vec .<= 0)
+    deleteat!(λ₁_vec, λ₁_vec .<= 0)
+    deleteat!(λ₂_vec, λ₂_vec .<= 0)
 
-     λ₁ = minimum(λ₁_vec)
-     λ₂ = minimum(λ₂_vec)
+    λ₁ = minimum(λ₁_vec)
+    λ₂ = minimum(λ₂_vec)
 
-     # calculate the inequality expression for both lambdas
-     λ₁_ineq = @. Ψ + (λ₁ * u)
-     λ₂_ineq = @. Ψ - (λ₂ * u)
+    # calculate the inequality expression for both lambdas
+    λ₁_ineq = @. Ψ + (λ₁ * u)
+    λ₂_ineq = @. Ψ - (λ₂ * u)
 
-     # checking for floating point issues
-     tol = 1e-13
-     λ₁_ineq[abs.(λ₁_ineq) .< tol] .= 0
-     λ₂_ineq[abs.(λ₂_ineq) .< tol] .= 0
+    # checking for floating point issues
+    tol = 1e-13
+    λ₁_ineq[abs.(λ₁_ineq) .< tol] .= 0
+    λ₂_ineq[abs.(λ₂_ineq) .< tol] .= 0
 
-     # the new inclusion probability π is one of the lambda expressions with a given probability q1, q2
-     q₁ = λ₂ / (λ₁ + λ₂)
+    # the new inclusion probability π is one of the lambda expressions with a given probability q1, q2
+    q₁ = λ₂ / (λ₁ + λ₂)
 
-     new_πₖ = rand() < q₁ ? λ₁_ineq : λ₂_ineq
-     return(new_πₖ)
-end    
+    new_πₖ = rand() < q₁ ? λ₁_ineq : λ₂_ineq
+    return (new_πₖ)
+end
 
 function cubeland(pikstar, πₖ, x)
     ### Landing Phase ###
@@ -291,7 +298,7 @@ function cubeland(pikstar, πₖ, x)
 
     pikstar_return = copy(pikstar)
     # get all non-integer probabilities
-    non_int_ind = findall(u -> u .∉ Ref(Set([0,1])), pikstar_return)
+    non_int_ind = findall(u -> u .∉ Ref(Set([0, 1])), pikstar_return)
     non_int_piks = pikstar_return[non_int_ind]
     N_land = length(non_int_piks)
 
@@ -308,7 +315,7 @@ function cubeland(pikstar, πₖ, x)
 
     # then get matrix of potential sample design
     # get vector with appropriate allocation of 0's and 1's
-    base_vec = vcat(repeat([1.0], outer = n_land), repeat([0.0], outer = (N_land - n_land)))
+    base_vec = vcat(repeat([1.0]; outer = n_land), repeat([0.0]; outer = (N_land - n_land)))
 
     samps = reduce(vcat, transpose.(unique_permutations(base_vec)))
 
@@ -321,18 +328,18 @@ function cubeland(pikstar, πₖ, x)
 
     # let's get A for the non-integer units
     A_land = x_land ./ reshape(πₖ[non_int_ind], :, N_land)
-    
 
     sample_pt = A_land * transpose(sub_mat)
     ## FIXME: need to deal with the case that there are fixed zeros in πₖ
     #A = x ./ reshape(πₖ, :, N)
-    zero_pik_ind = findall(u -> u == 0, πₖ)
-    A = x[:, setdiff(1:end, zero_pik_ind)] ./ reshape(πₖ[setdiff(1:end, zero_pik_ind)], :, length(πₖ) - length(zero_pik_ind))
-    
+    zero_pik_ind = findall(isequal(0), πₖ)
+    A =
+        x[:, setdiff(1:end, zero_pik_ind)] ./
+        reshape(πₖ[setdiff(1:end, zero_pik_ind)], :, length(πₖ) - length(zero_pik_ind))
 
-    cost = zeros(size(samps,1))
+    cost = zeros(size(samps, 1))
     for i in 1:size(samps)[1]
-        cost[i] = transpose(sample_pt[:, i]) * inv(A*transpose(A)) * sample_pt[:, i]
+        cost[i] = transpose(sample_pt[:, i]) * inv(A * transpose(A)) * sample_pt[:, i]
     end
 
     # get matrix of samples and costs
@@ -342,22 +349,26 @@ function cubeland(pikstar, πₖ, x)
     ## linear programing ##
     model = Model(HiGHS.Optimizer)
 
-    @variable(model, ps[1:size(samps,1)] >= 0)
+    @variable(model, ps[1:size(samps, 1)] >= 0)
 
     # multiple cost (lp_mat[2]) by ps[id], where id is lp_mat[1]
-    @objective(model, Min, sum(sample[2] * ps[trunc(Int, sample[1])] for sample in eachrow(lp_mat)))
+    @objective(
+        model,
+        Min,
+        sum(sample[2] * ps[trunc(Int, sample[1])] for sample in eachrow(lp_mat))
+    )
 
     @constraint(model, sum(ps[id]) == 1)
 
-    for i in 1:size(samps,2)
-        @constraint(model, sum(ps .* (samps.>0)[:,i]) == non_int_piks[i])
+    for i in 1:size(samps, 2)
+        @constraint(model, sum(ps .* (samps .> 0)[:, i]) == non_int_piks[i])
     end
 
     optimize!(model)
 
-    if has_values(model) 
+    if has_values(model)
         samp_prob = value.(ps)
-        
+
         # pick a sample based on their probabilities
         samp_ind = sample(1:length(samp_prob), Weights(samp_prob))
 
@@ -365,23 +376,26 @@ function cubeland(pikstar, πₖ, x)
         pikstar_return[non_int_ind] = samps[samp_ind, :]
     else
         @warn "The linear program did not find a feasible solution."
-        pikstar_return[non_int_ind] = samps[sample(1:size(samps,1)), :]
-    end 
+        pikstar_return[non_int_ind] = samps[sample(1:size(samps, 1)), :]
+    end
 
-    return(pikstar_return)
+    return pikstar_return
 end
 
 # all credit to stackoverflow https://stackoverflow.com/questions/65051953/julia-generate-all-non-repeating-permutations-in-set-with-duplicates
-function unique_permutations(x::T, prefix=T()) where T
+function unique_permutations(x::T, prefix = T()) where {T}
     if length(x) == 1
         return [[prefix; x]]
     else
         t = T[]
         for i in eachindex(x)
-            if i > firstindex(x) && x[i] == x[i-1]
+            if i > firstindex(x) && x[i] == x[i - 1]
                 continue
             end
-            append!(t, unique_permutations([x[begin:i-1];x[i+1:end]], [prefix; x[i]]))
+            append!(
+                t,
+                unique_permutations([x[begin:(i - 1)]; x[(i + 1):end]], [prefix; x[i]]),
+            )
         end
         return t
     end
@@ -397,21 +411,34 @@ function mahalanobis(πₖ, x)
     p = size(x, 1)
 
     x̂ = x ./ reshape(πₖ, :, N)
-    mean_x̂ = (1/N) .* sum(x̂, dims = 2)
+    mean_x̂ = (1 / N) .* sum(x̂; dims = 2)
 
     k_vecs = x̂ .- mean_x̂
     outer_prods = Array{Float64}(undef, p, p, N)
     for i in 1:N
         outer_prods[:, :, i] = k_vecs[1:p, i] * transpose(k_vecs[1:p, i])
     end
-    
-    sigma = (1/(N-1)) * dropdims(sum(outer_prods, dims = 3), dims = 3)
+
+    sigma = (1 / (N - 1)) * dropdims(sum(outer_prods; dims = 3); dims = 3)
     inv_sigma = inv(sigma)
-    
+
     d = Vector{Float64}(undef, N)
     for i in 1:N
         d[i] = (transpose(x̂[1:p, i] - mean_x̂) * inv_sigma * (x̂[1:p, i] - mean_x̂))[1]
     end
 
     return d
-end
\ No newline at end of file
+end
+
+# ====================================================
+#
+#   Tests
+#
+# =====================================================
+
+@testitem "CubeSampling throws exception with too few points" begin
+    @test_throws TooFewSites CubeSampling(numpoints = -1)
+    @test_throws TooFewSites CubeSampling(numpoints = 0)
+    @test_throws TooFewSites CubeSampling(numpoints = 1)
+end
+