Skip to content

Commit

Permalink
Merge branch 'master' of github.com:baggepinnen/TotalLeastSquares.jl
Browse files Browse the repository at this point in the history
  • Loading branch information
baggepinnen committed Mar 10, 2021
2 parents 768f69e + 86a9ce1 commit 87d1a61
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 3 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
name = "TotalLeastSquares"
uuid = "028f657a-7ace-5159-a694-8cfd97933b0c"
authors = ["baggepinnen <[email protected]>"]
version = "1.6.1"
version = "1.7.0"

[deps]
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
FillArrays = "0.5, 0.6, 0.8, 0.9, 0.10, 0.11"
StatsBase = "0.33"
julia = "1.0"

[extras]
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ These functions are exported:
- `x = rtls(A,y)` Solves a robust TLS problem. Both `A` and `y` are assumed to be corrupted with high magnitude, but sparse, noise. See analysis below.
- `x = irls(A,y; tolx=0.001, tol=1.0e-6, verbose=false, iters=100)` minimizeₓ ||Ax-y||₁ using iteratively reweighted least squares.
- `x = sls(A,y; r = 1, iters = 100, verbose = false, tol = 1.0e-8)` Simplex least-squares: minimizeₓ ||Ax-y||₂ s.t. sum(x) = r
- `x = flts(A,y; outlier = 0.5, N = 500, maxiter = 100, return_set = false, verbose = true)` Fast least trimmed squares: minimizes the sum of squared residuals by finding an outlier-free subset among N initial subsets. Robust up to 50 % outliers.



Expand Down
6 changes: 4 additions & 2 deletions src/TotalLeastSquares.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
module TotalLeastSquares
export tls, tls!, wtls, wls, rtls, irls, sls, rowcovariance, hankel, ishankel, unhankel
export tls, tls!, wtls, wls, rtls, irls, sls, rowcovariance, hankel, ishankel, unhankel, flts
export rpca, lowrankfilter, rpca_ga, entrywise_median, entrywise_trimmed_mean, μ!
using FillArrays, Printf, LinearAlgebra, SparseArrays, Statistics
using FillArrays, Printf, LinearAlgebra, SparseArrays, Statistics, StatsBase

include("flts.jl")

"""
wls(A,y,Σ)
Expand Down
144 changes: 144 additions & 0 deletions src/flts.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""
    flts(A::AbstractArray{<:Real}, y::AbstractVector{<:Real}; kwargs...)

Fast least trimmed squares by an algorithm from Rousseeuw et al. - Computing LTS Regression for Large Data Sets (1999).

The basic idea is to find an outlier-free subset `H` of regressors that minimizes the
sum of squared residuals `Q`, resulting in a robust version of least squares that
tolerates up to 50 % outliers.

# Arguments
* `A::AbstractArray{<:Real}`: Vector or Matrix ∈ R(n x p) containing regressors
* `y::AbstractVector{<:Real}`: Output vector ∈ R(n)

# Keyword arguments
* `h::Integer`: Size of the subset `H`; must satisfy round(0.5(n + p + 1)) <= h <= n and defaults to round(0.5(n + p + 1)). A valid definition of `h` overrides an `outliers` definition!
* `outliers::Real`: Fraction of outliers, 0 <= outliers <= 0.5.
* `N::Integer`: Number of initial p-subsets, defaults to 500 (minimum is 10). This is the main parameter regarding speed; the probability of having at least one outlier-free p-subset is 1-(1-(1-outliers)^p)^N. If `verbose = true` this estimate gets printed.
* `maxiter::Integer`: Maximum number of C-steps applied in the final optimization, defaults to 100.
* `ΔQmin::Real`: Minimum change of the objective function (sum of squared residuals) that defines convergence.
* `return_set::Bool`: If true, the subset `H` and the corresponding value `Q` are also returned in the form `(H, θ, Q)`.
* `verbose::Bool`: Print some more information.

# Returns
The parameters `θ` that minimize `Q`, or optionally the tuple `(H, θ, Q)` which also
contains the selected subset `H` and the minimal `Q`.

# Example
Adapted from the original paper (output is random, shown for illustration):
```julia
julia> x = 10randn(1000); y = x .+ 1;

julia> y[801:end] = 5randn(200); x[801:end] = 5randn(200) .+ 50;  # 20 % outliers

julia> A = [x ones(1000)];

julia> flts(A, y)
2-element Array{Float64,1}:
 1.0003968687209952
 1.0058265326498548
```
"""
function flts(A::AbstractArray{<:Real}, y::AbstractVector{<:Real};
        h::Integer = 0, outliers::Real = -1, N::Integer = 500, maxiter::Integer = 100,
        ΔQmin::Real = 0.0001, return_set::Bool = false, verbose::Bool = false)
    # Input checking
    n = length(y)
    size(A, 1) == n || throw(DimensionMismatch("Both inputs A and y should have the same number of rows"))
    N >= 10 || throw(DomainError(N, "N needs to be >= 10"))
    if ndims(A) == 2
        p = size(A, 2)
    else
        # reshape a vector into an n×1 matrix so matrix multiplication works
        p = 1
        A = reshape(A, :, 1)
    end

    # Determine the subset length h: explicit h > outlier fraction > default
    if (round(0.5(n + p + 1)) <= h <= n)
        verbose && @info "h was set to: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    elseif (0.0 <= outliers <= 0.5)
        h = round(Int, (1 - outliers) * n)
        verbose && @info "h was set to: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    else
        h = round(Int, 0.5(n + p + 1))
        verbose && @info "h was set to default: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    end
    # Information about possible success
    verbose && @info "Chance to find an outlier-free p subset is at least ~$((1 - (1 - (h / n)^p)^N) * 100) %"

    # Initial subsets H1 from N random p-subsets (option b from the paper),
    # each expanded to a candidate tuple (H, θ, Q)
    initials = [get_initial_H(A, y, p, n, h) for _ in 1:N]

    # Carry out two cheap C-steps per candidate, keep the top 10 for further optimization
    opts = [optimize_H(A, y, h, c, 2, 0) for c in initials]
    sort!(opts, by = x -> x[3])
    candidates = opts[1:10]

    # Optimize the finalists until convergence and return the best
    results = [optimize_H(A, y, h, c, maxiter, ΔQmin) for c in candidates]
    sort!(results, by = x -> x[3])

    # Return the winner
    winner = results[1]
    return return_set ? winner : winner[2]
end

# Apply up to `maxiter` C-steps to the candidate `initial = (H, θ, Q)`, stopping
# early once the objective improves by less than `ΔQmin`.
#
# Bug fix: the previous version passed `initial[2]` (the *initial* θ) to every
# C-step, so the second iteration recomputed the first step's result, ΔQ became 0
# and the loop broke — iteration never progressed past a single C-step. Each
# C-step must start from the parameters produced by the previous one.
# Also returns `initial` (instead of `nothing`) when `maxiter == 0`.
function optimize_H(A, y, h::Int, initial, maxiter::Int, ΔQmin)
    opt = initial          # current best (H, θ, Q)
    Q_old = initial[3]
    # Apply C-steps until convergence / maxiter
    for i in 1:maxiter
        opt = C_step(A, y, opt[2], h)  # refine from the latest parameters
        ΔQ = Q_old - opt[3]            # C-steps guarantee Q is non-increasing
        if ΔQ < ΔQmin
            break                      # converged: improvement below tolerance
        end
        Q_old = opt[3]
    end
    return opt
end

# Construct one starting candidate (H, θ, Q) for the FAST-LTS search:
# draw a random p-subset of observations, fit parameters exactly through it,
# and apply a single C-step to obtain an initial h-subset.
function get_initial_H(A, y, p, n, h)
    inds = 1:n
    # sample a random p subset of observation indices (without replacement)
    J = sample(inds, p, replace = false)
    # If the subset does not specify a unique hyperplane (A[J, :] is rank
    # deficient), enlarge it. NOTE(review): the paper extends the *current*
    # subset by one random observation, whereas this code redraws a fresh,
    # larger subset each time — confirm this deviation is intentional.
    i = 1
    while ((p+i+1) < n) && (rank(A[J, :]) < p)
        J = sample(inds, p+i, replace = false)
        i += 1
    end

    # Estimate first parameters: least-squares fit through the subset
    # (an exact fit when |J| == p and A[J, :] has full rank)
    θ_J = \(A[J, :], y[J])
    # Use a C-step to turn the fit into an initial (H, θ, Q) triple
    H_initial, θ_initial, Q_initial = C_step(A, y, θ_J, h)
    return H_initial, θ_initial, Q_initial
end

# A single concentration step ("C-step"), the workhorse of FAST-LTS that
# guarantees a non-increasing objective: given the current parameters θ_old,
# keep the h observations with the smallest absolute residuals and refit
# ordinary least squares on that subset.
function C_step(A, y, θ_old, h)
    # residuals under the current fit
    r = y .- A * θ_old
    # indices of the h smallest absolute residuals form the new subset
    H_new = sortperm(abs.(r))[1:h]
    # ordinary least squares restricted to the trimmed subset
    θ_new = A[H_new, :] \ y[H_new]
    # objective: sum of squared residuals of the refit, evaluated on H_new
    Q_new = sum(abs2, (y .- A * θ_new)[H_new])
    return H_new, θ_new, Q_new
end

# Objective function Q: the sum of squared residuals of the fit θ,
# evaluated only over the observations in the subset H.
get_Q(A, y, H, θ) = sum(abs2, view(y .- A * θ, H))
25 changes: 25 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -475,4 +475,29 @@ end
end
@test mean(results) >= 0.98
end

@testset "flts" begin
    @info "Testing flts"
    # Example from the paper: the last 20 % of observations are replaced by
    # high-leverage outliers centered at x ≈ 50.
    # Note: the non-ASCII `≈` operators were lost in transcription, leaving
    # invalid syntax (`@test a res[1]`); they are restored here.
    x = 10randn(1000)
    a, b = 1, 2
    y = a*x .+ b
    y[801:end] = 5randn(200)
    x[801:end] = 5randn(200) .+ 50
    xb = [x ones(1000)]
    # default settings
    res = flts(xb, y, verbose = true)
    @test a ≈ res[1]
    @test b ≈ res[2]
    # define % of outliers
    res1 = flts(xb, y, N = 10, outliers = 0.20, verbose = true)
    @test a ≈ res1[1]
    @test b ≈ res1[2]
    # define length of set H and check the subset
    (H, res2, Q) = flts(xb, y, N = 10, h = 800, return_set = true, verbose = true)
    @test a ≈ res2[1]
    @test b ≈ res2[2]
    @test all(H .< 801)   # only inlier indices selected
    @test Q + 1 ≈ 1       # Q ≈ 0 since the inliers are noise-free
end
end

0 comments on commit 87d1a61

Please sign in to comment.