Skip to content

Commit

Permalink
Merge branch 'master' of github.com:baggepinnen/TotalLeastSquares.jl
Browse files Browse the repository at this point in the history
  • Loading branch information
baggepinnen committed Mar 10, 2021
2 parents 768f69e + 86a9ce1 commit 87d1a61
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 3 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
name = "TotalLeastSquares"
uuid = "028f657a-7ace-5159-a694-8cfd97933b0c"
authors = ["baggepinnen <[email protected]>"]
version = "1.6.1"
version = "1.7.0"

[deps]
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
FillArrays = "0.5, 0.6, 0.8, 0.9, 0.10, 0.11"
StatsBase = "0.33"
julia = "1.0"

[extras]
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ These functions are exported:
- `x = rtls(A,y)` Solves a robust TLS problem. Both `A` and `y` are assumed to be corrupted with high magnitude, but sparse, noise. See analysis below.
- `x = irls(A,y; tolx=0.001, tol=1.0e-6, verbose=false, iters=100)` minimizeₓ ||Ax-y||₁ using iteratively reweighted least squares.
- `x = sls(A,y; r = 1, iters = 100, verbose = false, tol = 1.0e-8)` Simplex least-squares: minimizeₓ ||Ax-y||₂ s.t. sum(x) = r
- `x = flts(A,y; outlier = 0.5, N = 500, maxiter = 100, return_set = false, verbose = true)` Fast least trimmed squares: minimizes the sum of squared residuals by finding an outlier-free subset among N initial subsets. Robust up to 50 % outliers.



Expand Down
6 changes: 4 additions & 2 deletions src/TotalLeastSquares.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
module TotalLeastSquares
export tls, tls!, wtls, wls, rtls, irls, sls, rowcovariance, hankel, ishankel, unhankel
export tls, tls!, wtls, wls, rtls, irls, sls, rowcovariance, hankel, ishankel, unhankel, flts
export rpca, lowrankfilter, rpca_ga, entrywise_median, entrywise_trimmed_mean, μ!
using FillArrays, Printf, LinearAlgebra, SparseArrays, Statistics
using FillArrays, Printf, LinearAlgebra, SparseArrays, Statistics, StatsBase

include("flts.jl")

"""
wls(A,y,Σ)
Expand Down
144 changes: 144 additions & 0 deletions src/flts.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""
    flts(A::AbstractArray{<:Real}, y::AbstractVector{<:Real}; kwargs...)

Fast least trimmed squares by an algorithm from Rousseeuw et al. - Computing LTS Regression for Large Data Sets (1999).

The basic idea is to find an outlier-free subset `H` of regressors that minimizes the
sum of squared residuals `Q`, resulting in a robust version of least squares that
tolerates up to 50 % outliers.

# Arguments
* `A::AbstractArray{<:Real}`: Vector or Matrix ∈ R(n x p) containing regressors
* `y::AbstractVector{<:Real}`: Output vector ∈ R(n)

# Keyword arguments
* `h::Integer`: Size of the subset `H`; must satisfy round(0.5(n + p + 1)) <= h <= n and defaults to round(0.5(n + p + 1)). A valid definition of `h` overrides an `outliers` definition!
* `outliers::Real`: Fraction of outliers, 0 <= outliers <= 0.5.
* `N::Integer`: Number of initial p-subsets, defaults to 500 (minimum is 10). This is the main parameter regarding speed; the probability of having at least one outlier-free p-subset is 1-(1-(1-outliers)^p)^N. If `verbose = true` this estimate gets printed.
* `maxiter::Integer`: Maximum number of C-steps applied in the final optimization, defaults to 100.
* `ΔQmin::Real`: Minimum change of the objective function (sum of squared residuals) that defines convergence.
* `return_set::Bool`: If true, the subset `H` and the corresponding value `Q` are also returned in the form `(H, θ, Q)`.
* `verbose::Bool`: Print some more information.

# Returns
The parameters `θ` that minimize `Q`, or optionally the tuple `(H, θ, Q)` which also
contains the selected subset `H` and the minimal `Q`.

# Example
Adapted from the original paper (output is random, shown for illustration):
```julia
julia> x = 10randn(1000); y = x .+ 1;

julia> y[801:end] = 5randn(200); x[801:end] = 5randn(200) .+ 50;  # 20 % outliers

julia> A = [x ones(1000)];

julia> flts(A, y)
2-element Array{Float64,1}:
 1.0003968687209952
 1.0058265326498548
```
"""
function flts(A::AbstractArray{<:Real}, y::AbstractVector{<:Real};
        h::Integer = 0, outliers::Real = -1, N::Integer = 500, maxiter::Integer = 100,
        ΔQmin::Real = 0.0001, return_set::Bool = false, verbose::Bool = false)
    # Input checking
    n = length(y)
    size(A, 1) == n || throw(DimensionMismatch("Both inputs A and y should have the same number of rows"))
    N >= 10 || throw(DomainError(N, "N needs to be >= 10"))
    if ndims(A) == 2
        p = size(A, 2)
    else
        # reshape a vector into an n×1 matrix so matrix multiplication works
        p = 1
        A = reshape(A, :, 1)
    end

    # Determine the subset length h: explicit h > outlier fraction > default
    if (round(0.5(n + p + 1)) <= h <= n)
        verbose && @info "h was set to: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    elseif (0.0 <= outliers <= 0.5)
        h = round(Int, (1 - outliers) * n)
        verbose && @info "h was set to: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    else
        h = round(Int, 0.5(n + p + 1))
        verbose && @info "h was set to default: h = $h. Breakdown point is at ~$(round(100 - h / n * 100))% outliers"
    end
    # Information about possible success
    verbose && @info "Chance to find an outlier-free p subset is at least ~$((1 - (1 - (h / n)^p)^N) * 100) %"

    # Initial subsets H1 from N random p-subsets (option b from the paper),
    # each expanded to a candidate tuple (H, θ, Q)
    initials = [get_initial_H(A, y, p, n, h) for _ in 1:N]

    # Carry out two cheap C-steps per candidate, keep the top 10 for further optimization
    opts = [optimize_H(A, y, h, c, 2, 0) for c in initials]
    sort!(opts, by = x -> x[3])
    candidates = opts[1:10]

    # Optimize the finalists until convergence and return the best
    results = [optimize_H(A, y, h, c, maxiter, ΔQmin) for c in candidates]
    sort!(results, by = x -> x[3])

    # Return the winner
    winner = results[1]
    return return_set ? winner : winner[2]
end

# Apply up to `maxiter` C-steps to the candidate `initial = (H, θ, Q)`, stopping
# early once the objective improves by less than `ΔQmin`.
#
# Bug fix: the previous version passed `initial[2]` (the *initial* θ) to every
# C-step, so the second iteration recomputed the first step's result, ΔQ became 0
# and the loop broke — iteration never progressed past a single C-step. Each
# C-step must start from the parameters produced by the previous one.
# Also returns `initial` (instead of `nothing`) when `maxiter == 0`.
function optimize_H(A, y, h::Int, initial, maxiter::Int, ΔQmin)
    opt = initial          # current best (H, θ, Q)
    Q_old = initial[3]
    # Apply C-steps until convergence / maxiter
    for i in 1:maxiter
        opt = C_step(A, y, opt[2], h)  # refine from the latest parameters
        ΔQ = Q_old - opt[3]            # C-steps guarantee Q is non-increasing
        if ΔQ < ΔQmin
            break                      # converged: improvement below tolerance
        end
        Q_old = opt[3]
    end
    return opt
end

# Construct one starting candidate (H, θ, Q) for the FAST-LTS search:
# draw a random p-subset of observations, fit parameters exactly through it,
# and apply a single C-step to obtain an initial h-subset.
function get_initial_H(A, y, p, n, h)
    inds = 1:n
    # sample a random p subset of observation indices (without replacement)
    J = sample(inds, p, replace = false)
    # If the subset does not specify a unique hyperplane (A[J, :] is rank
    # deficient), enlarge it. NOTE(review): the paper extends the *current*
    # subset by one random observation, whereas this code redraws a fresh,
    # larger subset each time — confirm this deviation is intentional.
    i = 1
    while ((p+i+1) < n) && (rank(A[J, :]) < p)
        J = sample(inds, p+i, replace = false)
        i += 1
    end

    # Estimate first parameters: least-squares fit through the subset
    # (an exact fit when |J| == p and A[J, :] has full rank)
    θ_J = \(A[J, :], y[J])
    # Use a C-step to turn the fit into an initial (H, θ, Q) triple
    H_initial, θ_initial, Q_initial = C_step(A, y, θ_J, h)
    return H_initial, θ_initial, Q_initial
end

# A single concentration step ("C-step"), the workhorse of FAST-LTS that
# guarantees a non-increasing objective: given the current parameters θ_old,
# keep the h observations with the smallest absolute residuals and refit
# ordinary least squares on that subset.
function C_step(A, y, θ_old, h)
    # residuals under the current fit
    r = y .- A * θ_old
    # indices of the h smallest absolute residuals form the new subset
    H_new = sortperm(abs.(r))[1:h]
    # ordinary least squares restricted to the trimmed subset
    θ_new = A[H_new, :] \ y[H_new]
    # objective: sum of squared residuals of the refit, evaluated on H_new
    Q_new = sum(abs2, (y .- A * θ_new)[H_new])
    return H_new, θ_new, Q_new
end

# Objective function Q: the sum of squared residuals of the fit θ,
# evaluated only over the observations in the subset H.
get_Q(A, y, H, θ) = sum(abs2, view(y .- A * θ, H))
25 changes: 25 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -475,4 +475,29 @@ end
end
@test mean(results) >= 0.98
end

@testset "flts" begin
    @info "Testing flts"
    # Example from the paper: the last 20 % of observations are replaced by
    # high-leverage outliers centered at x ≈ 50.
    # Note: the non-ASCII `≈` operators were lost in transcription, leaving
    # invalid syntax (`@test a res[1]`); they are restored here.
    x = 10randn(1000)
    a, b = 1, 2
    y = a*x .+ b
    y[801:end] = 5randn(200)
    x[801:end] = 5randn(200) .+ 50
    xb = [x ones(1000)]
    # default settings
    res = flts(xb, y, verbose = true)
    @test a ≈ res[1]
    @test b ≈ res[2]
    # define % of outliers
    res1 = flts(xb, y, N = 10, outliers = 0.20, verbose = true)
    @test a ≈ res1[1]
    @test b ≈ res1[2]
    # define length of set H and check the subset
    (H, res2, Q) = flts(xb, y, N = 10, h = 800, return_set = true, verbose = true)
    @test a ≈ res2[1]
    @test b ≈ res2[2]
    @test all(H .< 801)   # only inlier indices selected
    @test Q + 1 ≈ 1       # Q ≈ 0 since the inliers are noise-free
end
end

0 comments on commit 87d1a61

Please sign in to comment.