Skip to content

Commit

Permalink
feat(healthcheck) delayed_clear function
Browse files Browse the repository at this point in the history
Added new function delayed_clear. This function marks all targets to be
removed, but do not actually remove them. If before the delay parameter
any of them is re-added, it is unmarked for removal.

This function makes it possible to keep target state during config
changes, where the targets might be removed and then re-added.
  • Loading branch information
locao committed Feb 4, 2022
1 parent 34d6cad commit c940c4a
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 42 deletions.
92 changes: 77 additions & 15 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ local ngx_log = ngx.log
local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local table_insert = table.insert
local table_remove = table.remove
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
Expand Down Expand Up @@ -281,20 +282,29 @@ end
function checker:add_target(ip, port, hostname, is_healthy, hostheader)
ip = tostring(assert(ip, "no ip address provided"))
port = assert(tonumber(port), "no port number provided")
hostname = hostname or ip
if is_healthy == nil then
is_healthy = true
end

local internal_health = is_healthy and "healthy" or "unhealthy"

local ok, err = locking_target_list(self, function(target_list)
local found = false

-- check whether we already have this target
for _, target in ipairs(target_list) do
if target.ip == ip and target.port == port and target.hostname == hostname then
self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip,
":", port, " (ignoring)")
return false
if target.ip == ip and target.port == port and target.hostname == (hostname) then
if target.purge_time == nil then
self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip,
":", port, " (ignoring)")
return
end
target.purge_time = nil
found = true
internal_health = self:get_target_status(ip, port, hostname) and
"healthy" or "unhealthy"
break
end
end

Expand All @@ -308,12 +318,14 @@ function checker:add_target(ip, port, hostname, is_healthy, hostheader)
end

-- target does not exist, go add it
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
if not found then
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
end
target_list = serialize(target_list)

ok, err = self.shm:set(self.TARGET_LIST, target_list)
Expand Down Expand Up @@ -433,6 +445,28 @@ function checker:clear()
end


function checker:delayed_clear(delay)
assert(tonumber(delay), "no delay provided")

return locking_target_list(self, function(target_list)
local purge_time = ngx_now() + delay

-- add purge time to all targets
for _, target in ipairs(target_list) do
target.purge_time = purge_time
end

target_list = serialize(target_list)
local ok, err = self.shm:set(self.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

return true
end)
end


--- Get the current status of the target.
-- @param ip IP address of the target being checked.
-- @param port the port being checked against.
Expand Down Expand Up @@ -1071,7 +1105,7 @@ function checker:event_handler(event_name, ip, port, hostname)
end
end
self:log(DEBUG, "event: target '", hostname or "", " (", ip, ":", port,
"' removed")
")' removed")

else
self:log(WARN, "event: trying to remove an unknown target '",
Expand All @@ -1088,7 +1122,7 @@ function checker:event_handler(event_name, ip, port, hostname)
target_found = { ip = ip, port = port, hostname = hostname }
self.targets[target_found.ip] = self.targets[target_found.ip] or {}
self.targets[target_found.ip][target_found.port] = self.targets[target_found.ip][target_found.port] or {}
self.targets[target_found.ip][target_found.port][target_found.hostname or ip] = target_found
self.targets[target_found.ip][target_found.port][target_found.hostname] = target_found
self.targets[#self.targets + 1] = target_found
self:log(DEBUG, "event: target added '", hostname or "", "(", ip, ":", port, ")'")
end
Expand Down Expand Up @@ -1451,9 +1485,8 @@ function _M.new(opts)
return nil, err
end

-- if active checker is needed and not running, start it
if (self.checks.active.healthy.active or self.checks.active.unhealthy.active)
and active_check_timer == nil then
-- if active checker is not running, start it
if active_check_timer == nil then

self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
"starting active check timer")
Expand All @@ -1475,6 +1508,35 @@ function _M.new(opts)

local cur_time = ngx_now()
for _, checker_obj in ipairs(hcs) do
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
local index = 1
while index <= #target_list do
local target = target_list[index]
if target.purge_time and target.purge_time <= cur_time then
table_insert(removed_targets, target)
table_remove(target_list, index)
else
index = index + 1
end
end

if #removed_targets > 0 then
target_list = serialize(target_list)

local ok, err = shm:set(checker_obj.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

for _, target in ipairs(removed_targets) do
clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname)
checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname)
end
end
end)

if checker_obj.checks.active.healthy.active and
(checker_obj.checks.active.healthy.last_run +
checker_obj.checks.active.healthy.interval <= cur_time)
Expand Down
48 changes: 24 additions & 24 deletions t/09-active_probes.t
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -123,10 +123,10 @@ GET /t
true
--- error_log
checking healthy targets: nothing to do
healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true'
checking unhealthy targets: nothing to do

=== TEST 3: active probes, custom http status (regression test for pre-filled defaults)
Expand Down Expand Up @@ -179,10 +179,10 @@ true
checking unhealthy targets: nothing to do
--- no_error_log
checking healthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'


=== TEST 4: active probes, custom http status, node failing
Expand Down Expand Up @@ -234,10 +234,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -340,10 +340,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy TCP increment (1/3) for '(127.0.0.1:2114)'
unhealthy TCP increment (2/3) for '(127.0.0.1:2114)'
unhealthy TCP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy TCP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy TCP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy TCP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -396,10 +396,10 @@ GET /t
true
--- error_log
checking healthy targets: nothing to do
healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true'
checking unhealthy targets: nothing to do


Expand Down
118 changes: 115 additions & 3 deletions t/11-clear.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use Cwd qw(cwd);

workers(1);

plan tests => repeat_each() * 23;
plan tests => repeat_each() * 27;

my $pwd = cwd();

Expand Down Expand Up @@ -164,7 +164,119 @@ GET /t
true

--- error_log
unhealthy HTTP increment (1/3) for '(127.0.0.1:21120)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:21120)'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:21120)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:21120)'
--- no_error_log
unhealthy HTTP increment (3/3) for '(127.0.0.1:21120)'


=== TEST 4: delayed_clear() clears the list, after interval new checkers don't see it
--- http_config eval: $::HttpConfig
--- config
location = /t {
content_by_lua_block {
local we = require "resty.worker.events"
assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
local healthcheck = require("resty.healthcheck")
local config = {
name = "testing",
shm_name = "test_shm",
checks = {
active = {
healthy = {
interval = 0.1
},
unhealthy = {
interval = 0.1
}
}
}
}
local checker1 = healthcheck.new(config)
for i = 1, 10 do
checker1:add_target("127.0.0.1", 10000 + i, nil, false)
end
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker1:get_target_status("127.0.0.1", 10001))
checker1:delayed_clear(0.2)

local checker2 = healthcheck.new(config)
ngx.say(checker2:get_target_status("127.0.0.1", 10001))
ngx.sleep(0.4) -- wait while the targets are cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
}
}
--- request
GET /t
--- response_body
false
false
target not found

=== TEST 5: delayed_clear() would clear tgt list, but adding again keeps the previous status
--- http_config eval: $::HttpConfig
--- config
location = /t {
content_by_lua_block {
local we = require "resty.worker.events"
assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
local healthcheck = require("resty.healthcheck")
local config = {
name = "testing",
shm_name = "test_shm",
checks = {
active = {
healthy = {
interval = 0.1
},
unhealthy = {
interval = 0.1
}
}
}
}
local checker1 = healthcheck.new(config)
checker1:add_target("127.0.0.1", 10001, nil, false)
checker1:add_target("127.0.0.1", 10002, nil, false)
checker1:add_target("127.0.0.1", 10003, nil, false)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker1:get_target_status("127.0.0.1", 10002))
checker1:delayed_clear(0.2)

local checker2 = healthcheck.new(config)
checker2:add_target("127.0.0.1", 10002, nil, true)
ngx.say(checker2:get_target_status("127.0.0.1", 10002))
ngx.sleep(0.4) -- wait while the targets would be cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
status, err = checker2:get_target_status("127.0.0.1", 10002)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
status, err = checker2:get_target_status("127.0.0.1", 10003)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
}
}
--- request
GET /t
--- response_body
false
false
target not found
false
target not found

0 comments on commit c940c4a

Please sign in to comment.