Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(timer) ensure intervals are not missed #59

Merged
merged 2 commits into from
Sep 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ INSTALL ?= install
all: ;

install: all
$(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/
$(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/
$(INSTALL) lib/resty/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/
$(INSTALL) lib/resty/healthcheck/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/

test: all
PATH=$(OPENRESTY_PREFIX)/nginx/sbin:$$PATH prove -I../test-nginx/lib -r t
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ Versioning is strictly based on [Semantic Versioning](https://semver.org/)
* feature: add headers for probe request [#54](https://github.com/Kong/lua-resty-healthcheck/pull/54)
* fix: exit early when reloading during a probe [#47](https://github.com/Kong/lua-resty-healthcheck/pull/47)
* fix: prevent target-list from being nil, due to async behaviour [#44](https://github.com/Kong/lua-resty-healthcheck/pull/44)
* fix: replace timer and node-wide locks with resty-timer, to prevent interval
skips [#59](https://github.com/Kong/lua-resty-healthcheck/pull/59)

### 1.3.0 (17-Jun-2020)

Expand Down
149 changes: 53 additions & 96 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,11 @@ local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local table_remove = table.remove
local utils = require("resty.healthcheck.utils")
local resty_timer = require("resty.timer")
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
local re_find = ngx.re.find
local bit = require("bit")
local ngx_now = ngx.now
local ngx_worker_exiting = ngx.worker.exiting
local ssl = require("ngx.ssl")

Expand Down Expand Up @@ -979,86 +978,37 @@ end
-- results of the checks.


-- @param health_mode either "healthy" or "unhealthy" to indicate what
-- lock to get.
-- @return `true` on success, or false if the lock was not acquired, or `nil + error`
-- in case of errors
function checker:get_periodic_lock(health_mode)
local key = self.PERIODIC_LOCK .. health_mode
local interval = self.checks.active[health_mode].interval

-- The lock is held for the whole interval to prevent multiple
-- worker processes from sending the test request simultaneously.
-- UNLESS: the probing takes longer than the timer interval.
-- Here we substract the lock expiration time by 1ms to prevent
-- a race condition with the next timer event.
local ok, err = self.shm:add(key, true, interval - 0.001)
if not ok then
if err == "exists" then
return false
end
self:log(ERR, "failed to add key '", key, "': ", err)
return nil
end
return true
end


--- Active health check callback function.
-- @param premature default openresty param
-- @param self the checker object this timer runs on
-- @param health_mode either "healthy" or "unhealthy" to indicate what check
local function checker_callback(premature, self, health_mode)
if premature or self.stopping then
self.timer_count = self.timer_count - 1
return
end

local interval
if not self:get_periodic_lock(health_mode) then
-- another worker just ran, or is running the healthcheck
interval = self.checks.active[health_mode].interval

else
-- we're elected to run the active healthchecks
-- create a list of targets to check, here we can still do this atomically
local start_time = ngx_now()
local list_to_check = {}
local targets = fetch_target_list(self)
for _, target in ipairs(targets) do
local tgt = get_target(self, target.ip, target.port, target.hostname)
local internal_health = tgt and tgt.internal_health or nil
if (health_mode == "healthy" and (internal_health == "healthy" or
internal_health == "mostly_healthy"))
or (health_mode == "unhealthy" and (internal_health == "unhealthy" or
internal_health == "mostly_unhealthy"))
then
list_to_check[#list_to_check + 1] = {
ip = target.ip,
port = target.port,
hostname = target.hostname,
hostheader = target.hostheader,
debug_health = internal_health,
}
end
local function checker_callback(self, health_mode)

-- create a list of targets to check, here we can still do this atomically
local list_to_check = {}
local targets = fetch_target_list(self)
for _, target in ipairs(targets) do
local tgt = get_target(self, target.ip, target.port, target.hostname)
local internal_health = tgt and tgt.internal_health or nil
if (health_mode == "healthy" and (internal_health == "healthy" or
internal_health == "mostly_healthy"))
or (health_mode == "unhealthy" and (internal_health == "unhealthy" or
internal_health == "mostly_unhealthy"))
then
list_to_check[#list_to_check + 1] = {
ip = target.ip,
port = target.port,
hostname = target.hostname,
hostheader = target.hostheader,
debug_health = internal_health,
}
end

if not list_to_check[1] then
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
else
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
self:active_check_targets(list_to_check)
end

local run_time = ngx_now() - start_time
interval = math.max(0, self.checks.active[health_mode].interval - run_time)
end

-- reschedule timer
local ok, err = utils.gctimer(interval, checker_callback, self, health_mode)
if not ok then
self.timer_count = self.timer_count - 1
self:log(ERR, "failed to re-create '", health_mode, "' timer: ", err)
if not list_to_check[1] then
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
else
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
self:active_check_targets(list_to_check)
end
end

Expand Down Expand Up @@ -1159,7 +1109,14 @@ end
-- after the current timers have expired they will be marked as stopped.
-- @return `true`
function checker:stop()
self.stopping = true
if self.active_healthy_timer then
self.active_healthy_timer:cancel()
self.active_healthy_timer = nil
end
if self.active_unhealthy_timer then
self.active_unhealthy_timer:cancel()
self.active_unhealthy_timer = nil
end
self:log(DEBUG, "timers stopped")
return true
end
Expand All @@ -1168,31 +1125,33 @@ end
--- Start the background health checks.
-- @return `true`, or `nil + error`.
function checker:start()
if self.timer_count ~= 0 then
return nil, "cannot start, " .. self.timer_count .. " (of 2) timers are still running"
if self.active_healthy_timer or self.active_unhealthy_timer then
return nil, "cannot start, timers are still running"
end

local ok, err
if self.checks.active.healthy.interval > 0 then
ok, err = utils.gctimer(0, checker_callback, self, "healthy")
if not ok then
return nil, "failed to create 'healthy' timer: " .. err
end
self.timer_count = self.timer_count + 1
end

if self.checks.active.unhealthy.interval > 0 then
ok, err = utils.gctimer(0, checker_callback, self, "unhealthy")
if not ok then
return nil, "failed to create 'unhealthy' timer: " .. err
for _, health_mode in ipairs({ "healthy", "unhealthy" }) do
if self.checks.active[health_mode].interval > 0 then
local timer, err = resty_timer({
interval = self.checks.active[health_mode].interval,
recurring = true,
immediate = true,
detached = false,
expire = checker_callback,
cancel = nil,
shm_name = self.shm_name,
key_name = self.PERIODIC_LOCK .. health_mode,
sub_interval = math.min(self.checks.active[health_mode].interval, 0.5),
}, self, health_mode)
if not timer then
return nil, "failed to create '" .. health_mode .. "' timer: " .. err
end
self["active_" .. health_mode .. "_timer"] = timer
end
self.timer_count = self.timer_count + 1
end

worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe
worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE)

self.stopping = false -- do this at the end, so if either creation fails, the other stops also
self:log(DEBUG, "timers started")
return true
end
Expand Down Expand Up @@ -1425,8 +1384,6 @@ function _M.new(opts)
-- other properties
self.targets = {} -- list of targets, initially loaded, maintained by events
self.events = nil -- hash table with supported events (prevent magic strings)
self.stopping = true -- flag to indicate to timers to stop checking
self.timer_count = 0 -- number of running timers
self.ev_callback = nil -- callback closure per checker instance

-- Convert status lists to sets
Expand Down
47 changes: 0 additions & 47 deletions lib/resty/healthcheck/utils.lua

This file was deleted.

2 changes: 1 addition & 1 deletion lua-resty-healthcheck-scm-2.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ description = {
dependencies = {
"lua-resty-worker-events ~> 2",
"penlight ~> 1.7",
"lua-resty-timer ~> 1",
}
build = {
type = "builtin",
modules = {
["resty.healthcheck"] = "lib/resty/healthcheck.lua",
["resty.healthcheck.utils"] = "lib/resty/healthcheck/utils.lua",
}
}
31 changes: 21 additions & 10 deletions t/01-start-stop.t
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ __DATA__
ngx.sleep(0.2) -- wait twice the interval
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
Expand Down Expand Up @@ -83,7 +86,7 @@ true
--- request
GET /t
--- response_body
cannot start, 2 (of 2) timers are still running
cannot start, timers are still running
--- no_error_log
[error]

Expand Down Expand Up @@ -115,7 +118,10 @@ cannot start, 2 (of 2) timers are still running
ngx.say(ok)
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
Expand Down Expand Up @@ -152,16 +158,17 @@ true
})
local ok, err = checker:stop()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
GET /t
--- response_body
true
2
0
--- no_error_log
[error]
Expand Down Expand Up @@ -191,20 +198,24 @@ checking
})
local ok, err = checker:stop()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
ngx.sleep(0.2) -- wait twice the interval
}
}
--- request
GET /t
--- response_body
true
2
0
true
2
Expand Down
3 changes: 1 addition & 2 deletions t/19-status-ver.t
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use Test::Nginx::Socket::Lua 'no_plan';
use Cwd qw(cwd);

workers(2);
workers(1);
master_on();

my $pwd = cwd();
Expand Down Expand Up @@ -59,4 +59,3 @@ true
checking unhealthy targets: nothing to do
checking unhealthy targets: #1
from 'true' to 'false', ver: 2
from 'true' to 'false', ver: 1