From a317f5d81e1183574121c65486f428592f394f91 Mon Sep 17 00:00:00 2001 From: Thijs Schreijer Date: Mon, 21 Sep 2020 12:20:32 +0200 Subject: [PATCH 1/2] fix(timer) ensure intervals are not missed use resty-timer lib to achieve that with a 0.5 sub_interval --- Makefile | 3 +- README.md | 2 + lib/resty/healthcheck.lua | 149 ++++++++++----------------- lib/resty/healthcheck/utils.lua | 47 --------- lua-resty-healthcheck-scm-2.rockspec | 2 +- t/01-start-stop.t | 31 ++++-- 6 files changed, 78 insertions(+), 156 deletions(-) delete mode 100644 lib/resty/healthcheck/utils.lua diff --git a/Makefile b/Makefile index 6e7c0662..1b7e6b59 100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,8 @@ INSTALL ?= install all: ; install: all - $(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/ + $(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/ $(INSTALL) lib/resty/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/ - $(INSTALL) lib/resty/healthcheck/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/ test: all PATH=$(OPENRESTY_PREFIX)/nginx/sbin:$$PATH prove -I../test-nginx/lib -r t diff --git a/README.md b/README.md index 2d5e5269..4962c807 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,8 @@ Versioning is strictly based on [Semantic Versioning](https://semver.org/) * feature: add headers for probe request [#54](https://github.com/Kong/lua-resty-healthcheck/pull/54) * fix: exit early when reloading during a probe [#47](https://github.com/Kong/lua-resty-healthcheck/pull/47) * fix: prevent target-list from being nil, due to async behaviour [#44](https://github.com/Kong/lua-resty-healthcheck/pull/44) +* fix: replace timer and node-wide locks with resty-timer, to prevent interval + skips [#59](https://github.com/Kong/lua-resty-healthcheck/pull/59) ### 1.3.0 (17-Jun-2020) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 7648907f..8a9d4b16 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -32,12 +32,11 @@ local tostring = tostring local ipairs = ipairs local cjson = require("cjson.safe").new() local table_remove = table.remove -local utils = require("resty.healthcheck.utils") +local resty_timer = require("resty.timer") local worker_events = require("resty.worker.events") local resty_lock = require ("resty.lock") local re_find = ngx.re.find local bit = require("bit") -local ngx_now = ngx.now local ngx_worker_exiting = ngx.worker.exiting local ssl = require("ngx.ssl") @@ -979,86 +978,37 @@ end -- results of the checks. --- @param health_mode either "healthy" or "unhealthy" to indicate what --- lock to get. --- @return `true` on success, or false if the lock was not acquired, or `nil + error` --- in case of errors -function checker:get_periodic_lock(health_mode) - local key = self.PERIODIC_LOCK .. health_mode - local interval = self.checks.active[health_mode].interval - - -- The lock is held for the whole interval to prevent multiple - -- worker processes from sending the test request simultaneously. - -- UNLESS: the probing takes longer than the timer interval. - -- Here we substract the lock expiration time by 1ms to prevent - -- a race condition with the next timer event. - local ok, err = self.shm:add(key, true, interval - 0.001) - if not ok then - if err == "exists" then - return false - end - self:log(ERR, "failed to add key '", key, "': ", err) - return nil - end - return true -end - - --- Active health check callback function. --- @param premature default openresty param -- @param self the checker object this timer runs on -- @param health_mode either "healthy" or "unhealthy" to indicate what check -local function checker_callback(premature, self, health_mode) - if premature or self.stopping then - self.timer_count = self.timer_count - 1 - return - end - - local interval - if not self:get_periodic_lock(health_mode) then - -- another worker just ran, or is running the healthcheck - interval = self.checks.active[health_mode].interval - - else - -- we're elected to run the active healthchecks - -- create a list of targets to check, here we can still do this atomically - local start_time = ngx_now() - local list_to_check = {} - local targets = fetch_target_list(self) - for _, target in ipairs(targets) do - local tgt = get_target(self, target.ip, target.port, target.hostname) - local internal_health = tgt and tgt.internal_health or nil - if (health_mode == "healthy" and (internal_health == "healthy" or - internal_health == "mostly_healthy")) - or (health_mode == "unhealthy" and (internal_health == "unhealthy" or - internal_health == "mostly_unhealthy")) - then - list_to_check[#list_to_check + 1] = { - ip = target.ip, - port = target.port, - hostname = target.hostname, - hostheader = target.hostheader, - debug_health = internal_health, - } - end +local function checker_callback(self, health_mode) + + -- create a list of targets to check, here we can still do this atomically + local list_to_check = {} + local targets = fetch_target_list(self) + for _, target in ipairs(targets) do + local tgt = get_target(self, target.ip, target.port, target.hostname) + local internal_health = tgt and tgt.internal_health or nil + if (health_mode == "healthy" and (internal_health == "healthy" or + internal_health == "mostly_healthy")) + or (health_mode == "unhealthy" and (internal_health == "unhealthy" or + internal_health == "mostly_unhealthy")) + then + list_to_check[#list_to_check + 1] = { + ip = target.ip, + port = target.port, + hostname = target.hostname, + hostheader = target.hostheader, + debug_health = internal_health, + } end - - if not list_to_check[1] then - self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") - else - self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) - self:active_check_targets(list_to_check) - end - - local run_time = ngx_now() - start_time - interval = math.max(0, self.checks.active[health_mode].interval - run_time) end - -- reschedule timer - local ok, err = utils.gctimer(interval, checker_callback, self, health_mode) - if not ok then - self.timer_count = self.timer_count - 1 - self:log(ERR, "failed to re-create '", health_mode, "' timer: ", err) + if not list_to_check[1] then + self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") + else + self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) + self:active_check_targets(list_to_check) end end @@ -1159,7 +1109,14 @@ end -- after the current timers have expired they will be marked as stopped. -- @return `true` function checker:stop() - self.stopping = true + if self.active_healthy_timer then + self.active_healthy_timer:cancel() + self.active_healthy_timer = nil + end + if self.active_unhealthy_timer then + self.active_unhealthy_timer:cancel() + self.active_unhealthy_timer = nil + end self:log(DEBUG, "timers stopped") return true end @@ -1168,31 +1125,33 @@ end --- Start the background health checks. -- @return `true`, or `nil + error`. function checker:start() - if self.timer_count ~= 0 then - return nil, "cannot start, " .. self.timer_count .. " (of 2) timers are still running" + if self.active_healthy_timer or self.active_unhealthy_timer then + return nil, "cannot start, timers are still running" end - local ok, err - if self.checks.active.healthy.interval > 0 then - ok, err = utils.gctimer(0, checker_callback, self, "healthy") - if not ok then - return nil, "failed to create 'healthy' timer: " .. err - end - self.timer_count = self.timer_count + 1 - end - - if self.checks.active.unhealthy.interval > 0 then - ok, err = utils.gctimer(0, checker_callback, self, "unhealthy") - if not ok then - return nil, "failed to create 'unhealthy' timer: " .. err + for _, health_mode in ipairs({ "healthy", "unhealthy" }) do + if self.checks.active[health_mode].interval > 0 then + local timer, err = resty_timer({ + interval = self.checks.active[health_mode].interval, + recurring = true, + immediate = true, + detached = false, + expire = checker_callback, + cancel = nil, + shm_name = self.shm_name, + key_name = self.PERIODIC_LOCK .. health_mode, + sub_interval = math.min(self.checks.active[health_mode].interval, 0.5), + }, self, health_mode) + if not timer then + return nil, "failed to create '" .. health_mode .. "' timer: " .. err + end + self["active_" .. health_mode .. "_timer"] = timer end - self.timer_count = self.timer_count + 1 end worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE) - self.stopping = false -- do this at the end, so if either creation fails, the other stops also self:log(DEBUG, "timers started") return true end @@ -1425,8 +1384,6 @@ function _M.new(opts) -- other properties self.targets = {} -- list of targets, initially loaded, maintained by events self.events = nil -- hash table with supported events (prevent magic strings) - self.stopping = true -- flag to indicate to timers to stop checking - self.timer_count = 0 -- number of running timers self.ev_callback = nil -- callback closure per checker instance -- Convert status lists to sets diff --git a/lib/resty/healthcheck/utils.lua b/lib/resty/healthcheck/utils.lua deleted file mode 100644 index c0e2d6f0..00000000 --- a/lib/resty/healthcheck/utils.lua +++ /dev/null @@ -1,47 +0,0 @@ --------------------------------------------------------------------------- --- Utilities used by the healthcheck library. --- --- Code originally from https://github.com/Kong/lua-resty-dns-client --- @copyright 2017-2020 Kong Inc. --- @author Hisham Muhammad, Thijs Schreijer --- @license Apache 2.0 - -local timer_at = ngx.timer.at - -local _M = {} - --------------------------------------------------------- --- GC'able timer implementation with 'self' --------------------------------------------------------- -local timer_registry = setmetatable({},{ __mode = "v" }) -local timer_id = 0 - -local timer_callback = function(premature, cb, id, ...) - local self = timer_registry[id] - if not self then return end -- GC'ed, nothing more to do - timer_registry[id] = nil - return cb(premature, self, ...) -end - ---- A garbage-collectible timer implementation. --- Provides a timer that can be attached to an object, and GC'ed along --- with that object, as opposed to regular timers that keep running and --- prevent the object from being GC'ed. --- @param t time in ms --- @param cb callback for the timer. The signature is `function(premature, self, ...)` --- @param self the object the timer belongs to --- @param ... any additional parameters to pass to the timer callback --- @return timer handle (same as `ngx.timer.at`) -_M.gctimer = function(t, cb, self, ...) - assert(type(cb) == "function", "expected arg #2 to be a function") - assert(type(self) == "table", "expected arg #3 to be a table") - timer_id = timer_id + 1 - timer_registry[timer_id] = self - -- if in the call below we'd be passing `self` instead of the scalar `timer_id`, it - -- would prevent the whole `self` object from being garbage collected because - -- it is anchored on the timer. - return timer_at(t, timer_callback, cb, timer_id, ...) -end - - -return _M diff --git a/lua-resty-healthcheck-scm-2.rockspec b/lua-resty-healthcheck-scm-2.rockspec index 30224fcc..3171fcc8 100644 --- a/lua-resty-healthcheck-scm-2.rockspec +++ b/lua-resty-healthcheck-scm-2.rockspec @@ -16,11 +16,11 @@ description = { dependencies = { "lua-resty-worker-events ~> 2", "penlight ~> 1.7", + "lua-resty-timer ~> 1", } build = { type = "builtin", modules = { ["resty.healthcheck"] = "lib/resty/healthcheck.lua", - ["resty.healthcheck.utils"] = "lib/resty/healthcheck/utils.lua", } } diff --git a/t/01-start-stop.t b/t/01-start-stop.t index a3e189e3..88b59de3 100644 --- a/t/01-start-stop.t +++ b/t/01-start-stop.t @@ -43,7 +43,10 @@ __DATA__ ngx.sleep(0.2) -- wait twice the interval local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) + ngx.say( + (checker.active_healthy_timer and 1 or 0) + + (checker.active_unhealthy_timer and 1 or 0) + ) } } --- request @@ -83,7 +86,7 @@ true --- request GET /t --- response_body -cannot start, 2 (of 2) timers are still running +cannot start, timers are still running --- no_error_log [error] @@ -115,7 +118,10 @@ cannot start, 2 (of 2) timers are still running ngx.say(ok) local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) + ngx.say( + (checker.active_healthy_timer and 1 or 0) + + (checker.active_unhealthy_timer and 1 or 0) + ) } } --- request @@ -152,16 +158,17 @@ true }) local ok, err = checker:stop() ngx.say(ok) - ngx.say(checker.timer_count) ngx.sleep(0.2) -- wait twice the interval - ngx.say(checker.timer_count) + ngx.say( + (checker.active_healthy_timer and 1 or 0) + + (checker.active_unhealthy_timer and 1 or 0) + ) } } --- request GET /t --- response_body true -2 0 --- no_error_log [error] @@ -191,12 +198,17 @@ checking }) local ok, err = checker:stop() ngx.say(ok) - ngx.say(checker.timer_count) ngx.sleep(0.2) -- wait twice the interval - ngx.say(checker.timer_count) + ngx.say( + (checker.active_healthy_timer and 1 or 0) + + (checker.active_unhealthy_timer and 1 or 0) + ) local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) + ngx.say( + (checker.active_healthy_timer and 1 or 0) + + (checker.active_unhealthy_timer and 1 or 0) + ) ngx.sleep(0.2) -- wait twice the interval } } @@ -204,7 +216,6 @@ checking GET /t --- response_body true -2 0 true 2 From b14de9990f8eb87725ff9e7924c8c3cacfb1204c Mon Sep 17 00:00:00 2001 From: Thijs Schreijer Date: Mon, 21 Sep 2020 14:10:44 +0200 Subject: [PATCH 2/2] fix(test) flaky test due to event order, timer based --- t/19-status-ver.t | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/t/19-status-ver.t b/t/19-status-ver.t index c7d2166f..51ed929a 100644 --- a/t/19-status-ver.t +++ b/t/19-status-ver.t @@ -1,7 +1,7 @@ use Test::Nginx::Socket::Lua 'no_plan'; use Cwd qw(cwd); -workers(2); +workers(1); master_on(); my $pwd = cwd(); @@ -59,4 +59,3 @@ true checking unhealthy targets: nothing to do checking unhealthy targets: #1 from 'true' to 'false', ver: 2 -from 'true' to 'false', ver: 1