Skip to content

Commit

Permalink
fix(timer) ensure intervals are not missed
Browse files Browse the repository at this point in the history
use resty-timer lib to achieve that with a 0.5 sub_interval
  • Loading branch information
Tieske committed Sep 21, 2020
1 parent 07c6878 commit 7942008
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 155 deletions.
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ INSTALL ?= install
all: ;

install: all
$(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/
$(INSTALL) -d $(DESTDIR)/$(LUA_LIB_DIR)/resty/
$(INSTALL) lib/resty/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/
$(INSTALL) lib/resty/healthcheck/*.lua $(DESTDIR)/$(LUA_LIB_DIR)/resty/healthcheck/

test: all
PATH=$(OPENRESTY_PREFIX)/nginx/sbin:$$PATH prove -I../test-nginx/lib -r t
Expand Down
149 changes: 53 additions & 96 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,11 @@ local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local table_remove = table.remove
local utils = require("resty.healthcheck.utils")
local resty_timer = require("resty.timer")
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
local re_find = ngx.re.find
local bit = require("bit")
local ngx_now = ngx.now
local ngx_worker_exiting = ngx.worker.exiting
local ssl = require("ngx.ssl")

Expand Down Expand Up @@ -979,86 +978,37 @@ end
-- results of the checks.


-- @param health_mode either "healthy" or "unhealthy" to indicate what
-- lock to get.
-- @return `true` on success, or false if the lock was not acquired, or `nil + error`
-- in case of errors
function checker:get_periodic_lock(health_mode)
local key = self.PERIODIC_LOCK .. health_mode
local interval = self.checks.active[health_mode].interval

-- The lock is held for the whole interval to prevent multiple
-- worker processes from sending the test request simultaneously.
-- UNLESS: the probing takes longer than the timer interval.
-- Here we substract the lock expiration time by 1ms to prevent
-- a race condition with the next timer event.
local ok, err = self.shm:add(key, true, interval - 0.001)
if not ok then
if err == "exists" then
return false
end
self:log(ERR, "failed to add key '", key, "': ", err)
return nil
end
return true
end


--- Active health check callback function.
-- @param premature default openresty param
-- @param self the checker object this timer runs on
-- @param health_mode either "healthy" or "unhealthy" to indicate what check
local function checker_callback(premature, self, health_mode)
if premature or self.stopping then
self.timer_count = self.timer_count - 1
return
end

local interval
if not self:get_periodic_lock(health_mode) then
-- another worker just ran, or is running the healthcheck
interval = self.checks.active[health_mode].interval

else
-- we're elected to run the active healthchecks
-- create a list of targets to check, here we can still do this atomically
local start_time = ngx_now()
local list_to_check = {}
local targets = fetch_target_list(self)
for _, target in ipairs(targets) do
local tgt = get_target(self, target.ip, target.port, target.hostname)
local internal_health = tgt and tgt.internal_health or nil
if (health_mode == "healthy" and (internal_health == "healthy" or
internal_health == "mostly_healthy"))
or (health_mode == "unhealthy" and (internal_health == "unhealthy" or
internal_health == "mostly_unhealthy"))
then
list_to_check[#list_to_check + 1] = {
ip = target.ip,
port = target.port,
hostname = target.hostname,
hostheader = target.hostheader,
debug_health = internal_health,
}
end
local function checker_callback(self, health_mode)

-- create a list of targets to check, here we can still do this atomically
local list_to_check = {}
local targets = fetch_target_list(self)
for _, target in ipairs(targets) do
local tgt = get_target(self, target.ip, target.port, target.hostname)
local internal_health = tgt and tgt.internal_health or nil
if (health_mode == "healthy" and (internal_health == "healthy" or
internal_health == "mostly_healthy"))
or (health_mode == "unhealthy" and (internal_health == "unhealthy" or
internal_health == "mostly_unhealthy"))
then
list_to_check[#list_to_check + 1] = {
ip = target.ip,
port = target.port,
hostname = target.hostname,
hostheader = target.hostheader,
debug_health = internal_health,
}
end

if not list_to_check[1] then
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
else
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
self:active_check_targets(list_to_check)
end

local run_time = ngx_now() - start_time
interval = math.max(0, self.checks.active[health_mode].interval - run_time)
end

-- reschedule timer
local ok, err = utils.gctimer(interval, checker_callback, self, health_mode)
if not ok then
self.timer_count = self.timer_count - 1
self:log(ERR, "failed to re-create '", health_mode, "' timer: ", err)
if not list_to_check[1] then
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
else
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
self:active_check_targets(list_to_check)
end
end

Expand Down Expand Up @@ -1159,7 +1109,14 @@ end
-- after the current timers have expired they will be marked as stopped.
-- @return `true`
function checker:stop()
self.stopping = true
if self.active_healthy_timer then
self.active_healthy_timer:cancel()
self.active_healthy_timer = nil
end
if self.active_unhealthy_timer then
self.active_unhealthy_timer:cancel()
self.active_unhealthy_timer = nil
end
self:log(DEBUG, "timers stopped")
return true
end
Expand All @@ -1168,31 +1125,33 @@ end
--- Start the background health checks.
-- @return `true`, or `nil + error`.
function checker:start()
if self.timer_count ~= 0 then
return nil, "cannot start, " .. self.timer_count .. " (of 2) timers are still running"
if self.active_healthy_timer or self.active_unhealthy_timer then
return nil, "cannot start, timers are still running"
end

local ok, err
if self.checks.active.healthy.interval > 0 then
ok, err = utils.gctimer(0, checker_callback, self, "healthy")
if not ok then
return nil, "failed to create 'healthy' timer: " .. err
end
self.timer_count = self.timer_count + 1
end

if self.checks.active.unhealthy.interval > 0 then
ok, err = utils.gctimer(0, checker_callback, self, "unhealthy")
if not ok then
return nil, "failed to create 'unhealthy' timer: " .. err
for _, health_mode in ipairs({ "healthy", "unhealthy" }) do
if self.checks.active[health_mode].interval > 0 then
local timer, err = resty_timer({
interval = self.checks.active[health_mode].interval,
recurring = true,
immediate = true,
detached = false,
expire = checker_callback,
cancel = nil,
shm_name = self.shm_name,
key_name = self.PERIODIC_LOCK .. health_mode,
sub_interval = math.min(self.checks.active[health_mode].interval, 0.5),
}, self, health_mode)
if not timer then
return nil, "failed to create '" .. health_mode .. "' timer: " .. err
end
self["active_" .. health_mode .. "_timer"] = timer
end
self.timer_count = self.timer_count + 1
end

worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe
worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE)

self.stopping = false -- do this at the end, so if either creation fails, the other stops also
self:log(DEBUG, "timers started")
return true
end
Expand Down Expand Up @@ -1425,8 +1384,6 @@ function _M.new(opts)
-- other properties
self.targets = {} -- list of targets, initially loaded, maintained by events
self.events = nil -- hash table with supported events (prevent magic strings)
self.stopping = true -- flag to indicate to timers to stop checking
self.timer_count = 0 -- number of running timers
self.ev_callback = nil -- callback closure per checker instance

-- Convert status lists to sets
Expand Down
47 changes: 0 additions & 47 deletions lib/resty/healthcheck/utils.lua

This file was deleted.

1 change: 1 addition & 0 deletions lua-resty-healthcheck-scm-2.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ description = {
dependencies = {
"lua-resty-worker-events ~> 2",
"penlight ~> 1.7",
"lua-resty-timer ~> 1",
}
build = {
type = "builtin",
Expand Down
31 changes: 21 additions & 10 deletions t/01-start-stop.t
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ __DATA__
ngx.sleep(0.2) -- wait twice the interval
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
Expand Down Expand Up @@ -83,7 +86,7 @@ true
--- request
GET /t
--- response_body
cannot start, 2 (of 2) timers are still running
cannot start, timers are still running
--- no_error_log
[error]

Expand Down Expand Up @@ -115,7 +118,10 @@ cannot start, 2 (of 2) timers are still running
ngx.say(ok)
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
Expand Down Expand Up @@ -152,16 +158,17 @@ true
})
local ok, err = checker:stop()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
}
}
--- request
GET /t
--- response_body
true
2
0
--- no_error_log
[error]
Expand Down Expand Up @@ -191,20 +198,24 @@ checking
})
local ok, err = checker:stop()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
local ok, err = checker:start()
ngx.say(ok)
ngx.say(checker.timer_count)
ngx.say(
(checker.active_healthy_timer and 1 or 0) +
(checker.active_unhealthy_timer and 1 or 0)
)
ngx.sleep(0.2) -- wait twice the interval
}
}
--- request
GET /t
--- response_body
true
2
0
true
2
Expand Down

0 comments on commit 7942008

Please sign in to comment.