Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry-pick] fix(*): avoid creating multiple timers to run the same active check #157

Merged
merged 2 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ Versioning is strictly based on [Semantic Versioning](https://semver.org/)
* push commit and tag
* upload rock to luarocks: `luarocks upload rockspecs/[name] --api-key=abc`

### Unreleased

* Fix: avoid creating multiple timers to run the same active check [#157](https://github.com/Kong/lua-resty-healthcheck/pull/157)

### 3.0.1 (22-Dec-2023)

* Fix: fix delay clean logic when multiple healthchecker was started [#146](https://github.com/Kong/lua-resty-healthcheck/pull/146)
Expand Down
48 changes: 48 additions & 0 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ local table_insert = table.insert
local table_remove = table.remove
local table_concat = table.concat
local string_format = string.format
local math_max = math.max
local ssl = require("ngx.ssl")
local resty_timer = require "resty.timer"
local bit = require("bit")
Expand Down Expand Up @@ -1201,6 +1202,32 @@ local function renew_periodic_lock(shm, key)
end


local function get_callback_lock(shm, key, ttl)
local value = shm:get(key)
if value == nil then
-- no worker is checking, try to acquire the lock
local ok, err = shm:add(key, true, ttl or LOCK_PERIOD)
if not ok then
if err == "exists" then
-- another worker got the lock before
return false
end

return nil, err
end

return true
end

return false
end


local function remove_callback_lock(shm, key)
return shm:delete(key)
end


--- Active health check callback function.
-- @param self the checker object this timer runs on
-- @param health_mode either "healthy" or "unhealthy" to indicate what check
Expand All @@ -1209,10 +1236,27 @@ local function checker_callback(self, health_mode)
self.checker_callback_count = self.checker_callback_count + 1
end

-- Set a callback pending lock will exist for 2x the time of the active check.
-- An active check should be finished within this time and next timer will be
-- executed to exit a pending status.
local callback_pending_ttl = (math_max(self.checks.active.healthy.active and
self.checks.active.healthy.interval or 0,
self.checks.active.unhealthy.active and
self.checks.active.unhealthy.interval or 0)
+ self.checks.active.timeout) * 2

local callback_lock = self.CALLBACK_LOCK .. health_mode
-- a pending timer already exists, so skip this time
local ok, _ = get_callback_lock(self.shm, callback_lock, callback_pending_ttl)
if not ok then
return
end

local list_to_check = {}
local targets, err = fetch_target_list(self)
if not targets then
self:log(ERR, "checker_callback: ", err)
remove_callback_lock(self.shm, callback_lock)
return
end

Expand All @@ -1236,6 +1280,7 @@ local function checker_callback(self, health_mode)

if not list_to_check[1] then
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
remove_callback_lock(self.shm, callback_lock)
else
local timer = resty_timer({
interval = 0,
Expand All @@ -1245,10 +1290,12 @@ local function checker_callback(self, health_mode)
expire = function()
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
self:active_check_targets(list_to_check)
remove_callback_lock(self.shm, callback_lock)
end,
})
if timer == nil then
self:log(ERR, "failed to create timer to check ", health_mode)
remove_callback_lock(self.shm, callback_lock)
end
end
end
Expand Down Expand Up @@ -1618,6 +1665,7 @@ function _M.new(opts)
self.TARGET_LIST_LOCK = SHM_PREFIX .. self.name .. ":target_list_lock"
self.TARGET_LOCK = SHM_PREFIX .. self.name .. ":target_lock"
self.PERIODIC_LOCK = SHM_PREFIX .. ":period_lock:"
self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock:"
-- prepare constants
self.EVENT_SOURCE = EVENT_SOURCE_PREFIX .. " [" .. self.name .. "]"
self.LOG_PREFIX = LOG_PREFIX .. "(" .. self.name .. ") "
Expand Down
15 changes: 9 additions & 6 deletions t/with_resty-events/14-tls_active_probes.t
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ __DATA__
events_module = "resty.events",
checks = {
active = {
timeout = 2,
type = "https",
http_path = "/",
healthy = {
Expand All @@ -60,7 +61,7 @@ events_module = "resty.events",
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true
}
}
Expand All @@ -69,7 +70,7 @@ GET /t
--- response_body
true
--- timeout
15
20

=== TEST 2: active probes, invalid cert
--- http_config eval: $::HttpConfig
Expand All @@ -87,6 +88,7 @@ true
events_module = "resty.events",
checks = {
active = {
timeout = 2,
type = "https",
http_path = "/",
healthy = {
Expand All @@ -101,7 +103,7 @@ events_module = "resty.events",
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false
}
}
Expand All @@ -110,7 +112,7 @@ GET /t
--- response_body
false
--- timeout
15
20

=== TEST 3: active probes, accept invalid cert when disabling check
--- http_config eval: $::HttpConfig
Expand All @@ -128,6 +130,7 @@ false
events_module = "resty.events",
checks = {
active = {
timeout = 2,
type = "https",
https_verify_certificate = false,
http_path = "/",
Expand All @@ -143,7 +146,7 @@ events_module = "resty.events",
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true
}
}
Expand All @@ -152,4 +155,4 @@ GET /t
--- response_body
true
--- timeout
15
20
93 changes: 93 additions & 0 deletions t/with_resty-events/19-timer.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
use Test::Nginx::Socket::Lua;
use Cwd qw(cwd);

workers(1);

plan tests => repeat_each() * blocks() * 2;

my $pwd = cwd();
$ENV{TEST_NGINX_SERVROOT} = server_root();

our $HttpConfig = qq{
lua_package_path "$pwd/lib/?.lua;;";
lua_shared_dict test_shm 8m;

init_worker_by_lua_block {
local we = require "resty.events.compat"
assert(we.configure({
unique_timeout = 5,
broker_id = 0,
listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
}))
assert(we.configured())
}

server {
server_name kong_worker_events;
listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
access_log off;
location / {
content_by_lua_block {
require("resty.events.compat").run()
}
}
}
};

run_tests();

__DATA__




=== TEST 1: active probes, http node failing
--- http_config eval
qq{
$::HttpConfig

server {
listen 2130;
location = /status {
content_by_lua_block {
ngx.sleep(2)
ngx.exit(500);
}
}
}
}
--- config
location = /t {
content_by_lua_block {
local healthcheck = require("resty.healthcheck")
local checker = healthcheck.new({
name = "testing",
shm_name = "test_shm",
events_module = "resty.events",
type = "http",
checks = {
active = {
timeout = 1,
http_path = "/status",
healthy = {
interval = 0.1,
successes = 3,
},
unhealthy = {
interval = 0.1,
http_failures = 3,
}
},
}
})
local ok, err = checker:add_target("127.0.0.1", 2130, nil, true)
ngx.sleep(3) -- wait for some time to let the checks run
-- There should be no more than 3 timers running atm, but
-- add a few spaces for worker events
ngx.say(tonumber(ngx.timer.running_count()) <= 5)
}
}
--- request
GET /t
--- response_body
true
15 changes: 9 additions & 6 deletions t/with_worker-events/14-tls_active_probes.t
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ __DATA__
shm_name = "test_shm",
checks = {
active = {
timeout = 2,
type = "https",
http_path = "/",
healthy = {
Expand All @@ -48,7 +49,7 @@ __DATA__
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true
}
}
Expand All @@ -57,7 +58,7 @@ GET /t
--- response_body
true
--- timeout
15
20

=== TEST 2: active probes, invalid cert
--- http_config eval: $::HttpConfig
Expand All @@ -74,6 +75,7 @@ true
shm_name = "test_shm",
checks = {
active = {
timeout = 2,
type = "https",
http_path = "/",
healthy = {
Expand All @@ -88,7 +90,7 @@ true
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false
}
}
Expand All @@ -97,7 +99,7 @@ GET /t
--- response_body
false
--- timeout
15
20

=== TEST 3: active probes, accept invalid cert when disabling check
--- http_config eval: $::HttpConfig
Expand All @@ -114,6 +116,7 @@ false
shm_name = "test_shm",
checks = {
active = {
timeout = 2,
type = "https",
https_verify_certificate = false,
http_path = "/",
Expand All @@ -129,7 +132,7 @@ false
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.sleep(16) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true
}
}
Expand All @@ -138,4 +141,4 @@ GET /t
--- response_body
true
--- timeout
15
20
Loading
Loading