Skip to content

Commit

Permalink
Merge pull request #3075 from rabbitmq/remove-randomized-startup-delays
Browse files Browse the repository at this point in the history
Remove randomized startup delays
  • Loading branch information
michaelklishin authored Jun 5, 2021
2 parents 49a70a0 + 0876746 commit ed0ba6a
Show file tree
Hide file tree
Showing 19 changed files with 305 additions and 260 deletions.
36 changes: 21 additions & 15 deletions deps/rabbit/priv/schema/rabbit.schema
Original file line number Diff line number Diff line change
Expand Up @@ -945,31 +945,37 @@ fun(Conf) ->
end}.

%% Cluster formation: Randomized startup delay
%%
%% DEPRECATED: This is a no-op. Old configs are still allowed, but a warning will be printed.

{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range",
[{datatype, integer}]}.
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range",
[{datatype, integer}]}.
{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range", []}.
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range", []}.

{translation, "rabbit.cluster_formation.randomized_startup_delay_range",
fun(Conf) ->
Min = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.min", Conf, undefined),
Max = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.max", Conf, undefined),

case {Min, Max} of
{undefined, undefined} ->
cuttlefish:unset();
{undefined, Max} ->
%% fallback default
{5, Max};
{Min, undefined} ->
%% fallback default
{Min, 60};
{Min, Max} ->
{Min, Max}
end
{undefined, undefined} ->
ok;
_ ->
cuttlefish:warn("cluster_formation.randomized_startup_delay_range.min and "
"cluster_formation.randomized_startup_delay_range.max are deprecated")
end,
cuttlefish:unset()
end}.

%% Cluster formation: lock acquisition retries as passed to https://erlang.org/doc/man/global.html#set_lock-3
%%
%% Currently used in classic, k8s, and aws peer discovery backends.

{mapping, "cluster_formation.internal_lock_retries", "rabbit.cluster_formation.internal_lock_retries",
[
{datatype, integer},
{validators, ["non_zero_positive_integer"]}
]}.

%% Cluster formation: discovery failure retries

{mapping, "cluster_formation.lock_retry_limit", "rabbit.cluster_formation.lock_retry_limit",
Expand Down
15 changes: 6 additions & 9 deletions deps/rabbit/src/rabbit_mnesia.erl
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,18 @@ init_with_lock(Retries, Timeout, RunPeerDiscovery) ->
rabbit_log:debug("rabbit_peer_discovery:lock returned ~p", [LockResult]),
case LockResult of
not_supported ->
rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"),
%% See rabbitmq/rabbitmq-server#1202 for details.
rabbit_peer_discovery:maybe_inject_randomized_delay(),
RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register();
{error, _Reason} ->
timer:sleep(Timeout),
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery);
{ok, Data} ->
try
RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register()
after
rabbit_peer_discovery:unlock(Data)
end
end;
{error, _Reason} ->
timer:sleep(Timeout),
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery)
end.

-spec run_peer_discovery() -> ok | {[node()], node_type()}.
Expand Down Expand Up @@ -178,7 +175,7 @@ join_discovered_peers(TryNodes, NodeType) ->
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval).

join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) ->
rabbit_log:warning(
rabbit_log:info(
"Could not successfully contact any node of: ~s (as in Erlang distribution). "
"Starting as a blank standalone node...",
[string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
Expand All @@ -193,7 +190,7 @@ join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterva
rabbit_node_monitor:notify_joined_cluster();
none ->
RetriesLeft1 = RetriesLeft - 1,
rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
rabbit_log:info("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
[DelayInterval, RetriesLeft1]),
timer:sleep(DelayInterval),
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval)
Expand Down
20 changes: 20 additions & 0 deletions deps/rabbit/src/rabbit_nodes.erl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
boot/0]).
-export([persistent_cluster_id/0, seed_internal_cluster_id/0, seed_user_provided_cluster_name/0]).
-export([all_running_with_hashes/0]).
-export([lock_id/1, lock_retries/0]).

-include_lib("kernel/include/inet.hrl").
-include_lib("rabbit_common/include/rabbit.hrl").
Expand All @@ -23,6 +24,12 @@

-define(INTERNAL_CLUSTER_ID_PARAM_NAME, internal_cluster_id).

% Retries as passed to https://erlang.org/doc/man/global.html#set_lock-3
% To understand how retries map to the timeout, read
% https://github.com/erlang/otp/blob/d256ae477014158a49bb860b283df9c040011197/lib/kernel/src/global.erl#L2062-L2075
% 80 corresponds to a timeout of ca 300 seconds.
-define(DEFAULT_LOCK_RETRIES, 80).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------
Expand Down Expand Up @@ -162,3 +169,16 @@ await_running_count_with_retries(TargetCount, Retries) ->
-spec all_running_with_hashes() -> #{non_neg_integer() => node()}.
all_running_with_hashes() ->
maps:from_list([{erlang:phash2(Node), Node} || Node <- all_running()]).

-spec lock_id(Node :: node()) -> {ResourceId :: string(), LockRequesterId :: node()}.
lock_id(Node) ->
{cookie_hash(), Node}.

-spec lock_retries() -> integer().
lock_retries() ->
case application:get_env(rabbit, cluster_formation) of
{ok, PropList} ->
proplists:get_value(internal_lock_retries, PropList, ?DEFAULT_LOCK_RETRIES);
undefined ->
?DEFAULT_LOCK_RETRIES
end.
61 changes: 1 addition & 60 deletions deps/rabbit/src/rabbit_peer_discovery.erl
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
-export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0,
normalize/1, format_discovered_nodes/1, log_configured_backend/0,
register/0, unregister/0, maybe_register/0, maybe_unregister/0,
maybe_inject_randomized_delay/0, lock/0, unlock/1,
discovery_retries/0]).
lock/0, unlock/1, discovery_retries/0]).
-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0,
lock_acquisition_failure_mode/0]).

Expand All @@ -28,9 +27,6 @@
%% default node prefix to attach to discovered hostnames
-define(DEFAULT_PREFIX, "rabbit").

%% default randomized delay range, in seconds
-define(DEFAULT_STARTUP_RANDOMIZED_DELAY, {5, 60}).

%% default discovery retries and interval.
-define(DEFAULT_DISCOVERY_RETRY_COUNT, 10).
-define(DEFAULT_DISCOVERY_RETRY_INTERVAL_MS, 500).
Expand Down Expand Up @@ -159,61 +155,6 @@ discovery_retries() ->
{?DEFAULT_DISCOVERY_RETRY_COUNT, ?DEFAULT_DISCOVERY_RETRY_INTERVAL_MS}
end.


-spec maybe_inject_randomized_delay() -> ok.
maybe_inject_randomized_delay() ->
Backend = backend(),
case Backend:supports_registration() of
true ->
rabbit_log:info("Peer discovery backend ~s supports registration.", [Backend]),
inject_randomized_delay();
false ->
rabbit_log:info("Peer discovery backend ~s does not support registration, skipping randomized startup delay.", [Backend]),
ok
end.

-spec inject_randomized_delay() -> ok.

inject_randomized_delay() ->
{Min, Max} = randomized_delay_range_in_ms(),
case {Min, Max} of
%% When the max value is set to 0, consider the delay to be disabled.
%% In addition, `rand:uniform/1` will fail with a "no function clause"
%% when the argument is 0.
{_, 0} ->
rabbit_log:info("Randomized delay range's upper bound is set to 0. Considering it disabled."),
ok;
{_, N} when is_number(N) ->
rand:seed(exsplus),
RandomVal = rand:uniform(round(N)),
rabbit_log:debug("Randomized startup delay: configured range is from ~p to ~p milliseconds, PRNG pick: ~p...",
[Min, Max, RandomVal]),
Effective = case RandomVal < Min of
true -> Min;
false -> RandomVal
end,
rabbit_log:info("Will wait for ~p milliseconds before proceeding with registration...", [Effective]),
timer:sleep(Effective),
ok
end.

-spec randomized_delay_range_in_ms() -> {integer(), integer()}.

randomized_delay_range_in_ms() ->
Backend = backend(),
Default = case erlang:function_exported(Backend, randomized_startup_delay_range, 0) of
true -> Backend:randomized_startup_delay_range();
false -> ?DEFAULT_STARTUP_RANDOMIZED_DELAY
end,
{Min, Max} = case application:get_env(rabbit, cluster_formation) of
{ok, Proplist} ->
proplists:get_value(randomized_startup_delay_range, Proplist, Default);
undefined ->
Default
end,
{Min * 1000, Max * 1000}.


-spec register() -> ok.

register() ->
Expand Down
56 changes: 27 additions & 29 deletions deps/rabbit/src/rabbit_peer_discovery_classic_config.erl
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,36 @@ list_nodes() ->
Nodes when is_list(Nodes) -> {ok, {Nodes, disc}}
end.

-spec lock(Node :: node()) -> {ok, {ResourceId :: string(), LockRequesterId :: node()}} | {error, Reason :: string()}.

lock(Node) ->
{ok, {Nodes, _NodeType}} = list_nodes(),
case lists:member(Node, Nodes) of
false when Nodes =/= [] ->
rabbit_log:warning("Local node ~s is not part of configured nodes ~p. "
"This might lead to incorrect cluster formation.", [Node, Nodes]);
_ -> ok
end,
LockId = rabbit_nodes:lock_id(Node),
Retries = rabbit_nodes:lock_retries(),
case global:set_lock(LockId, Nodes, Retries) of
true ->
{ok, LockId};
false ->
{error, io_lib:format("Acquiring lock taking too long, bailing out after ~b retries", [Retries])}
end.

-spec unlock({ResourceId :: string(), LockRequesterId :: node()}) -> ok.

unlock(LockId) ->
{ok, {Nodes, _NodeType}} = list_nodes(),
global:del_lock(LockId, Nodes),
ok.

-spec supports_registration() -> boolean().

supports_registration() ->
%% If we don't have any nodes configured, skip randomized delay and similar operations
%% as we don't want to delay startup for no reason. MK.
has_any_peer_nodes_configured().
false.

-spec register() -> ok.

Expand All @@ -47,29 +71,3 @@ unregister() ->

post_registration() ->
ok.

-spec lock(Node :: atom()) -> not_supported.

lock(_Node) ->
not_supported.

-spec unlock(Data :: term()) -> ok.

unlock(_Data) ->
ok.

%%
%% Helpers
%%

has_any_peer_nodes_configured() ->
case application:get_env(rabbit, cluster_nodes, []) of
{[], _NodeType} ->
false;
{Nodes, _NodeType} when is_list(Nodes) ->
true;
[] ->
false;
Nodes when is_list(Nodes) ->
true
end.
17 changes: 8 additions & 9 deletions deps/rabbit/test/config_schema_SUITE_data/rabbit.snippets
Original file line number Diff line number Diff line change
Expand Up @@ -540,23 +540,22 @@ tcp_listen_options.exit_on_close = false",
{cluster_formation_randomized_startup_delay_both_values,
"cluster_formation.randomized_startup_delay_range.min = 10
cluster_formation.randomized_startup_delay_range.max = 30",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {10, 30}}
]}]}],
[],
[]},

{cluster_formation_randomized_startup_delay_min_only,
"cluster_formation.randomized_startup_delay_range.min = 10",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {10, 60}}
]}]}],
[],
[]},

{cluster_formation_randomized_startup_delay_max_only,
"cluster_formation.randomized_startup_delay_range.max = 30",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {5, 30}}
]}]}],
[],
[]},

{cluster_formation_internal_lock_retries,
"cluster_formation.internal_lock_retries = 10",
[{rabbit,[{cluster_formation,[{internal_lock_retries,10}]}]}],
[]},

{cluster_formation_dns,
Expand Down
Loading

0 comments on commit ed0ba6a

Please sign in to comment.