This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Avoid rapidly backing-off a server if we ignore the retry interval #5335

Merged
merged 1 commit on Jun 4, 2019
1 change: 1 addition & 0 deletions changelog.d/5335.bugfix
@@ -0,0 +1 @@
Fix a bug where we could rapidly mark a server as unreachable even though it was only down for a few minutes.
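
To make that failure mode concrete, here is a minimal, self-contained sketch (not part of this PR) of how quickly the retry interval can grow when every failed attempt multiplies it. It assumes the growth rule implied by the defaults in the diff below: start at `min_retry_interval`, multiply by `multiplier_retry_interval` on each further failure, and cap at `max_retry_interval`.

```python
# Illustration only (not code from this PR): how fast the backoff reaches its
# cap if every failure multiplies the interval, using the defaults of
# RetryDestinationLimiter shown below.
MIN_RETRY_INTERVAL = 10 * 60 * 1000       # 10 minutes, in ms
MAX_RETRY_INTERVAL = 24 * 60 * 60 * 1000  # 24 hours, in ms
MULTIPLIER = 5                            # multiplier_retry_interval


def next_interval(current):
    """Assumed growth rule: first failure uses the minimum, later ones multiply."""
    if not current:
        return MIN_RETRY_INTERVAL
    return min(current * MULTIPLIER, MAX_RETRY_INTERVAL)


interval = 0
for failure in range(1, 6):
    interval = next_interval(interval)
    print("failure %i: next retry in %i minutes" % (failure, interval // 60000))
# failure 1: next retry in 10 minutes
# failure 2: next retry in 50 minutes
# failure 3: next retry in 250 minutes
# failure 4: next retry in 1250 minutes
# failure 5: next retry in 1440 minutes
# i.e. five quick failures in a row already hit the 24 hour cap.
```

A caller using `ignore_backoff=True` keeps sending requests instead of waiting, so a destination that is down for only a few minutes can accumulate enough failures to hit that cap; the change to retryutils.py below stops counting those failures against the interval.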
60 changes: 37 additions & 23 deletions synapse/util/retryutils.py
@@ -46,8 +46,7 @@ def __init__(self, retry_last_ts, retry_interval, destination):


@defer.inlineCallbacks
def get_retry_limiter(destination, clock, store, ignore_backoff=False,
**kwargs):
def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs):
"""For a given destination check if we have previously failed to
send a request there and are waiting before retrying the destination.
If we are not ready to retry the destination, this will raise a
@@ -60,8 +59,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
clock (synapse.util.clock): timing source
store (synapse.storage.transactions.TransactionStore): datastore
ignore_backoff (bool): true to ignore the historical backoff data and
try the request anyway. We will still update the next
retry_interval on success/failure.
try the request anyway. We will still reset the retry_interval on success.

Example usage:

@@ -75,13 +73,12 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
"""
retry_last_ts, retry_interval = (0, 0)

retry_timings = yield store.get_destination_retry_timings(
destination
)
retry_timings = yield store.get_destination_retry_timings(destination)

if retry_timings:
retry_last_ts, retry_interval = (
retry_timings["retry_last_ts"], retry_timings["retry_interval"]
retry_timings["retry_last_ts"],
retry_timings["retry_interval"],
)

now = int(clock.time_msec())
@@ -93,22 +90,31 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
destination=destination,
)

# if we are ignoring the backoff data, we should also not increment the backoff
# when we get another failure - otherwise a server can very quickly reach the
# maximum backoff even though it might only have been down briefly
backoff_on_failure = not ignore_backoff

defer.returnValue(
RetryDestinationLimiter(
destination,
clock,
store,
retry_interval,
**kwargs
destination, clock, store, retry_interval, backoff_on_failure=backoff_on_failure, **kwargs
)
)


class RetryDestinationLimiter(object):
def __init__(self, destination, clock, store, retry_interval,
min_retry_interval=10 * 60 * 1000,
max_retry_interval=24 * 60 * 60 * 1000,
multiplier_retry_interval=5, backoff_on_404=False):
def __init__(
self,
destination,
clock,
store,
retry_interval,
min_retry_interval=10 * 60 * 1000,
max_retry_interval=24 * 60 * 60 * 1000,
multiplier_retry_interval=5,
backoff_on_404=False,
backoff_on_failure=True,
):
"""Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500.

@@ -128,6 +134,9 @@ def __init__(self, destination, clock, store, retry_interval,
multiplier_retry_interval (int): The multiplier to use to increase
the retry interval after a failed request.
backoff_on_404 (bool): Back off if we get a 404

backoff_on_failure (bool): set to False if we should not increase the
retry interval on a failure.
"""
self.clock = clock
self.store = store
@@ -138,6 +147,7 @@ def __init__(self, destination, clock, store, retry_interval,
self.max_retry_interval = max_retry_interval
self.multiplier_retry_interval = multiplier_retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure

def __enter__(self):
pass
@@ -173,10 +183,13 @@ def __exit__(self, exc_type, exc_val, exc_tb):
if not self.retry_interval:
return

logger.debug("Connection to %s was successful; clearing backoff",
self.destination)
logger.debug(
"Connection to %s was successful; clearing backoff", self.destination
)
retry_last_ts = 0
self.retry_interval = 0
elif not self.backoff_on_failure:
return
else:
# We couldn't connect.
if self.retry_interval:
@@ -190,7 +203,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):

logger.info(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
self.destination, exc_type, exc_val, self.retry_interval
self.destination,
exc_type,
exc_val,
self.retry_interval,
)
retry_last_ts = int(self.clock.time_msec())

@@ -201,9 +217,7 @@ def store_retry_timings():
self.destination, retry_last_ts, self.retry_interval
)
except Exception:
logger.exception(
"Failed to store destination_retry_timings",
)
logger.exception("Failed to store destination_retry_timings")

# we deliberately do this in the background.
synapse.util.logcontext.run_in_background(store_retry_timings)
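
For completeness, here is a hedged sketch of how calling code might use the limiter after this change. The `send_with_backoff` helper, its `do_request` callable, and the `destination`/`clock`/`store` values are assumptions for illustration; they are not defined in this PR.

```python
# Sketch only, assuming the synapse.util.retryutils API shown in this diff.
from twisted.internet import defer

from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter


@defer.inlineCallbacks
def send_with_backoff(destination, clock, store, do_request):
    try:
        # Pass ignore_backoff=True to try the request even while the
        # destination is marked as down; after this PR that also stops a
        # failure from multiplying the stored retry_interval.
        limiter = yield get_retry_limiter(destination, clock, store)
    except NotRetryingDestination:
        # We are still inside the destination's retry window; give up for now.
        raise

    with limiter:
        # On success the limiter clears any stored backoff for the destination.
        # On failure the exception propagates and (unless backoff_on_failure
        # was disabled) the stored retry_interval is multiplied, up to the
        # 24 hour cap.
        response = yield do_request(destination)

    defer.returnValue(response)
```

In Synapse itself the clock and datastore would come from the homeserver object; they are shown as plain parameters here only to keep the sketch self-contained.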