Skip to content

Commit

Permalink
watchdog: Optimize WatchdogImpl::touch in preparation to more frequen…
Browse files Browse the repository at this point in the history
…t petting of the watchdog. (#13103)

Signed-off-by: Antonio Vicente <[email protected]>
  • Loading branch information
antoniovicente authored Oct 20, 2020
1 parent 58bc780 commit 4797e7a
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 46 deletions.
1 change: 0 additions & 1 deletion include/envoy/server/watchdog.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class WatchDog {
*/
virtual void touch() PURE;
virtual Thread::ThreadId threadId() const PURE;
virtual MonotonicTime lastTouchTime() const PURE;
};

using WatchDogSharedPtr = std::shared_ptr<WatchDog>;
Expand Down
36 changes: 20 additions & 16 deletions source/server/guarddog_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,13 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio
GuardDogImpl::~GuardDogImpl() { stop(); }

void GuardDogImpl::step() {
{
Thread::LockGuard guard(mutex_);
if (!run_thread_) {
return;
}
// Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks
// and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in
// a full iteration of the step() function to process recent watchdog touches and monotonic time
// changes.
Thread::LockGuard guard(mutex_);
if (!run_thread_) {
return;
}

const auto now = time_source_.monotonicTime();
Expand All @@ -123,7 +125,13 @@ void GuardDogImpl::step() {
static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));

for (auto& watched_dog : watched_dogs_) {
const auto last_checkin = watched_dog->dog_->lastTouchTime();
if (watched_dog->dog_->getTouchedAndReset()) {
// Watchdog was touched since the guard dog last checked; update last check-in time.
watched_dog->last_checkin_ = now;
continue;
}

const auto last_checkin = watched_dog->last_checkin_;
const auto tid = watched_dog->dog_->threadId();
const auto delta = now - last_checkin;
if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) {
Expand Down Expand Up @@ -170,12 +178,9 @@ void GuardDogImpl::step() {
invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now);
}

{
Thread::LockGuard guard(mutex_);
test_interlock_hook_->signalFromImpl(now);
if (run_thread_) {
loop_timer_->enableTimer(loop_interval_);
}
test_interlock_hook_->signalFromImpl();
if (run_thread_) {
loop_timer_->enableTimer(loop_interval_);
}
}

Expand All @@ -186,14 +191,13 @@ WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
// accessed out of the locked section below is const (time_source_ has no
// state).
const auto wd_interval = loop_interval_ / 2;
WatchDogSharedPtr new_watchdog =
std::make_shared<WatchDogImpl>(std::move(thread_id), time_source_, wd_interval);
auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id), wd_interval);
WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog);
new_watchdog->touch();
{
Thread::LockGuard guard(wd_lock_);
watched_dogs_.push_back(std::move(watched_dog));
}
new_watchdog->touch();
return new_watchdog;
}

Expand Down Expand Up @@ -242,7 +246,7 @@ void GuardDogImpl::invokeGuardDogActions(
}

GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
const WatchDogSharedPtr& watch_dog)
const WatchDogImplSharedPtr& watch_dog)
: dog_(watch_dog),
miss_counter_(stats_scope.counterFromStatName(
Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name),
Expand Down
29 changes: 16 additions & 13 deletions source/server/guarddog_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "common/common/thread.h"
#include "common/event/libevent.h"

#include "server/watchdog_impl.h"

#include "absl/types/optional.h"

namespace Envoy {
Expand Down Expand Up @@ -45,16 +47,17 @@ class GuardDogImpl : public GuardDog {
virtual ~TestInterlockHook() = default;

/**
* Called from GuardDogImpl to indicate that it has evaluated all watch-dogs
* up to a particular point in time.
* Called from GuardDogImpl to indicate that it has evaluated all watch-dogs up to a particular
* point in time. Called while the GuardDog mutex is held.
*/
virtual void signalFromImpl(MonotonicTime) {}
virtual void signalFromImpl() {}

/**
* Called from GuardDog tests to block until the implementation has reached
* the desired point in time.
* Called from GuardDog tests to block until the implementation has reached the desired
* condition. Called while the GuardDog mutex is held.
* @param mutex The GuardDog's mutex for use by Thread::CondVar::wait.
*/
virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {}
virtual void waitFromTest(Thread::MutexBasicLockable& /*mutex*/) {}
};

/**
Expand All @@ -79,15 +82,13 @@ class GuardDogImpl : public GuardDog {
const std::chrono::milliseconds loopIntervalForTest() const { return loop_interval_; }

/**
* Test hook to force a step() to catch up with the current simulated
* time. This is inlined so that it does not need to be present in the
* production binary.
* Test hook to force a step() to catch up with the current watchdog state and simulated time.
* This is inlined so that it does not need to be present in the production binary.
*/
void forceCheckForTest() {
Thread::LockGuard guard(mutex_);
MonotonicTime now = time_source_.monotonicTime();
loop_timer_->enableTimer(std::chrono::milliseconds(0));
test_interlock_hook_->waitFromTest(mutex_, now);
test_interlock_hook_->waitFromTest(mutex_);
}

// Server::GuardDog
Expand All @@ -111,11 +112,13 @@ class GuardDogImpl : public GuardDog {
std::vector<std::pair<Thread::ThreadId, MonotonicTime>> thread_last_checkin_pairs,
MonotonicTime now);

using WatchDogImplSharedPtr = std::shared_ptr<WatchDogImpl>;
struct WatchedDog {
WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name,
const WatchDogSharedPtr& watch_dog);
const WatchDogImplSharedPtr& watch_dog);

const WatchDogSharedPtr dog_;
const WatchDogImplSharedPtr dog_;
MonotonicTime last_checkin_;
absl::optional<MonotonicTime> last_alert_time_;
bool miss_alerted_{};
bool megamiss_alerted_{};
Expand Down
18 changes: 8 additions & 10 deletions source/server/watchdog_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,24 @@ class WatchDogImpl : public WatchDog {
/**
* @param interval WatchDog timer interval (used after startWatchdog())
*/
WatchDogImpl(Thread::ThreadId thread_id, TimeSource& tsource, std::chrono::milliseconds interval)
: thread_id_(thread_id), time_source_(tsource),
latest_touch_time_since_epoch_(tsource.monotonicTime().time_since_epoch()),
timer_interval_(interval) {}
WatchDogImpl(Thread::ThreadId thread_id, std::chrono::milliseconds interval)
: thread_id_(thread_id), timer_interval_(interval) {}

Thread::ThreadId threadId() const override { return thread_id_; }
MonotonicTime lastTouchTime() const override {
return MonotonicTime(latest_touch_time_since_epoch_.load());
}
// Used by GuardDogImpl determine if the watchdog was touched recently and reset the touch status.
bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_relaxed); }

// Server::WatchDog
void startWatchdog(Event::Dispatcher& dispatcher) override;
void touch() override {
latest_touch_time_since_epoch_.store(time_source_.monotonicTime().time_since_epoch());
// Set touched_ if not already set.
bool expected = false;
touched_.compare_exchange_strong(expected, true, std::memory_order_relaxed);
}

private:
const Thread::ThreadId thread_id_;
TimeSource& time_source_;
std::atomic<std::chrono::steady_clock::duration> latest_touch_time_since_epoch_;
std::atomic<bool> touched_{false};
Event::TimerPtr timer_;
const std::chrono::milliseconds timer_interval_;
};
Expand Down
1 change: 0 additions & 1 deletion test/mocks/server/watch_dog.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ class MockWatchDog : public WatchDog {
MOCK_METHOD(void, startWatchdog, (Event::Dispatcher & dispatcher));
MOCK_METHOD(void, touch, ());
MOCK_METHOD(Thread::ThreadId, threadId, (), (const));
MOCK_METHOD(MonotonicTime, lastTouchTime, (), (const));
};
} // namespace Server
} // namespace Envoy
21 changes: 16 additions & 5 deletions test/server/guarddog_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,23 @@ const int DISABLE_MEGAMISS = 1000000;
class DebugTestInterlock : public GuardDogImpl::TestInterlockHook {
public:
// GuardDogImpl::TestInterlockHook
void signalFromImpl(MonotonicTime time) override {
impl_reached_ = time;
void signalFromImpl() override {
waiting_for_signal_ = false;
impl_.notifyAll();
}

void waitFromTest(Thread::MutexBasicLockable& mutex, MonotonicTime time) override
void waitFromTest(Thread::MutexBasicLockable& mutex) override
ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex) {
while (impl_reached_ < time) {
ASSERT(!waiting_for_signal_);
waiting_for_signal_ = true;
while (waiting_for_signal_) {
impl_.wait(mutex);
}
}

private:
Thread::CondVar impl_;
MonotonicTime impl_reached_;
bool waiting_for_signal_ = false;
};

// We want to make sure guard-dog is tested with both simulated time and real
Expand Down Expand Up @@ -307,6 +309,7 @@ TEST_P(GuardDogMissTest, MissTest) {
initGuardDog(stats_store_, config_miss_);
auto unpet_dog =
guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread");
guard_dog_->forceCheckForTest();
// We'd better start at 0:
checkMiss(0, "MissTest check 1");
// At 300ms we shouldn't have hit the timeout yet:
Expand All @@ -332,6 +335,7 @@ TEST_P(GuardDogMissTest, MegaMissTest) {
initGuardDog(stats_store_, config_mega_);
auto unpet_dog =
guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread");
guard_dog_->forceCheckForTest();
// We'd better start at 0:
checkMegaMiss(0, "MegaMissTest check 1");
// This shouldn't be enough to increment the stat:
Expand All @@ -358,6 +362,7 @@ TEST_P(GuardDogMissTest, MissCountTest) {
initGuardDog(stats_store_, config_miss_);
auto sometimes_pet_dog =
guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread");
guard_dog_->forceCheckForTest();
// These steps are executed once without ever touching the watchdog.
// Then the last step is to touch the watchdog and repeat the steps.
// This verifies that the behavior is reset back to baseline after a touch.
Expand All @@ -380,9 +385,11 @@ TEST_P(GuardDogMissTest, MissCountTest) {
// When we finally touch the dog we should get one more increment once the
// timeout value expires:
sometimes_pet_dog->touch();
guard_dog_->forceCheckForTest();
}
time_system_->advanceTimeWait(std::chrono::milliseconds(1000));
sometimes_pet_dog->touch();
guard_dog_->forceCheckForTest();
// Make sure megamiss still works:
checkMegaMiss(0UL, "MissCountTest check 5");
time_system_->advanceTimeWait(std::chrono::milliseconds(1500));
Expand Down Expand Up @@ -656,6 +663,7 @@ TEST_P(GuardDogActionsTest, MissShouldSaturateOnMissEvent) {

// Touch the watchdog, which should allow the event to trigger again.
first_dog_->touch();
guard_dog_->forceCheckForTest();

time_system_->advanceTimeWait(std::chrono::milliseconds(101));
guard_dog_->forceCheckForTest();
Expand Down Expand Up @@ -718,6 +726,7 @@ TEST_P(GuardDogActionsTest, MegaMissShouldSaturateOnMegaMissEvent) {

// Touch the watchdog, which should allow the event to trigger again.
first_dog_->touch();
guard_dog_->forceCheckForTest();

time_system_->advanceTimeWait(std::chrono::milliseconds(101));
guard_dog_->forceCheckForTest();
Expand All @@ -733,6 +742,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) {
initGuardDog(fake_stats_, config);
auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread");
auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread");
guard_dog_->forceCheckForTest();
time_system_->advanceTimeWait(std::chrono::milliseconds(101));
guard_dog_->forceCheckForTest();
};
Expand All @@ -747,6 +757,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) {
initGuardDog(fake_stats_, config);
auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread");
auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread");
guard_dog_->forceCheckForTest();
time_system_->advanceTimeWait(std::chrono::milliseconds(101));
guard_dog_->forceCheckForTest();
};
Expand Down

0 comments on commit 4797e7a

Please sign in to comment.