-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
watchdog: Optimize WatchdogImpl::touch in preparation to more frequent petting of the watchdog. #13103
watchdog: Optimize WatchdogImpl::touch in preparation to more frequent petting of the watchdog. #13103
Changes from 4 commits
bd92bc2
e6afe7b
08ee875
3b8b2a9
6b623b6
3f962a1
bcad016
d436eaf
c971f62
1d5b149
e9db878
5a3ecaf
3b6313a
8e54372
39dbb11
fc49f33
9ee42f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,11 +81,9 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio | |
GuardDogImpl::~GuardDogImpl() { stop(); } | ||
|
||
void GuardDogImpl::step() { | ||
{ | ||
Thread::LockGuard guard(mutex_); | ||
if (!run_thread_) { | ||
return; | ||
} | ||
Thread::LockGuard guard(mutex_); | ||
if (!run_thread_) { | ||
return; | ||
} | ||
|
||
const auto now = time_source_.monotonicTime(); | ||
|
@@ -102,7 +100,14 @@ void GuardDogImpl::step() { | |
static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size()))); | ||
|
||
for (auto& watched_dog : watched_dogs_) { | ||
const auto ltt = watched_dog->dog_->lastTouchTime(); | ||
if (watched_dog->dog_->touchCount() > 0) { | ||
// Watchdog was touched since guarddog last checked; update time and reset the count. | ||
watched_dog->dog_->resetTouchCount(); | ||
watched_dog->last_touch_time_ = now; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we'll lose some accuracy since the last touch time is set here instead of when the watchdog is actually touched. Is this ok? Should we then rename it to something else? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is still a timestamp from which the most recent touch was detected. I agree that there's a change in precision; watchdog miss/megamiss/kill/multikill would be delayed by up to min(miss_timeout / 2, megamiss_timeout / 2, kill_timeout / 2, multikill_timeout / 2). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed to last_checkin_ for consistency after your recent change |
||
continue; | ||
} | ||
|
||
const auto ltt = watched_dog->last_touch_time_; | ||
const auto tid = watched_dog->dog_->threadId(); | ||
const auto delta = now - ltt; | ||
if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < ltt) { | ||
|
@@ -156,12 +161,9 @@ void GuardDogImpl::step() { | |
invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now); | ||
} | ||
|
||
{ | ||
Thread::LockGuard guard(mutex_); | ||
test_interlock_hook_->signalFromImpl(now); | ||
if (run_thread_) { | ||
loop_timer_->enableTimer(loop_interval_); | ||
} | ||
test_interlock_hook_->signalFromImpl(); | ||
if (run_thread_) { | ||
loop_timer_->enableTimer(loop_interval_); | ||
} | ||
} | ||
|
||
|
@@ -172,14 +174,13 @@ WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id, | |
// accessed out of the locked section below is const (time_source_ has no | ||
// state). | ||
const auto wd_interval = loop_interval_ / 2; | ||
WatchDogSharedPtr new_watchdog = | ||
std::make_shared<WatchDogImpl>(std::move(thread_id), time_source_, wd_interval); | ||
auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id), wd_interval); | ||
WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog); | ||
new_watchdog->touch(); | ||
{ | ||
Thread::LockGuard guard(wd_lock_); | ||
watched_dogs_.push_back(std::move(watched_dog)); | ||
} | ||
new_watchdog->touch(); | ||
return new_watchdog; | ||
} | ||
|
||
|
@@ -227,7 +228,7 @@ void GuardDogImpl::invokeGuardDogActions( | |
} | ||
|
||
GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, | ||
const WatchDogSharedPtr& watch_dog) | ||
const WatchDogImplSharedPtr& watch_dog) | ||
: dog_(watch_dog), | ||
miss_counter_(stats_scope.counterFromStatName( | ||
Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,26 +19,24 @@ class WatchDogImpl : public WatchDog { | |
/** | ||
* @param interval WatchDog timer interval (used after startWatchdog()) | ||
*/ | ||
WatchDogImpl(Thread::ThreadId thread_id, TimeSource& tsource, std::chrono::milliseconds interval) | ||
: thread_id_(thread_id), time_source_(tsource), | ||
latest_touch_time_since_epoch_(tsource.monotonicTime().time_since_epoch()), | ||
timer_interval_(interval) {} | ||
WatchDogImpl(Thread::ThreadId thread_id, std::chrono::milliseconds interval) | ||
: thread_id_(thread_id), timer_interval_(interval) {} | ||
|
||
Thread::ThreadId threadId() const override { return thread_id_; } | ||
MonotonicTime lastTouchTime() const override { | ||
return MonotonicTime(latest_touch_time_since_epoch_.load()); | ||
} | ||
// Accessors for use by GuardDogImpl to tell if the watchdog was touched recently and reset the | ||
// count back to 0 when it has. | ||
uint64_t touchCount() const { return touch_count_.load(); } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we're going with uint32_t internally, perhaps we should return that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops, fixed. |
||
void resetTouchCount() { touch_count_.store(0); } | ||
|
||
// Server::WatchDog | ||
void startWatchdog(Event::Dispatcher& dispatcher) override; | ||
void touch() override { | ||
latest_touch_time_since_epoch_.store(time_source_.monotonicTime().time_since_epoch()); | ||
} | ||
void touch() override { touch_count_.fetch_add(1); } | ||
|
||
private: | ||
const Thread::ThreadId thread_id_; | ||
TimeSource& time_source_; | ||
std::atomic<std::chrono::steady_clock::duration> latest_touch_time_since_epoch_; | ||
// Use a 32-bit atomic counter since 64-bit atomics are not generally available in 32-bit ARM and | ||
// 32-bit is large enough for our purposes. | ||
std::atomic<uint32_t> touch_count_ = 0; | ||
Event::TimerPtr timer_; | ||
const std::chrono::milliseconds timer_interval_; | ||
}; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not clear to me why this lock now needs to be held for the entire function. Can you possibly add some more comments? Should there by GUARDED_BY annotations to make this more clear? Maybe this is just to simplify the logic?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a comment. This is part of the fix to test interlock; we need a full run of the step function after forceCheckForTest. Performance shouldn't suffer since the use of mutex_ in the step() function should never be contended; the other uses of this mutex are GuardDogImpl::start, stop and forceCheckForTest.