Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add request timeout for ValidatorSites: #2902

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions src/ripple/app/misc/ValidatorSite.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ namespace ripple {

@li @c "version": 1

@li @c "refreshInterval" (optional)
@li @c "refreshInterval" (optional, integer minutes).
This value is clamped internally to [1,1440] (1 min - 1 day)
*/
class ValidatorSite
{
Expand Down Expand Up @@ -125,11 +126,15 @@ class ValidatorSite
// The configured list of URIs for fetching lists
std::vector<Site> sites_;

// time to allow for requests to complete
const std::chrono::seconds requestTimeout_;

public:
ValidatorSite (
boost::asio::io_service& ios,
ValidatorList& validators,
beast::Journal j);
beast::Journal j,
std::chrono::seconds timeout = std::chrono::seconds{20});
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just an observation. When I first saw this my inclination was to move the Journal to the end of the arguments, since that matches our usual parameter order. I see you made an exception to that expectation so you could default the timeout.

After playing around with it a bit I'm comfortable with your choice. There's (currently) only one place where the timeout is explicitly passed in by the caller, and that's in a unit test.

Not asking for a change. I just thought you'd want to know that I played around with the options and agree that you made the right choice.

~ValidatorSite ();

/** Load configured site URIs.
Expand Down Expand Up @@ -184,8 +189,15 @@ class ValidatorSite

private:
/// Queue next site to be fetched
/// lock over state_mutex_ required
void
setTimer (std::lock_guard<std::mutex>&);

/// request took too long
void
setTimer ();
onRequestTimeout (
std::size_t siteIdx,
error_code const& ec);

/// Fetch site whose time has come
void
Expand Down
139 changes: 103 additions & 36 deletions src/ripple/app/misc/impl/ValidatorSite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@
#include <ripple/basics/Slice.h>
#include <ripple/json/json_reader.h>
#include <ripple/protocol/JsonFields.h>
#include <boost/algorithm/clamp.hpp>
#include <boost/regex.hpp>
#include <algorithm>

namespace ripple {

// default site query frequency - 5 minutes
auto constexpr DEFAULT_REFRESH_INTERVAL = std::chrono::minutes{5};
auto constexpr ERROR_RETRY_INTERVAL = std::chrono::seconds{30};
unsigned short constexpr MAX_REDIRECTS = 3;
auto constexpr default_refresh_interval = std::chrono::minutes{5};
auto constexpr error_retry_interval = std::chrono::seconds{30};
unsigned short constexpr max_redirects = 3;

ValidatorSite::Site::Resource::Resource (std::string uri_)
: uri {std::move(uri_)}
Expand Down Expand Up @@ -82,22 +82,24 @@ ValidatorSite::Site::Site (std::string uri)
: loadedResource {std::make_shared<Resource>(std::move(uri))}
, startingResource {loadedResource}
, redirCount {0}
, refreshInterval {DEFAULT_REFRESH_INTERVAL}
, refreshInterval {default_refresh_interval}
, nextRefresh {clock_type::now()}
{
}

ValidatorSite::ValidatorSite (
boost::asio::io_service& ios,
ValidatorList& validators,
beast::Journal j)
beast::Journal j,
std::chrono::seconds timeout)
: ios_ (ios)
, validators_ (validators)
, j_ (j)
, timer_ (ios_)
, fetching_ (false)
, pending_ (false)
, stopping_ (false)
, requestTimeout_ (timeout)
{
}

Expand Down Expand Up @@ -153,7 +155,7 @@ ValidatorSite::start ()
{
std::lock_guard <std::mutex> lock{state_mutex_};
if (timer_.expires_at() == clock_type::time_point{})
setTimer ();
setTimer (lock);
}

void
Expand All @@ -168,20 +170,29 @@ ValidatorSite::stop()
{
std::unique_lock<std::mutex> lock{state_mutex_};
stopping_ = true;
cv_.wait(lock, [&]{ return ! fetching_; });

// work::cancel() must be called before the
// cv wait in order to kick any asio async operations
// that might be pending.
if(auto sp = work_.lock())
sp->cancel();
cv_.wait(lock, [&]{ return ! fetching_; });

error_code ec;
timer_.cancel(ec);
// docs indicate cancel() can throw, but this should be
// reconsidered if it changes to noexcept
try
{
timer_.cancel();
}
catch (boost::system::system_error const&)
{
}
stopping_ = false;
pending_ = false;
cv_.notify_all();
}

void
ValidatorSite::setTimer ()
ValidatorSite::setTimer (std::lock_guard<std::mutex>& state_lock)
{
std::lock_guard <std::mutex> lock{sites_mutex_};

Expand All @@ -196,31 +207,54 @@ ValidatorSite::setTimer ()
pending_ = next->nextRefresh <= clock_type::now();
cv_.notify_all();
timer_.expires_at (next->nextRefresh);
timer_.async_wait (std::bind (&ValidatorSite::onTimer, this,
std::distance (sites_.begin (), next), std::placeholders::_1));
auto idx = std::distance (sites_.begin (), next);
timer_.async_wait ([this, idx] (boost::system::error_code const& ec)
{
this->onTimer (idx, ec);
});
}
}

void
ValidatorSite::makeRequest (
std::shared_ptr<Site::Resource> resource,
std::size_t siteIdx,
std::lock_guard<std::mutex>& lock)
std::lock_guard<std::mutex>& sites_lock)
{
fetching_ = true;
sites_[siteIdx].activeResource = resource;
std::shared_ptr<detail::Work> sp;
auto timeoutCancel =
[this] ()
{
std::lock_guard <std::mutex> lock_state{state_mutex_};
// docs indicate cancel_one() can throw, but this
// should be reconsidered if it changes to noexcept
try
{
timer_.cancel_one();
}
catch (boost::system::system_error const&)
{
}
};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overload of cancel_one is deprecated. You can use the non-error_code overload. eg. timer_.cancel_one();

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

auto onFetch =
[this, siteIdx] (error_code const& err, detail::response_type&& resp)
[this, siteIdx, timeoutCancel] (
error_code const& err, detail::response_type&& resp)
{
timeoutCancel ();
onSiteFetch (err, std::move(resp), siteIdx);
};

auto onFetchFile =
[this, siteIdx] (error_code const& err, std::string const& resp)
{
onTextFetch (err, resp, siteIdx);
};
[this, siteIdx, timeoutCancel] (
error_code const& err, std::string const& resp)
{
timeoutCancel ();
onTextFetch (err, resp, siteIdx);
};

JLOG (j_.debug()) << "Starting request for " << resource->uri;

if (resource->pUrl.scheme == "https")
{
Expand Down Expand Up @@ -252,6 +286,34 @@ ValidatorSite::makeRequest (

work_ = sp;
sp->run ();
// start a timer for the request, which shouldn't take more
// than requestTimeout_ to complete
std::lock_guard <std::mutex> lock_state{state_mutex_};
timer_.expires_after (requestTimeout_);
timer_.async_wait ([this, siteIdx] (boost::system::error_code const& ec)
{
this->onRequestTimeout (siteIdx, ec);
});
}

void
ValidatorSite::onRequestTimeout (
std::size_t siteIdx,
error_code const& ec)
{
if (ec)
return;

{
std::lock_guard <std::mutex> lock_site{sites_mutex_};
JLOG (j_.warn()) <<
"Request for " << sites_[siteIdx].activeResource->uri <<
" took too long";
}

std::lock_guard<std::mutex> lock_state{state_mutex_};
if(auto sp = work_.lock())
sp->cancel();
}

void
Expand All @@ -268,14 +330,12 @@ ValidatorSite::onTimer (
return;
}

std::lock_guard <std::mutex> lock{sites_mutex_};
sites_[siteIdx].nextRefresh =
clock_type::now() + sites_[siteIdx].refreshInterval;

assert(! fetching_);
sites_[siteIdx].redirCount = 0;
try
{
std::lock_guard <std::mutex> lock{sites_mutex_};
sites_[siteIdx].nextRefresh =
clock_type::now() + sites_[siteIdx].refreshInterval;
sites_[siteIdx].redirCount = 0;
// the WorkSSL client can throw if SSL init fails
makeRequest(sites_[siteIdx].startingResource, siteIdx, lock);
}
Expand All @@ -292,7 +352,7 @@ void
ValidatorSite::parseJsonResponse (
std::string const& res,
std::size_t siteIdx,
std::lock_guard<std::mutex>& lock)
std::lock_guard<std::mutex>& sites_lock)
{
Json::Reader r;
Json::Value body;
Expand Down Expand Up @@ -370,18 +430,23 @@ ValidatorSite::parseJsonResponse (
if (body.isMember ("refresh_interval") &&
body["refresh_interval"].isNumeric ())
{
// TODO: should we sanity check/clamp this value
// to something reasonable?
sites_[siteIdx].refreshInterval =
std::chrono::minutes{body["refresh_interval"].asUInt ()};
using namespace std::chrono_literals;
std::chrono::minutes const refresh =
boost::algorithm::clamp(
std::chrono::minutes {body["refresh_interval"].asUInt ()},
1min,
24h);
sites_[siteIdx].refreshInterval = refresh;
sites_[siteIdx].nextRefresh =
clock_type::now() + sites_[siteIdx].refreshInterval;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the following code is a bit more consistent with time units and possibly easier to read:

        using namespace std::chrono_literals;
        std::chrono::minutes const refresh =
            boost::algorithm::clamp(
                std::chrono::minutes {body["refresh_interval"].asUInt ()},
                1min,
                1h * 24);
        sites_[siteIdx].refreshInterval = refresh;
        sites_[siteIdx].nextRefresh =
            clock_type::now() + sites_[siteIdx].refreshInterval;

Your choice.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 - fixed

Copy link
Contributor

@miguelportilla miguelportilla Apr 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion... 1h * 24 -> 24h

}
}

std::shared_ptr<ValidatorSite::Site::Resource>
ValidatorSite::processRedirect (
detail::response_type& res,
std::size_t siteIdx,
std::lock_guard<std::mutex>& lock)
std::lock_guard<std::mutex>& sites_lock)
{
using namespace boost::beast::http;
std::shared_ptr<Site::Resource> newLocation;
Expand All @@ -395,7 +460,7 @@ ValidatorSite::processRedirect (
throw std::runtime_error{"missing location"};
}

if (sites_[siteIdx].redirCount == MAX_REDIRECTS)
if (sites_[siteIdx].redirCount == max_redirects)
{
JLOG (j_.warn()) <<
"Exceeded max redirects for validator list at " <<
Expand Down Expand Up @@ -435,6 +500,8 @@ ValidatorSite::onSiteFetch(
{
{
std::lock_guard <std::mutex> lock_sites{sites_mutex_};
JLOG (j_.debug()) << "Got completion for "
<< sites_[siteIdx].activeResource->uri;
auto onError = [&](std::string const& errMsg, bool retry)
{
sites_[siteIdx].lastRefreshStatus.emplace(
Expand All @@ -443,7 +510,7 @@ ValidatorSite::onSiteFetch(
errMsg});
if (retry)
sites_[siteIdx].nextRefresh =
clock_type::now() + ERROR_RETRY_INTERVAL;
clock_type::now() + error_retry_interval;
};
if (ec)
{
Expand Down Expand Up @@ -506,7 +573,7 @@ ValidatorSite::onSiteFetch(
std::lock_guard <std::mutex> lock_state{state_mutex_};
fetching_ = false;
if (! stopping_)
setTimer ();
setTimer (lock_state);
cv_.notify_all();
}

Expand Down Expand Up @@ -547,7 +614,7 @@ ValidatorSite::onTextFetch(
std::lock_guard <std::mutex> lock_state{state_mutex_};
fetching_ = false;
if (! stopping_)
setTimer ();
setTimer (lock_state);
cv_.notify_all();
}

Expand Down
Loading