diff --git a/Builds/CMake/RippledCore.cmake b/Builds/CMake/RippledCore.cmake index ad9925b10ba..11f02bbd919 100644 --- a/Builds/CMake/RippledCore.cmake +++ b/Builds/CMake/RippledCore.cmake @@ -513,6 +513,7 @@ target_sources (rippled PRIVATE src/ripple/nodestore/impl/EncodedBlob.cpp src/ripple/nodestore/impl/ManagerImp.cpp src/ripple/nodestore/impl/NodeObject.cpp + src/ripple/nodestore/impl/RetryFinalize.cpp src/ripple/nodestore/impl/Shard.cpp src/ripple/nodestore/impl/TaskQueue.cpp #[===============================[ diff --git a/src/ripple/nodestore/RetryFinalize.h b/src/ripple/nodestore/RetryFinalize.h new file mode 100644 index 00000000000..c115e210273 --- /dev/null +++ b/src/ripple/nodestore/RetryFinalize.h @@ -0,0 +1,61 @@ +//------------------------------------------------------------------------------ +/* + This file is part of rippled: https://github.com/ripple/rippled + Copyright (c) 2020 Ripple Labs Inc. + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#ifndef RIPPLE_NODESTORE_RETRYFINALIZE_H_INCLUDED +#define RIPPLE_NODESTORE_RETRYFINALIZE_H_INCLUDED + +#include +#include + +namespace ripple { +namespace NodeStore { + +class RetryFinalize +{ +public: + using retryFunction = std::function; + + RetryFinalize() = default; + + bool + retry(Application& app, retryFunction f, std::uint32_t shardIndex); + + // Must match the imported shard's last ledger hash + uint256 referenceHash{0}; + +private: + using waitable_timer = + boost::asio::basic_waitable_timer; + + static constexpr std::chrono::seconds retryInterval_ = + std::chrono::seconds{60}; + + // Maximum attempts to retrieve a shard's last ledger hash + static constexpr uint32_t maxAttempts_{5}; + + std::unique_ptr timer_; + + // Number of attempts to retrieve a shard's last ledger hash + std::uint32_t numAttempts_{0}; +}; + +} // namespace NodeStore +} // namespace ripple + +#endif // RIPPLE_NODESTORE_RETRYFINALIZE_H_INCLUDED diff --git a/src/ripple/nodestore/ShardValidation.md b/src/ripple/nodestore/ShardValidation.md new file mode 100644 index 00000000000..ec66f054508 --- /dev/null +++ b/src/ripple/nodestore/ShardValidation.md @@ -0,0 +1,130 @@ +# Downloaded Shard Validation + +## Overview + +In order to validate shards that have been downloaded from file servers (as opposed to shards acquired from peers), the application must confirm the validity of the downloaded shard's last ledger. The following sections describe this confirmation process in greater detail. + +## Execution Concept + +### Flag Ledger + +Since the number of ledgers contained in each shard is always a multiple of 256, a shard's last ledger is always a flag ledger. Conveniently, the application provides a mechanism for retrieving the hash for a given flag ledger: + +```C++ +boost::optional +hashOfSeq (ReadView const& ledger, + LedgerIndex seq, + beast::Journal journal) +``` + +When validating downloaded shards, we use this function to retrieve the hash of the shard's last ledger. If the function returns a hash that matches the hash stored in the shard, validation of the shard can proceed. + +### Caveats + +#### Later Ledger + +The `getHashBySeq` function will provide the hash of a flag ledger only if the application has stored a later ledger. When validating a downloaded shard, if there is no later ledger stored, validation of the shard will be deferred until a later ledger has been stored. + +We employ a simple heuristic for determining whether the application has stored a ledger later than the last ledger of the downloaded shard: + +```C++ +// We use the presence (or absense) of the validated +// ledger as a heuristic for determining whether or +// not we have stored a ledger that comes after the +// last ledger in this shard. A later ledger must +// be present in order to reliably retrieve the hash +// of the shard's last ledger. +if (app_.getLedgerMaster().getValidatedLedger()) +{ + auto const hash = app_.getLedgerMaster().getHashBySeq( + lastLedgerSeq(shardIndex)); + + . + . + . +} +``` + +The `getHashBySeq` function will be invoked only when a call to `LedgerMaster::getValidatedLedger` returns a validated ledger, rather than a `nullptr`. Otherwise validation of the shard will be deferred. + +### Retries + +#### Retry Limit + +If the server must defer shard validation, the software will initiate a timer that upon expiration, will re-attempt confirming the last ledger hash. We place an upper limit on the number of attempts the server makes to achieve this confirmation. When the maximum number of attempts has been reached, validation of the shard will fail, resulting in the removal of the shard. An attempt counts toward the limit only when we are able to get a validated ledger (implying a current view of the network), but are unable to retrieve the last ledger hash. Retries that occur because no validated ledger was available are not counted. + +#### ShardInfo + +The `DatabaseShardImp` class stores a container of `ShardInfo` structs, each of which contains information pertaining to a shard held by the server. These structs will be used during shard import to store the last ledger hash (when available) and to track the number of hash confirmation attempts that have been made. + +```C++ +struct ShardInfo +{ + . + . + . + + // Used to limit the number of times we attempt + // to retrieve a shard's last ledger hash, when + // the hash should have been found. See + // scheduleFinalizeShard(). Once this limit has + // been exceeded, the shard has failed and is + // removed. + bool + attemptHashConfirmation() + { + if (lastLedgerHashAttempts + 1 <= maxLastLedgerHashAttempts) + { + ++lastLedgerHashAttempts; + return true; + } + + return false; + } + + // This variable is used during the validation + // of imported shards and must match the + // imported shard's last ledger hash. + uint256 lastLedgerHash; + + // The number of times we've attempted to + // confirm this shard's last ledger hash. + uint16_t lastLedgerHashAttempts; + + // The upper limit on attempts to confirm + // the shard's last ledger hash. + static const uint8_t maxLastLedgerHashAttempts = 5; +}; +``` + +### Shard Import + +Once a shard has been successfully downloaded by the `ShardArchiveHandler`, this class invokes the `importShard` method on the shard database: + +```C++ +bool +DatabaseShardImp::importShard( + std::uint32_t shardIndex, + boost::filesystem::path const& srcDir) +``` + +At the end of this method, `DatabaseShardImp::finalizeShard` is invoked which begins validation of the downloaded shard. This will be changed so that instead, the software first creates a task to confirm the last ledger hash. Upon the successful completion of this task, shard validation will begin. + +```C++ +bool +DatabaseShardImp::importShard( + std::uint32_t shardIndex, + boost::filesystem::path const& srcDir) +{ + . + . + . + + taskQueue_->addTask([this]() + { + // Verify hash. + // Invoke DatabaseShardImp::finalizeShard on success. + // Defer task if necessary. + }); +} +``` diff --git a/src/ripple/nodestore/impl/DatabaseShardImp.cpp b/src/ripple/nodestore/impl/DatabaseShardImp.cpp index ae1072b36ae..ac54d6aa352 100644 --- a/src/ripple/nodestore/impl/DatabaseShardImp.cpp +++ b/src/ripple/nodestore/impl/DatabaseShardImp.cpp @@ -391,7 +391,8 @@ DatabaseShardImp::importShard( if (auto const it {shards_.find(shardIndex)}; it == shards_.end() || it->second.shard || - it->second.state != ShardInfo::State::import) + it->second.state != ShardInfo::State::import || + it->second.retryFinalize) { JLOG(j_.error()) << "shard " << shardIndex << " failed to import"; @@ -416,19 +417,27 @@ DatabaseShardImp::importShard( return false; } - std::lock_guard lock(mutex_); - auto const it {shards_.find(shardIndex)}; - if (it == shards_.end() || - it->second.shard || - it->second.state != ShardInfo::State::import) { - JLOG(j_.error()) << - "shard " << shardIndex << " failed to import"; - return false; + std::lock_guard lock(mutex_); + auto const it {shards_.find(shardIndex)}; + if (it == shards_.end() || + it->second.shard || + it->second.state != ShardInfo::State::import || + it->second.retryFinalize) + { + JLOG(j_.error()) << + "shard " << shardIndex << " failed to import"; + shard.reset(); + renameDir(dstDir, srcDir); + return false; + } + + it->second.shard = std::move(shard); + it->second.retryFinalize = + std::make_unique(); } - it->second.shard = std::move(shard); - finalizeShard(it->second, true, lock); + finalizeWithRefHash(shardIndex); return true; } @@ -437,25 +446,29 @@ DatabaseShardImp::fetchLedger(uint256 const& hash, std::uint32_t seq) { auto const shardIndex {seqToShardIndex(seq)}; { - ShardInfo shardInfo; + std::shared_ptr shard; + ShardInfo::State state; { std::lock_guard lock(mutex_); assert(init_); if (auto const it {shards_.find(shardIndex)}; it != shards_.end()) - shardInfo = it->second; + { + shard = it->second.shard; + state = it->second.state; + } else return {}; } // Check if the ledger is stored in a final shard // or in the shard being acquired - switch (shardInfo.state) + switch (state) { case ShardInfo::State::final: break; case ShardInfo::State::acquire: - if (shardInfo.shard->containsLedger(seq)) + if (shard->containsLedger(seq)) break; [[fallthrough]]; default: @@ -596,7 +609,7 @@ DatabaseShardImp::validate() for (auto const& e : shards) { if (auto shard {e.lock()}; shard) - shard->finalize(true); + shard->finalize(true, boost::none); } app_.shardFamily()->reset(); @@ -612,10 +625,15 @@ DatabaseShardImp::onStop() if (shards_.empty()) return; - // Notify shards to stop - for (auto const& e : shards_) + // Notify and destroy shards + for (auto& e : shards_) + { if (e.second.shard) e.second.shard->stop(); + + if (e.second.retryFinalize) + e.second.retryFinalize.reset(); + } shards_.clear(); } @@ -1202,6 +1220,89 @@ DatabaseShardImp::findAcquireIndex( return boost::none; } +void +DatabaseShardImp::finalizeWithRefHash(std::uint32_t shardIndex) +{ + // We use the presence (or absence) of the validated + // ledger as a heuristic for determining whether or + // not we have stored a ledger that comes after the + // last ledger in this shard. A later ledger must + // be present in order to reliably retrieve the hash + // of the shard's last ledger. + boost::optional referenceHash; + if (app_.getLedgerMaster().getValidatedLedger()) + { + referenceHash = + app_.getLedgerMaster().walkHashBySeq(lastLedgerSeq(shardIndex)); + } + + // Make sure the shard was found in an + // expected state. + auto confirmShard = [this, shardIndex]( + auto const it, std::lock_guard&) { + if (it == shards_.end() || + it->second.state != ShardInfo::State::import || + !it->second.retryFinalize) + { + JLOG(j_.error()) << "shard " << shardIndex << " failed to import"; + return false; + } + + return true; + }; + + // The node is shutting down; remove the shard + // and return. + if (isStopping()) + { + std::shared_ptr shard; + + { + std::lock_guard lock(mutex_); + auto const it{shards_.find(shardIndex)}; + + if(!confirmShard(it, lock)) + return; + + shard = it->second.shard; + } + + JLOG(j_.warn()) + << "shard " << shardIndex + << " will not be imported due to system shutdown, removing"; + + removeFailedShard(shard); + return; + } + + std::lock_guard lock(mutex_); + auto const it{shards_.find(shardIndex)}; + + if(!confirmShard(it, lock)) + return; + + if (referenceHash && referenceHash->isNonZero()) + { + it->second.retryFinalize->referenceHash = *referenceHash; + finalizeShard(it->second, true, lock); + return; + } + + // Failed to find a reference hash, schedule to try again + if (!it->second.retryFinalize->retry( + app_, + std::bind( + &DatabaseShardImp::finalizeWithRefHash, + this, + std::placeholders::_1), + shardIndex)) + { + JLOG(j_.error()) << "shard " << shardIndex + << " failed to import, maximum attempts reached"; + removeFailedShard(it->second.shard); + } +} + void DatabaseShardImp::finalizeShard( ShardInfo& shardInfo, @@ -1222,10 +1323,15 @@ DatabaseShardImp::finalizeShard( return; std::shared_ptr shard; + boost::optional referenceHash; { std::lock_guard lock(mutex_); if (auto const it {shards_.find(shardIndex)}; it != shards_.end()) + { shard = it->second.shard; + if (it->second.retryFinalize) + *referenceHash = it->second.retryFinalize->referenceHash; + } else { JLOG(j_.error()) << @@ -1234,30 +1340,13 @@ DatabaseShardImp::finalizeShard( } } - if (!shard->finalize(writeSQLite)) + if (!shard->finalize(writeSQLite, referenceHash)) { if (isStopping()) return; // Bad shard, remove it - { - std::lock_guard lock(mutex_); - shards_.erase(shardIndex); - updateStatus(lock); - - using namespace boost::filesystem; - path const dir {shard->getDir()}; - shard.reset(); - try - { - remove_all(dir); - } - catch (std::exception const& e) - { - JLOG(j_.error()) << - "exception " << e.what() << " in function " << __func__; - } - } + removeFailedShard(shard); setFileStats(); return; @@ -1405,26 +1494,7 @@ DatabaseShardImp::storeLedgerInShard( if (!shard->store(ledger)) { // Shard may be corrupt, remove it - std::lock_guard lock(mutex_); - - shards_.erase(shard->index()); - if (shard->index() == acquireIndex_) - acquireIndex_ = 0; - - updateStatus(lock); - - using namespace boost::filesystem; - path const dir {shard->getDir()}; - shard.reset(); - try - { - remove_all(dir); - } - catch (std::exception const& e) - { - JLOG(j_.error()) << - "exception " << e.what() << " in function " << __func__; - } + removeFailedShard(shard); result = false; } @@ -1453,6 +1523,35 @@ DatabaseShardImp::storeLedgerInShard( return result; } +void +DatabaseShardImp::removeFailedShard( + std::shared_ptr shard) +{ + { + std::lock_guard lock(mutex_); + shards_.erase(shard->index()); + + if (shard->index() == acquireIndex_) + acquireIndex_ = 0; + + if (shard->isFinal()) + updateStatus(lock); + } + + using namespace boost::filesystem; + path const dir {shard->getDir()}; + shard.reset(); + try + { + remove_all(dir); + } + catch (std::exception const& e) + { + JLOG(j_.error()) << + "exception " << e.what() << " in function " << __func__; + } +} + //------------------------------------------------------------------------------ std::unique_ptr diff --git a/src/ripple/nodestore/impl/DatabaseShardImp.h b/src/ripple/nodestore/impl/DatabaseShardImp.h index 21a117bdab1..c3a53bc5d60 100644 --- a/src/ripple/nodestore/impl/DatabaseShardImp.h +++ b/src/ripple/nodestore/impl/DatabaseShardImp.h @@ -23,6 +23,9 @@ #include #include #include +#include + +#include namespace ripple { namespace NodeStore { @@ -189,6 +192,9 @@ class DatabaseShardImp : public DatabaseShard std::shared_ptr shard; State state {State::none}; + + // Used during the validation of imported shards + std::unique_ptr retryFinalize; }; Application& app_; @@ -261,6 +267,13 @@ class DatabaseShardImp : public DatabaseShard std::uint32_t validLedgerSeq, std::lock_guard&); +public: + // Attempts to retrieve a reference last ledger hash + // for a shard and finalize it + void + finalizeWithRefHash(std::uint32_t shardIndex); + +private: // Queue a task to finalize a shard by validating its databases // Lock must be held void @@ -290,6 +303,9 @@ class DatabaseShardImp : public DatabaseShard storeLedgerInShard( std::shared_ptr& shard, std::shared_ptr const& ledger); + + void + removeFailedShard(std::shared_ptr shard); }; } // NodeStore diff --git a/src/ripple/nodestore/impl/RetryFinalize.cpp b/src/ripple/nodestore/impl/RetryFinalize.cpp new file mode 100644 index 00000000000..2b87c05268e --- /dev/null +++ b/src/ripple/nodestore/impl/RetryFinalize.cpp @@ -0,0 +1,53 @@ +//------------------------------------------------------------------------------ +/* + This file is part of rippled: https://github.com/ripple/rippled + Copyright (c) 2020 Ripple Labs Inc. + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include +#include + +namespace ripple { +namespace NodeStore { + +bool +RetryFinalize::retry( + Application& app, + retryFunction f, + std::uint32_t shardIndex) +{ + if (numAttempts_ >= maxAttempts_) + return false; + + // Retry attempts only count when we have a validated ledger + if (app.getLedgerMaster().getValidatedLedger()) + ++numAttempts_; + + if (!timer_) + timer_ = std::make_unique(app.getIOService()); + + timer_->expires_from_now(retryInterval_); + timer_->async_wait([f{std::move(f)}, shardIndex_ = shardIndex]( + boost::system::error_code const& ec) { + if (ec != boost::asio::error::operation_aborted) + f(shardIndex_); + }); + + return true; +} + +} // namespace NodeStore +} // namespace ripple diff --git a/src/ripple/nodestore/impl/Shard.cpp b/src/ripple/nodestore/impl/Shard.cpp index 086e4e4c37c..fe9e214e110 100644 --- a/src/ripple/nodestore/impl/Shard.cpp +++ b/src/ripple/nodestore/impl/Shard.cpp @@ -384,7 +384,9 @@ Shard::isLegacy() const } bool -Shard::finalize(const bool writeSQLite) +Shard::finalize( + bool const writeSQLite, + boost::optional const& referenceHash) { assert(backend_); @@ -491,6 +493,11 @@ Shard::finalize(const bool writeSQLite) e.what() + " in function " + __func__); } + // Validate the last ledger hash of a downloaded shard + // using a ledger hash obtained from the peer network + if (referenceHash && *referenceHash != hash) + return fail("invalid last ledger hash"); + // Validate every ledger stored in the backend std::shared_ptr ledger; std::shared_ptr next; diff --git a/src/ripple/nodestore/impl/Shard.h b/src/ripple/nodestore/impl/Shard.h index a75362ba05e..6a70d44f473 100644 --- a/src/ripple/nodestore/impl/Shard.h +++ b/src/ripple/nodestore/impl/Shard.h @@ -118,9 +118,13 @@ class Shard final @param writeSQLite If true, SQLite entries will be rewritten using verified backend data. + @param referenceHash If present, this hash must match the hash + of the last ledger in the shard. */ bool - finalize(const bool writeSQLite); + finalize( + bool const writeSQLite, + boost::optional const& referenceHash); void stop() {stop_ = true;}