diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c3dcb6d27f..a83c566f2d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ else()
 endif()
 project(foundationdb
-  VERSION 7.1.19
+  VERSION 7.1.21
   DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions."
   HOMEPAGE_URL "http://www.foundationdb.org/"
   LANGUAGES C CXX ASM)
diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py
index 2ffc94065a6..25b7834698b 100644
--- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py
+++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py
@@ -277,6 +277,12 @@ def __init__(self, bb, protocol_version, full_output=True):
         if protocol_version >= PROTOCOL_VERSION_6_3:
             self.report_conflicting_keys = bb.get_bool()
 
+        if protocol_version >= PROTOCOL_VERSION_7_1:
+            lock_aware = bb.get_bool()
+            if bb.get_bool():
+                spanId = bb.get_bytes(16)
+
+
 class UnsupportedProtocolVersionError(Exception):
     def __init__(self, protocol_version):
         super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version)
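The two extra reads above matter because the profiling payload is a flat byte stream: if the analyzer does not consume protocol 7.1's `lock_aware` flag and optional 16-byte span ID, every field that follows is misaligned. A minimal, self-contained C++ sketch of the same decoding step — `ByteReader` and its methods are hypothetical stand-ins for the Python buffer helpers, not FDB APIs:

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical reader mirroring the Python helpers (bb.get_bool / bb.get_bytes).
struct ByteReader {
    std::vector<uint8_t> data;
    size_t pos = 0;
    bool readBool() { return data.at(pos++) != 0; }
    std::array<uint8_t, 16> readBytes16() {
        std::array<uint8_t, 16> out{};
        for (auto& b : out)
            b = data.at(pos++);
        return out;
    }
};

// Fields appended to the transaction-info event in protocol 7.1. A reader
// that does not consume them misparses every field that follows.
struct ProtocolV71Suffix {
    bool lockAware = false;
    std::optional<std::array<uint8_t, 16>> spanId; // present only when the next flag is set
};

ProtocolV71Suffix readV71Suffix(ByteReader& bb) {
    ProtocolV71Suffix s;
    s.lockAware = bb.readBool();
    if (bb.readBool()) // "has spanId" flag
        s.spanId = bb.readBytes16();
    return s;
}

int main() {
    ByteReader bb{ { 1, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } };
    ProtocolV71Suffix s = readV71Suffix(bb);
    std::cout << "lockAware=" << s.lockAware << " hasSpan=" << s.spanId.has_value() << "\n";
}
```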
diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst
index 7847ea457f7..bdd39865233 100644
--- a/documentation/sphinx/source/release-notes/release-notes-710.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-710.rst
@@ -1,9 +1,31 @@
-.. _release-notes:
-
 #############
 Release Notes
 #############
 
+7.1.21
+======
+* Same as 7.1.20 release with AVX enabled.
+
+7.1.20
+======
+* Released with AVX disabled.
+* Fixed missing localities for fdbserver that can cause cross DC calls among storage servers. `(PR #7995) `_
+* Removed extremely spammy trace event in FetchKeys and fixed transaction_profiling_analyzer.py. `(PR #7934) `_
+* Fixed bugs when GRV proxy returns an error. `(PR #7860) `_
+
+7.1.19
+======
+* Same as 7.1.18 release with AVX enabled.
+
+7.1.18
+======
+* Released with AVX disabled.
+* Added knobs for the minimum and the maximum of the Ratekeeper's default priority. `(PR #7820) `_
+* Fixed bugs in ``getRange`` of the special key space. `(PR #7778) `_, `(PR #7720) `_
+* Added debug ID for secondary queries in index prefetching. `(PR #7755) `_
+* Changed hostname resolving to prefer IPv6 addresses. `(PR #7750) `_
+* Added more transaction debug events for prefetch queries. `(PR #7732) `_
+
 7.1.17
 ======
 * Same as 7.1.16 release with AVX enabled.
@@ -17,7 +40,7 @@ Release Notes
 * Fixed ScopeEventFieldTypeMismatch error for TLogMetrics. `(PR #7640) `_
 * Added getMappedRange latency metrics. `(PR #7632) `_
 * Fixed a version vector performance bug due to not updating client side tag cache. `(PR #7616) `_
-* Fixed DiskReadSeconds and DiskWriteSeconds calculaion in ProcessMetrics. `(PR #7609) `_
+* Fixed DiskReadSeconds and DiskWriteSeconds calculation in ProcessMetrics. `(PR #7609) `_
 * Added Rocksdb compression and data size stats. `(PR #7596) `_
 
 7.1.15
@@ -76,7 +99,7 @@ Release Notes
 * Added support of the reboot command in go bindings. `(PR #7270) `_
 * Fixed several issues in profiling special keys using GlobalConfig. `(PR #7120) `_
 * Fixed a stuck transaction system bug due to inconsistent recovery transaction version. `(PR #7261) `_
-* Fixed a unknown_error crash due to not resolving hostnames. `(PR #7254) `_
+* Fixed an unknown_error crash due to not resolving hostnames. `(PR #7254) `_
 * Fixed a heap-use-after-free bug. `(PR #7250) `_
 * Fixed a performance issue that remote TLogs are sending too many pops to log routers. `(PR #7235) `_
 * Fixed an issue that SharedTLogs are not displaced and leaking disk space. `(PR #7246) `_
diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp
index 7427078d5b5..8e1c1a277cd 100644
--- a/fdbclient/ClientKnobs.cpp
+++ b/fdbclient/ClientKnobs.cpp
@@ -63,6 +63,7 @@ void ClientKnobs::initialize(Randomize randomize) {
 	init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
 	init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
+	init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01();
 	init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
 	init( REPLY_BYTE_LIMIT, 80000 );
 	init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
diff --git a/fdbclient/ClientKnobs.h b/fdbclient/ClientKnobs.h
index ca3dd1494ee..5978c43b1fe 100644
--- a/fdbclient/ClientKnobs.h
+++ b/fdbclient/ClientKnobs.h
@@ -61,6 +61,7 @@ class ClientKnobs : public KnobsImpl {
 	double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is
 	                                 // mostly wrong (e.g. dumping the database after a test)
 	double FUTURE_VERSION_RETRY_DELAY;
+	double GRV_ERROR_RETRY_DELAY;
 	double UNKNOWN_TENANT_RETRY_DELAY;
 	int REPLY_BYTE_LIMIT;
 	double DEFAULT_BACKOFF;
diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h
index 6eed163162f..470e3488386 100644
--- a/fdbclient/DatabaseContext.h
+++ b/fdbclient/DatabaseContext.h
@@ -528,6 +528,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll
 	Counter transactionsExpensiveClearCostEstCount;
 	Counter transactionGrvFullBatches;
 	Counter transactionGrvTimedOutBatches;
+	Counter transactionCommitVersionNotFoundForSS;
 	ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit,
 	    bgLatencies, bgGranulesPerRequest;
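For readers unfamiliar with the metrics plumbing: a `Counter` constructed against a `CounterCollection` is bumped like an integer and rolled into the collection's periodic trace event (for clients, `TransactionMetrics`). A toy stand-in for that pattern — this is illustrative only, not flow's actual implementation:

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Toy stand-in for flow's CounterCollection/Counter pair: counters register
// themselves by name and a periodic "trace event" dumps every value.
struct CounterCollection {
    std::map<std::string, uint64_t> values;
    void logAll(const std::string& eventName) const {
        std::cout << eventName << ":";
        for (const auto& [k, v] : values)
            std::cout << " " << k << "=" << v;
        std::cout << "\n";
    }
};

struct Counter {
    uint64_t* v;
    Counter(const std::string& name, CounterCollection& cc) : v(&cc.values[name]) {}
    void operator++() { ++*v; }
};

int main() {
    CounterCollection cc;
    Counter commitVersionNotFound("CommitVersionNotFoundForSS", cc);
    ++commitVersionNotFound; // incremented where getLatestCommitVersions misses (see below)
    cc.logAll("TransactionMetrics");
}
```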
diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp
index 794e56d151f..7f881fc0fa5 100644
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@@ -240,7 +240,7 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc
 		return;
 	}
 
-	if (ssVersionVectorCache.getMaxVersion() != invalidVersion && readVersion > ssVersionVectorCache.getMaxVersion()) {
+	if (readVersion > ssVersionVectorCache.getMaxVersion()) {
 		if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) {
 			return;
 		} else {
@@ -253,16 +253,32 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc
 	std::map> versionMap; // order the versions to be returned
 	for (int i = 0; i < locationInfo->locations()->size(); i++) {
-		UID uid = locationInfo->locations()->getId(i);
-		if (ssidTagMapping.find(uid) != ssidTagMapping.end()) {
-			Tag tag = ssidTagMapping[uid];
+		bool updatedVersionMap = false;
+		Version commitVersion = invalidVersion;
+		Tag tag = invalidTag;
+		auto iter = ssidTagMapping.find(locationInfo->locations()->getId(i));
+		if (iter != ssidTagMapping.end()) {
+			tag = iter->second;
 			if (ssVersionVectorCache.hasVersion(tag)) {
-				Version commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version
+				commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version
 				if (commitVersion < readVersion) {
+					updatedVersionMap = true;
 					versionMap[commitVersion].insert(tag);
 				}
 			}
 		}
+		// do not log if commitVersion == readVersion as it is common.
+		if (!updatedVersionMap && commitVersion != readVersion) {
+			TraceEvent(SevDebug, "CommitVersionNotFoundForSS")
+			    .detail("InSSIDMap", iter != ssidTagMapping.end() ? 1 : 0)
+			    .detail("Tag", tag)
+			    .detail("CommitVersion", commitVersion)
+			    .detail("ReadVersion", readVersion)
+			    .detail("VersionVector", ssVersionVectorCache.toString())
+			    .setMaxEventLength(11000)
+			    .setMaxFieldLength(10000);
+			++transactionCommitVersionNotFoundForSS;
+		}
 	}
 
 	// insert the commit versions in the version vector.
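The `versionMap` in the hunk above is deliberately an ordered map, so storage-server tags are grouped by cached commit version from oldest to newest before being folded into the version vector. A compact sketch of that grouping, independent of FDB's types (assumed simplifications: `Tag` as a plain int, literal sample data):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <set>

using Version = int64_t;
using Tag = int; // stand-in for FDB's (locality, id) Tag

// Sketch: group storage-server tags by the latest commit version the client
// has cached for them, oldest first, mirroring versionMap in the hunk above.
int main() {
    Version readVersion = 500;
    std::map<Tag, Version> cachedCommitVersion = { { 1, 400 }, { 2, 400 }, { 3, 700 } };

    std::map<Version, std::set<Tag>> versionMap; // ordered: oldest version first
    for (const auto& [tag, commitVersion] : cachedCommitVersion) {
        if (commitVersion < readVersion) // only versions the read can rely on
            versionMap[commitVersion].insert(tag);
        else // would bump transactionCommitVersionNotFoundForSS and trace
            std::cout << "no usable commit version for tag " << tag << "\n";
    }
    for (const auto& [v, tags] : versionMap)
        std::cout << v << " -> " << tags.size() << " tag(s)\n";
}
```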
diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp (continued)
@@ -1445,13 +1461,13 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT),
+      transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), latencies(1000), readLatencies(1000),
+      commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
+      bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0),
+      lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
+      transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
+      coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
+      detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
       specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)),
       connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
@@ -1723,8 +1739,9 @@ DatabaseContext::DatabaseContext(const Error& err)
     transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
     transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
     transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
-    latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
-    bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false),
+    transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), latencies(1000), readLatencies(1000),
+    commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
+    bgGranulesPerRequest(1000), transactionTracingSample(false),
     smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
     connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
@@ -3460,8 +3477,8 @@ ACTOR Future getKey(Reference trState,
 
 ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanID spanContext) {
 	state Span span("NAPI:waitForCommittedVersion"_loc, { spanContext });
-	try {
-		loop {
+	loop {
+		try {
 			choose {
 				when(wait(cx->onProxiesChanged())) {}
 				when(GetReadVersionReply v = wait(basicLoadBalance(
@@ -3487,10 +3504,16 @@ ACTOR Future waitForCommittedVersion(Database cx, Version version, Span
 					wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID));
 				}
 			}
+		} catch (Error& e) {
+			if (e.code() == error_code_batch_transaction_throttled ||
+			    e.code() == error_code_proxy_memory_limit_exceeded) {
+				// GRV Proxy returns an error
+				wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY));
+			} else {
+				TraceEvent(SevError, "WaitForCommittedVersionError").error(e);
+				throw;
+			}
 		}
-	} catch (Error& e) {
-		TraceEvent(SevError, "WaitForCommittedVersionError").error(e);
-		throw;
 	}
 }
@@ -6632,9 +6655,12 @@ ACTOR Future getConsistentReadVersion(SpanID parentSpan,
 			}
 		}
 	} catch (Error& e) {
-		if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled)
+		if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
+		    e.code() != error_code_proxy_memory_limit_exceeded)
 			TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
-		if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) {
+		if ((e.code() == error_code_batch_transaction_throttled ||
+		     e.code() == error_code_proxy_memory_limit_exceeded) &&
+		    !cx->apiVersionAtLeast(630)) {
 			wait(delayJittered(5.0));
 		} else {
 			throw;
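All three GRV call sites in this patch (`waitForCommittedVersion` and `getConsistentReadVersion` above, and the backup worker below) adopt the same shape: treat `batch_transaction_throttled` and `proxy_memory_limit_exceeded` as retryable overload signals and back off with jitter instead of failing. A standalone sketch of that pattern, with plain stand-ins for flow's error and delay machinery (all names here are hypothetical):

```cpp
#include <chrono>
#include <iostream>
#include <random>
#include <stdexcept>
#include <thread>

// Hypothetical stand-ins for flow's error codes and delayJittered().
enum class ErrorCode { batch_transaction_throttled, proxy_memory_limit_exceeded, io_error };
struct FdbError : std::runtime_error {
    ErrorCode code;
    explicit FdbError(ErrorCode c) : std::runtime_error("fdb error"), code(c) {}
};

static bool isGrvOverload(ErrorCode c) {
    return c == ErrorCode::batch_transaction_throttled || c == ErrorCode::proxy_memory_limit_exceeded;
}

// Retry loop in the shape of the patched actors: overload errors sleep a
// jittered GRV_ERROR_RETRY_DELAY (5s default per ClientKnobs) and retry;
// anything else propagates to the caller.
template <class F>
auto retryOnGrvOverload(F&& attempt, double baseDelaySeconds = 5.0) {
    std::mt19937 rng{ std::random_device{}() };
    std::uniform_real_distribution<double> jitter(0.5, 1.5); // rough stand-in for delayJittered
    for (;;) {
        try {
            return attempt();
        } catch (const FdbError& e) {
            if (!isGrvOverload(e.code))
                throw; // matches the patch: only proxy overload is retried here
            std::this_thread::sleep_for(std::chrono::duration<double>(baseDelaySeconds * jitter(rng)));
        }
    }
}

int main() {
    int tries = 0;
    long version = retryOnGrvOverload(
        [&]() -> long {
            if (++tries < 2)
                throw FdbError(ErrorCode::proxy_memory_limit_exceeded);
            return 12345; // pretend read version
        },
        0.01);
    std::cout << "got version " << version << " after " << tries << " tries\n";
}
```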
diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h
index 76326c48a4e..b25a1ea73e4 100644
--- a/fdbrpc/LoadBalance.actor.h
+++ b/fdbrpc/LoadBalance.actor.h
@@ -486,6 +486,12 @@ Future loadBalance(
 				// server count is within "LOAD_BALANCE_MAX_BAD_OPTIONS". We
 				// do not need to consider any remote servers.
 				break;
+			} else if (badServers == alternatives->countBest() && i == badServers) {
+				TraceEvent("AllLocalAlternativesFailed")
+				    .suppressFor(1.0)
+				    .detail("Alternatives", alternatives->description())
+				    .detail("Total", alternatives->size())
+				    .detail("Best", alternatives->countBest());
 			}
 
 			RequestStream const* thisStream = &alternatives->get(i, channel);
@@ -587,6 +593,7 @@ Future loadBalance(
 		// nextAlt. This logic matters only if model == nullptr. Otherwise, the
 		// bestAlt and nextAlt have been decided.
 		state RequestStream const* stream = nullptr;
+		state LBDistance::Type distance;
 		for (int alternativeNum = 0; alternativeNum < alternatives->size(); alternativeNum++) {
 			int useAlt = nextAlt;
 			if (nextAlt == startAlt)
@@ -595,6 +602,7 @@ Future loadBalance(
 				useAlt = (nextAlt + alternatives->size() - 1) % alternatives->size();
 
 			stream = &alternatives->get(useAlt, channel);
+			distance = alternatives->getDistance(useAlt);
 			if (!IFailureMonitor::failureMonitor().getState(stream->getEndpoint()).failed &&
 			    (!firstRequestEndpoint.present() || stream->getEndpoint().token.first() != firstRequestEndpoint.get()))
 				break;
@@ -602,6 +610,7 @@ Future loadBalance(
 			if (nextAlt == startAlt)
 				triedAllOptions = TriedAllOptions::True;
 			stream = nullptr;
+			distance = LBDistance::DISTANT;
 		}
 
 		if (!stream && !firstRequestData.isValid()) {
@@ -637,6 +646,18 @@ Future loadBalance(
 				firstRequestEndpoint = Optional();
 			} else if (firstRequestData.isValid()) {
 				// Issue a second request, the first one is taking a long time.
+				if (distance == LBDistance::DISTANT) {
+					TraceEvent("LBDistant2nd")
+					    .suppressFor(0.1)
+					    .detail("Distance", (int)distance)
+					    .detail("BackOff", backoff)
+					    .detail("TriedAllOptions", triedAllOptions)
+					    .detail("Alternatives", alternatives->description())
+					    .detail("Token", stream->getEndpoint().token)
+					    .detail("Total", alternatives->size())
+					    .detail("Best", alternatives->countBest())
+					    .detail("Attempts", numAttempts);
+				}
 				secondRequestData.startRequest(backoff, triedAllOptions, stream, request, model, alternatives, channel);
 				state bool firstFinished = false;
@@ -666,6 +687,18 @@ Future loadBalance(
 			}
 		} else {
 			// Issue a request, if it takes too long to get a reply, go around the loop
+			if (distance == LBDistance::DISTANT) {
+				TraceEvent("LBDistant")
+				    .suppressFor(0.1)
+				    .detail("Distance", (int)distance)
+				    .detail("BackOff", backoff)
+				    .detail("TriedAllOptions", triedAllOptions)
+				    .detail("Alternatives", alternatives->description())
+				    .detail("Token", stream->getEndpoint().token)
+				    .detail("Total", alternatives->size())
+				    .detail("Best", alternatives->countBest())
+				    .detail("Attempts", numAttempts);
+			}
 			firstRequestData.startRequest(backoff, triedAllOptions, stream, request, model, alternatives, channel);
 			firstRequestEndpoint = stream->getEndpoint().token.first();
diff --git a/fdbrpc/MultiInterface.h b/fdbrpc/MultiInterface.h
index 4f15dbf0873..85fa195206b 100644
--- a/fdbrpc/MultiInterface.h
+++ b/fdbrpc/MultiInterface.h
@@ -226,6 +226,7 @@ class MultiInterface> : public ReferenceCountedinterf;
 	}
+	LBDistance::Type getDistance(int index) const { return (LBDistance::Type)alternatives[index]->distance; }
 	UID getId(int index) const { return alternatives[index]->interf.id(); }
 	bool hasInterface(UID id) const {
 		for (const auto& ref : alternatives) {
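The new `getDistance` accessor lets the balancer notice when the option it is about to try is not local and emit `LBDistant`/`LBDistant2nd` before issuing the request. A toy sketch of that decision (the `LBDistance` names follow the hunk above; the selection logic is heavily simplified):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Simplified mirror of fdbrpc's LBDistance idea: options are ranked by
// network distance, and picking a DISTANT one is worth logging.
namespace LBDistance {
enum Type { SAME_MACHINE = 0, SAME_DC = 1, DISTANT = 2 };
}

struct Alternative {
    std::string name;
    LBDistance::Type distance;
    bool failed;
};

int main() {
    std::vector<Alternative> alternatives = { { "ss-local", LBDistance::SAME_DC, true },
                                              { "ss-remote", LBDistance::DISTANT, false } };
    // Walk the options in preference order; fall through to a remote one
    // only when the closer candidates are marked failed.
    for (const auto& alt : alternatives) {
        if (alt.failed)
            continue;
        if (alt.distance == LBDistance::DISTANT)
            std::cout << "LBDistant: issuing request to " << alt.name << "\n"; // a TraceEvent in the real code
        std::cout << "sending to " << alt.name << "\n";
        break;
    }
}
```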
diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp
index 0ac5b56a7d6..789935bd2cd 100644
--- a/fdbserver/BackupWorker.actor.cpp
+++ b/fdbserver/BackupWorker.actor.cpp
@@ -428,20 +428,30 @@ struct BackupData {
 	ACTOR static Future _getMinKnownCommittedVersion(BackupData* self) {
 		state Span span("BA:GetMinCommittedVersion"_loc);
 		loop {
-			GetReadVersionRequest request(span.context,
-			                              0,
-			                              TransactionPriority::DEFAULT,
-			                              invalidVersion,
-			                              GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
-			choose {
-				when(wait(self->cx->onProxiesChanged())) {}
-				when(GetReadVersionReply reply =
-				         wait(basicLoadBalance(self->cx->getGrvProxies(UseProvisionalProxies::False),
-				                               &GrvProxyInterface::getConsistentReadVersion,
-				                               request,
-				                               self->cx->taskID))) {
-					self->cx->ssVersionVectorCache.applyDelta(reply.ssVersionVectorDelta);
-					return reply.version;
+			try {
+				GetReadVersionRequest request(span.context,
+				                              0,
+				                              TransactionPriority::DEFAULT,
+				                              invalidVersion,
+				                              GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
+				choose {
+					when(wait(self->cx->onProxiesChanged())) {}
+					when(GetReadVersionReply reply =
+					         wait(basicLoadBalance(self->cx->getGrvProxies(UseProvisionalProxies::False),
+					                               &GrvProxyInterface::getConsistentReadVersion,
+					                               request,
+					                               self->cx->taskID))) {
+						self->cx->ssVersionVectorCache.applyDelta(reply.ssVersionVectorDelta);
+						return reply.version;
+					}
+				}
+			} catch (Error& e) {
+				if (e.code() == error_code_batch_transaction_throttled ||
+				    e.code() == error_code_proxy_memory_limit_exceeded) {
+					// GRV Proxy returns an error
+					wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY));
+				} else {
+					throw;
 				}
 			}
 		}
diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp
index 61b100779ec..fc3d9b5d233 100644
--- a/fdbserver/GrvProxyServer.actor.cpp
+++ b/fdbserver/GrvProxyServer.actor.cpp
@@ -29,6 +29,7 @@
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
 #include "fdbrpc/sim_validation.h"
+#include "flow/IRandom.h"
 #include "flow/flow.h"
 #include "flow/actorcompiler.h" // This must be the last #include.
@@ -425,7 +426,9 @@ ACTOR Future queueGetReadVersionRequests(Reference
 		// WARNING: this code is run at a high priority, so it needs to do as little work as possible
 		bool canBeQueued = true;
 		if (stats->txnRequestIn.getValue() - stats->txnRequestOut.getValue() >
-		        SERVER_KNOBS->START_TRANSACTION_MAX_QUEUE_SIZE) {
+		        SERVER_KNOBS->START_TRANSACTION_MAX_QUEUE_SIZE ||
+		    (g_network->isSimulated() && !g_simulator.speedUpSimulation &&
+		     deterministicRandom()->random01() < 0.01)) {
 			// When the limit is hit, try to drop requests from the lower priority queues.
 			if (req.priority == TransactionPriority::BATCH) {
 				canBeQueued = false;
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index 9d742989ffe..5bcf54bb0df 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -5913,8 +5913,10 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) {
 	}
 
 	// FIXME: remove when we no longer support upgrades from 5.X
-	data->cx->enableLocalityLoadBalance = EnableLocalityLoadBalance::True;
-	TraceEvent(SevWarnAlways, "FKReenableLB").detail("FKID", fetchKeysID);
+	if (!data->cx->enableLocalityLoadBalance) {
+		data->cx->enableLocalityLoadBalance = EnableLocalityLoadBalance::True;
+		TraceEvent(SevWarnAlways, "FKReenableLB").detail("FKID", fetchKeysID);
+	}
 
 	// We have completed the fetch and write of the data, now we wait for MVCC window to pass.
 	// As we have finished this work, we will allow more work to start...
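The GrvProxyServer change is the test half of the fix: in simulation, roughly 1% of GRV requests are randomly treated as over-queue so the new client retry paths actually get exercised. A self-contained sketch of that style of randomized fault injection — `SimContext` and its fields are stand-ins for `g_network`/`g_simulator` state, not real APIs:

```cpp
#include <iostream>
#include <random>

// Stand-ins for g_network->isSimulated(), g_simulator.speedUpSimulation and
// deterministicRandom()->random01() in the hunk above.
struct SimContext {
    bool isSimulated = true;
    bool speedUpSimulation = false;
    std::mt19937 rng{ 42 }; // deterministic seed, like simulation's PRNG
    double random01() { return std::uniform_real_distribution<double>(0.0, 1.0)(rng); }
};

// Decide whether a GRV request should be rejected: either the queue really
// is over its limit, or (in simulation only) a 1% injected failure.
bool shouldReject(SimContext& sim, int queued, int queueLimit) {
    return queued > queueLimit || (sim.isSimulated && !sim.speedUpSimulation && sim.random01() < 0.01);
}

int main() {
    SimContext sim;
    int injected = 0;
    for (int i = 0; i < 10000; ++i)
        if (shouldReject(sim, /*queued*/ 0, /*queueLimit*/ 1000))
            ++injected;
    std::cout << "injected rejections: " << injected << " of 10000\n"; // ~1%
}
```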
diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp
index 4b720a54c01..301a9ed622b 100644
--- a/fdbserver/worker.actor.cpp
+++ b/fdbserver/worker.actor.cpp
@@ -2917,7 +2917,10 @@ ACTOR Future fdbd(Reference connRecord,
 		auto ci = makeReference>>();
 		auto asyncPriorityInfo = makeReference>(getCCPriorityInfo(fitnessFilePath, processClass));
-		auto dbInfo = makeReference>();
+		auto serverDBInfo = ServerDBInfo();
+		serverDBInfo.myLocality = localities;
+		auto dbInfo = makeReference>(serverDBInfo);
+		TraceEvent("MyLocality").detail("Locality", dbInfo->get().myLocality.toString());
 
 		actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo),
 		                              "MonitorAndWriteCCPriorityInfo"));
diff --git a/fdbserver/workloads/BulkSetup.actor.h b/fdbserver/workloads/BulkSetup.actor.h
index 4d1acdbb449..07f30052af1 100644
--- a/fdbserver/workloads/BulkSetup.actor.h
+++ b/fdbserver/workloads/BulkSetup.actor.h
@@ -158,6 +158,37 @@ ACTOR Future>> trackInsertionCount(Datab
                                                              std::vector countsOfInterest,
                                                              double checkInterval);
 
+ACTOR template 
+Future waitForLowInFlight(Database cx, T* workload) {
+	state Future timeout = delay(600.0);
+	loop {
+		try {
+			if (timeout.isReady()) {
+				throw timed_out();
+			}
+
+			int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo));
+			TraceEvent("DynamicWarming").detail("InFlight", inFlight);
+			if (inFlight > 1e6) { // Wait for just 1 MB to be in flight
+				wait(delay(1.0));
+			} else {
+				wait(delay(1.0));
+				TraceEvent("DynamicWarmingDone").log();
+				break;
+			}
+		} catch (Error& e) {
+			if (e.code() == error_code_attribute_not_found) {
+				// DD may not be initialized yet and attribute "DataInFlight" can be missing
+				wait(delay(1.0));
+			} else {
+				TraceEvent(SevWarn, "WaitForLowInFlightError").error(e);
+				throw;
+			}
+		}
+	}
+	return Void();
+}
+
 ACTOR template 
 Future bulkSetup(Database cx,
                  T* workload,
@@ -279,17 +310,7 @@ Future bulkSetup(Database cx,
 	if (postSetupWarming != 0) {
 		try {
 			wait(delay(5.0)); // Wait for the data distribution in a small test to start
-			loop {
-				int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo));
-				TraceEvent("DynamicWarming").detail("InFlight", inFlight);
-				if (inFlight > 1e6) { // Wait for just 1 MB to be in flight
-					wait(delay(1.0));
-				} else {
-					wait(delay(1.0));
-					TraceEvent("DynamicWarmingDone").log();
-					break;
-				}
-			}
+			wait(waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start
 		} catch (Error& e) {
 			if (e.code() == error_code_actor_cancelled)
 				throw;
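`waitForLowInFlight` replaces an unbounded polling loop with one governed by a pre-armed 600-second timeout future, and tolerates the transient `attribute_not_found` while data distribution is still initializing. The same bounded-poll shape in plain C++, using a `std::chrono` deadline instead of a flow future (the probe function is a made-up stand-in; the 1 MB threshold is copied from the hunk):

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>

// Hypothetical probe standing in for what getDataInFlight() supplies.
int64_t sampleDataInFlight(int tick) {
    if (tick < 2)
        throw std::runtime_error("attribute_not_found"); // DD not initialized yet
    return tick < 5 ? int64_t(5e6) : int64_t(1e5); // drains over time
}

int main() {
    using clock = std::chrono::steady_clock;
    const auto deadline = clock::now() + std::chrono::seconds(600); // flow: timeout = delay(600.0)
    for (int tick = 0;; ++tick) {
        if (clock::now() >= deadline)
            throw std::runtime_error("timed_out");
        try {
            int64_t inFlight = sampleDataInFlight(tick);
            std::cout << "DynamicWarming InFlight=" << inFlight << "\n";
            if (inFlight <= 1e6) { // wait for just ~1 MB still in flight
                std::cout << "DynamicWarmingDone\n";
                break;
            }
        } catch (const std::runtime_error& e) {
            if (std::string(e.what()) != "attribute_not_found")
                throw; // only the expected warm-up error is retried
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(10)); // 1s in the real actor
    }
}
```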
diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp
index d648b1d097b..e04ce4c8dc6 100644
--- a/fdbserver/workloads/ConfigureDatabase.actor.cpp
+++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp
@@ -284,6 +284,7 @@ struct ConfigureDatabaseWorkload : TestWorkload {
 	}
 
 	ACTOR Future _check(ConfigureDatabaseWorkload* self, Database cx) {
+		wait(delay(30.0));
 		// only storage_migration_type=gradual && perpetual_storage_wiggle=1 need this check because in QuietDatabase
 		// perpetual wiggle will be forced to close For other cases, later ConsistencyCheck will check KV store type
 		// there
diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
index bec0240257a..e04ea8a48cb 100644
--- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
+++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp
@@ -61,7 +61,7 @@ struct ExceptionContract {
 		    e.code() == error_code_future_version || e.code() == error_code_transaction_cancelled ||
 		    e.code() == error_code_key_too_large || e.code() == error_code_value_too_large ||
 		    e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled ||
-		    e.code() == error_code_tag_throttled) {
+		    e.code() == error_code_tag_throttled || e.code() == error_code_proxy_memory_limit_exceeded) {
 			return;
 		}
diff --git a/fdbserver/workloads/LocalRatekeeper.actor.cpp b/fdbserver/workloads/LocalRatekeeper.actor.cpp
index 97f8af04bc2..67d84d309f5 100644
--- a/fdbserver/workloads/LocalRatekeeper.actor.cpp
+++ b/fdbserver/workloads/LocalRatekeeper.actor.cpp
@@ -18,6 +18,7 @@
  * limitations under the License.
  */
 
+#include "fdbclient/FDBTypes.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include 
 #include 
@@ -82,7 +83,16 @@ struct LocalRatekeeperWorkload : TestWorkload {
 			    .detail("Actual", metrics.localRateLimit);
 		}
 		tr.reset();
-		Version readVersion = wait(tr.getReadVersion());
+		state Version readVersion = invalidVersion;
+		loop {
+			try {
+				Version v = wait(tr.getReadVersion());
+				readVersion = v;
+				break;
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
 		requests.clear();
 		// we send 100 requests to this storage node and count how many of those get rejected
 		for (int i = 0; i < 100; ++i) {
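The LocalRatekeeper hunk above and the SpecialKeySpaceCorrectness hunks below all converge on FDB's canonical retry idiom: catch the error, hand it to `onError`, and let that decide whether to back off and retry or rethrow. A self-contained sketch of the idiom with stand-in types (the real `onError` returns a future that delays appropriately; this toy version just absorbs retryable errors):

```cpp
#include <iostream>
#include <stdexcept>

// Minimal stand-ins for a transaction with onError() semantics: retryable
// errors are absorbed (possibly after a backoff) and the body is re-run.
struct RetryableError : std::runtime_error {
    using std::runtime_error::runtime_error;
};

struct Transaction {
    int failuresLeft = 2; // pretend the first two attempts hit a retryable error
    long getReadVersion() {
        if (failuresLeft-- > 0)
            throw RetryableError("proxy_memory_limit_exceeded");
        return 42;
    }
    void onError(const std::exception&) {
        // The real onError waits an appropriate delay and rethrows when the
        // error is not retryable; here every RetryableError is retried.
    }
};

int main() {
    Transaction tr;
    long readVersion = -1; // invalidVersion in the hunk above
    for (;;) {
        try {
            readVersion = tr.getReadVersion();
            break;
        } catch (const RetryableError& e) {
            tr.onError(e); // decide: back off and retry, or rethrow
        }
    }
    std::cout << "read version " << readVersion << "\n";
}
```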
diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp
index 5437ca418ac..495dd050dfe 100644
--- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp
+++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp
@@ -874,41 +874,58 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
 		}
 		TraceEvent(SevDebug, "DatabaseLocked").log();
 		// if database locked, fdb read should get database_locked error
-		try {
-			tx->reset();
-			tx->setOption(FDBTransactionOptions::RAW_ACCESS);
-			RangeResult res = wait(tx->getRange(normalKeys, 1));
-		} catch (Error& e) {
-			if (e.code() == error_code_actor_cancelled)
-				throw;
-			ASSERT(e.code() == error_code_database_locked);
+		tx->reset();
+		loop {
+			try {
+				tx->setOption(FDBTransactionOptions::RAW_ACCESS);
+				RangeResult res = wait(tx->getRange(normalKeys, 1));
+			} catch (Error& e) {
+				if (e.code() == error_code_actor_cancelled)
+					throw;
+				if (e.code() == error_code_proxy_memory_limit_exceeded ||
+				    e.code() == error_code_batch_transaction_throttled) {
+					wait(tx->onError(e));
+				} else {
+					ASSERT(e.code() == error_code_database_locked);
+					break;
+				}
+			}
 		}
 		// make sure we unlock the database
 		// unlock is idempotent, thus we can commit many times until successful
+		tx->reset();
 		loop {
 			try {
-				tx->reset();
 				tx->setOption(FDBTransactionOptions::RAW_ACCESS);
 				tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
 				// unlock the database
 				tx->clear(SpecialKeySpace::getManagementApiCommandPrefix("lock"));
 				wait(tx->commit());
 				TraceEvent(SevDebug, "DatabaseUnlocked").log();
-				tx->reset();
+				break;
+			} catch (Error& e) {
+				TraceEvent(SevDebug, "DatabaseUnlockFailure").error(e);
+				ASSERT(e.code() != error_code_database_locked);
+				wait(tx->onError(e));
+			}
+		}
+
+		tx->reset();
+		loop {
+			try {
 				// read should be successful
 				tx->setOption(FDBTransactionOptions::RAW_ACCESS);
 				RangeResult res = wait(tx->getRange(normalKeys, 1));
-				tx->reset();
 				break;
 			} catch (Error& e) {
-				TraceEvent(SevDebug, "DatabaseUnlockFailure").error(e);
-				ASSERT(e.code() != error_code_database_locked);
 				wait(tx->onError(e));
 			}
 		}
+
 		// test consistencycheck which only used by ConsistencyCheck Workload
 		// Note: we have exclusive ownership of fdbShouldConsistencyCheckBeSuspended,
 		// no existing workloads can modify the key
+		tx->reset();
 		{
 			try {
 				tx->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
" echo "${0} has a very narrow set of situations where it will be successful," echo "or even useful, when executed unedited" exit 1 + # this set of options will creat standard images from a local build + # create_fake_website_directory stripped_local + # build_and_push_images Dockerfile false false fi -create_fake_website_directory -build_and_push_images - echo "${blue}################################################################################${reset}" logg "COMPLETED ${0}" echo "${blue}################################################################################${reset}"