Skip to content

Commit

Permalink
queue-runner: limit parallelism of CPU intensive operations
Browse files Browse the repository at this point in the history
My current theory is that running more parallel xz than available CPU
cores is reducing our overall throughput by requiring more scheduling
overhead and more cache thrashing.
  • Loading branch information
delroth committed Apr 11, 2024
1 parent a596d6c commit 885c8c8
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 0 deletions.
18 changes: 18 additions & 0 deletions src/hydra-queue-runner/build-remote.cc
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,16 @@ void RemoteResult::updateWithBuildResult(const nix::BuildResult & buildResult)

}

/* Utility guard object to auto-release a semaphore on destruction. */
template <typename T>
class SemaphoreReleaser {
public:
SemaphoreReleaser(T* s) : sem(s) {}
~SemaphoreReleaser() { sem->release(); }

private:
T* sem;
};

void State::buildRemote(ref<Store> destStore,
::Machine::ptr machine, Step::ptr step,
Expand Down Expand Up @@ -612,6 +622,14 @@ void State::buildRemote(ref<Store> destStore,
result.logFile = "";
}

/* Throttle CPU-bound work. Opportunistically skip updating the current
* step, since this requires a DB roundtrip. */
if (!localWorkThrottler.try_acquire()) {
updateStep(ssWaitingForLocalSlot);
localWorkThrottler.acquire();
}
SemaphoreReleaser releaser(&localWorkThrottler);

StorePathSet outputs;
for (auto & [_, realisation] : buildResult.builtOutputs)
outputs.insert(realisation.outPath);
Expand Down
1 change: 1 addition & 0 deletions src/hydra-queue-runner/hydra-queue-runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ State::State(std::optional<std::string> metricsAddrOpt)
: config(std::make_unique<HydraConfig>())
, maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0))
, dbPool(config->getIntOption("max_db_connections", 128))
, localWorkThrottler(config->getIntOption("max_local_worker_threads", std::min(maxSupportedLocalWorkers, std::min(4u, std::thread::hardware_concurrency()) - 2)))
, maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30))
, maxLogSize(config->getIntOption("max_log_size", 64ULL << 20))
, uploadLogsToBinaryCache(config->getBoolOption("upload_logs_to_binary_cache", false))
Expand Down
6 changes: 6 additions & 0 deletions src/hydra-queue-runner/state.hh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <memory>
#include <queue>
#include <regex>
#include <semaphore>

#include <prometheus/counter.h>
#include <prometheus/gauge.h>
Expand Down Expand Up @@ -56,6 +57,7 @@ typedef enum {
ssConnecting = 10,
ssSendingInputs = 20,
ssBuilding = 30,
ssWaitingForLocalSlot = 35,
ssReceivingOutputs = 40,
ssPostProcessing = 50,
} StepState;
Expand Down Expand Up @@ -387,6 +389,10 @@ private:
typedef std::map<std::string, Machine::ptr> Machines;
nix::Sync<Machines> machines; // FIXME: use atomic_shared_ptr

/* Throttler for CPU-bound local work. */
static constexpr unsigned int maxSupportedLocalWorkers = 1024;
std::counting_semaphore<maxSupportedLocalWorkers> localWorkThrottler;

/* Various stats. */
time_t startedAt;
counter nrBuildsRead{0};
Expand Down
2 changes: 2 additions & 0 deletions src/root/build.tt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ END;
<strong>Sending inputs</strong>
[% ELSIF step.busy == 30 %]
<strong>Building</strong>
[% ELSIF step.busy == 35 %]
<strong>Waiting to receive outputs</strong>
[% ELSIF step.busy == 40 %]
<strong>Receiving outputs</strong>
[% ELSIF step.busy == 50 %]
Expand Down

0 comments on commit 885c8c8

Please sign in to comment.