Skip to content

Commit

Permalink
patched v4 GPU (without cuQuantum)
Browse files Browse the repository at this point in the history
such that the deprecated unit tests pass on Ubuntu in GPU-accelerated deployment, distributed over up to 8 nodes, when not using cuQuantum. Alas we cannot presently test cuQuantum for want of a compatible GPU!

See PR #534 for the list of patches
  • Loading branch information
TysonRayJones authored Feb 6, 2025
1 parent 14855c9 commit 2525681
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 54 deletions.
29 changes: 20 additions & 9 deletions quest/src/core/accelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,17 @@ using std::min;
GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs ))


// TODO:
// GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS,
// as defined below, is only ever called by used by anyCtrlAnyTargDenseMatr,
// which only ever receives numTargs>=3 (due to accelerator redirecting
// fewer targets to faster bespoke functions which e.g. avoid global GPU
// cache emory access). This means its instantiation with numTargs=0,1,2
// is useless, though contributes to 42% of the function's compilation
// time which is large because of the 7*7*2=98 unique instantiations. We
// can ergo non-negligibly speed up compilation by avoiding these redundant
// instances at the cost of increased code complexity/asymmetry. Consider!

#define GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c) \
(vector <CONJ_ARR(f)> { \
CONJ_ARR(f) {&f<0,0,c>, &f<0,1,c>, &f<0,2,c>, &f<0,3,c>, &f<0,4,c>, &f<0,5,c>, &f<0,-1,c>}, \
Expand Down Expand Up @@ -412,7 +423,7 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc

bool hasPower = exponent != qcomp(1, 0);
auto cpuFunc = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( cpu_densmatr_allTargDiagMatr_sub, hasPower, multiplyOnly );
auto gpuFunc = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( cpu_densmatr_allTargDiagMatr_sub, hasPower, multiplyOnly );
auto gpuFunc = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( gpu_densmatr_allTargDiagMatr_sub, hasPower, multiplyOnly );

// when deployments match, we trivially call the common backend
if ( quregGPU && matrGPU) gpuFunc(qureg, matr, exponent);
Expand Down Expand Up @@ -446,7 +457,7 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc
temp.isGpuAccelerated = 1;
temp.gpuElems = (qureg.isDistributed)?
qureg.gpuCommBuffer :
gpu_allocArray(temp.numElems);
gpu_allocArray(temp.numElemsPerNode);

// error if that (relatively) small allocation failed (always succeeds if buffer)
assert_applyFullStateDiagMatrTempGpuAllocSucceeded(temp.gpuElems);
Expand Down Expand Up @@ -870,12 +881,12 @@ qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qu
void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {

auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
return func(outProbs, qureg, qubits);
func(outProbs, qureg, qubits);
}
void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {

auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
return func(outProbs, qureg, qubits);
func(outProbs, qureg, qubits);
}


Expand Down Expand Up @@ -937,14 +948,14 @@ qcomp accel_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi, bool co
temp.isGpuAccelerated = 1;
temp.gpuAmps = (rho.isDistributed)?
rho.gpuCommBuffer :
gpu_allocArray(temp.numAmps);
gpu_allocArray(temp.numAmpsPerNode);

// error if that (relatively) small allocation failed (always succeeds if buffer)
assert_calcFidTempGpuAllocSucceeded(temp.gpuAmps);

// harmlessly overwrite new memory or rho's buffer, and call GPU routine
gpu_copyCpuToGpu(temp);
qcomp prod = gpuFunc(rho, psi);
qcomp prod = gpuFunc(rho, temp);

// free new GPU memory, but do NOT free rho's communication buffer
if (!rho.isDistributed)
Expand Down Expand Up @@ -1069,7 +1080,7 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa

// harmlessly overwrite new memory or qureg's buffer, and call GPU routine
gpu_copyCpuToGpu(temp);
qcomp value = gpuFunc(qureg, matr, exponent);
qcomp value = gpuFunc(qureg, temp, exponent);

// free new GPU memory, but do NOT free qureg's communication buffer
if (!mem_isAllocated(qureg.gpuCommBuffer))
Expand All @@ -1088,12 +1099,12 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
void accel_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {

auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_multiQubitProjector_sub, qureg, qubits.size() );
return func(qureg, qubits, outcomes, prob);
func(qureg, qubits, outcomes, prob);
}
void accel_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {

auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_multiQubitProjector_sub, qureg, qubits.size() );
return func(qureg, qubits, outcomes, prob);
func(qureg, qubits, outcomes, prob);
}


Expand Down
4 changes: 4 additions & 0 deletions quest/src/core/errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,10 @@ void assert_quregAndFullStateDiagMatrAreBothOrNeitherDistrib(Qureg qureg, FullSt

void assert_quregGpuBufferIsNotGraftedToMatrix(Qureg qureg, FullStateDiagMatr matr) {

// permit both pointers to be null-ptr, of course
if (!mem_isAllocated(matr.gpuElems))
return;

if (matr.gpuElems == qureg.gpuCommBuffer)
raiseInternalError("An accelerator function received a FullStateDiagMatr with a GPU pointer which was a Qureg's GPU communication buffer, in a setting where the buffer was separately needed.");
}
Expand Down
2 changes: 1 addition & 1 deletion quest/src/core/printer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ string printer_getMemoryWithUnitStr(size_t numBytes) {
sizes[i] = powerOf2(10 * i); // = 1024^i

// find the biggest unit for which numBytes/sizes[ind] > 1
int ind = 0;
int ind = 1;
while (numBytes > sizes[ind])
ind++;
ind--;
Expand Down
8 changes: 4 additions & 4 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1737,7 +1737,7 @@ void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
#pragma omp parallel for if(qureg.isMultithreaded)
for (qindex n=0; n<numIts; n++) {

// i = local index of nth local diagonal element
// i = local index of nth local diagonal element
qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);
qreal prob = std::real(qureg.cpuAmps[i]);

Expand Down Expand Up @@ -2039,14 +2039,14 @@ qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr
#pragma omp parallel for reduction(+:value) if(qureg.isMultithreaded)
for (qindex n=0; n<numIts; n++) {

// i = local index of nth local diagonal element
qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);
qcomp elem = matr.cpuElems[n];

// compile-time decide if applying power to avoid in-loop branching
qcomp elem = matr.cpuElems[n];
if constexpr (HasPower)
elem = pow(elem, exponent);

// i = local index of nth local diagonal element
qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);
value += elem * qureg.cpuAmps[i];
}

Expand Down
11 changes: 6 additions & 5 deletions quest/src/gpu/gpu_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr


__forceinline__ __device__ qindex getFlattenedMatrInd(qindex row, qindex col, qindex dim) {
return (row * col) + dim;
return (row * dim) + col;
}


Expand Down Expand Up @@ -752,7 +752,7 @@ __global__ void kernel_densmatr_oneQubitDepolarising_subA(

__global__ void kernel_densmatr_oneQubitDepolarising_subB(
cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads,
int braBit, int ketQubit, qreal facAA, qreal facBB, qreal facAB
int ketQubit, int braBit, qreal facAA, qreal facBB, qreal facAB
) {
GET_THREAD_IND(n, numThreads);

Expand Down Expand Up @@ -1078,7 +1078,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(
// use template param to compile-time unroll below loops
SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits);

qreal prob = getCompReal(amps[n]);
qreal prob = getCompNorm(amps[n]);

// i = global index corresponding to n
qindex i = concatenateBits(rank, n, logNumAmpsPerNode);
Expand All @@ -1093,7 +1093,8 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(
template <int NumQubits>
__global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(
qreal* outProbs, cu_qcomp* amps, qindex numThreads,
qindex numColsPerNode, int rank, qindex logNumAmpsPerNode,
qindex firstDiagInd, qindex numAmpsPerCol,
int rank, qindex logNumAmpsPerNode,
int* qubits, int numQubits
) {
GET_THREAD_IND(n, numThreads);
Expand All @@ -1102,7 +1103,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(
SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits);

// i = index of nth local diagonal elem
qindex i = (rank * numColsPerNode) + n * (numColsPerNode + 1);
qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);
qreal prob = getCompReal(amps[i]);

// j = global index of i
Expand Down
42 changes: 18 additions & 24 deletions quest/src/gpu/gpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
devints deviceQubits = util_getSorted(ctrls, targs);
qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector<int>(targs.size(),0));

qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numThreads);
qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);

kernel_statevec_anyCtrlAnyTargDenseMatr_sub <NumCtrls, NumTargs, ApplyConj> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
toCuQcomps(qureg.gpuAmps), toCuQcomps(cache), numThreads,
Expand Down Expand Up @@ -1376,12 +1377,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu

// allocate exponentially-big temporary memory (error if failed)
devints devQubits = qubits;
devreals devProbs;
try {
devProbs.resize(powerOf2(qubits.size()));
} catch (thrust::system_error &e) {
error_thrustTempGpuAllocFailed();
}
devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws

kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads,
Expand All @@ -1405,23 +1401,21 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
#if COMPILE_CUDA || COMPILE_CUQUANTUM

// we decouple numColsPerNode and numThreads for clarity
// (and in case parallelisation granularity ever changes)
qindex numColsPerNode = powerOf2(qureg.logNumColsPerNode);
qindex numThreads = numColsPerNode;
// (and in case parallelisation granularity ever changes);
qindex numThreads = powerOf2(qureg.logNumColsPerNode);
qindex numBlocks = getNumBlocks(numThreads);

qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
qindex numAmpsPerCol = powerOf2(qureg.numQubits);

// allocate exponentially-big temporary memory (error if failed)
devints devQubits = qubits;
devreals devProbs;
try {
devProbs = devreals(powerOf2(qubits.size()), 0);
} catch (thrust::system_error &e) {
error_thrustTempGpuAllocFailed();
}
devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws

kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads,
numColsPerNode, qureg.rank, qureg.logNumAmpsPerNode,
getPtr(devProbs), toCuQcomps(qureg.gpuAmps),
numThreads, firstDiagInd, numAmpsPerCol,
qureg.rank, qureg.logNumAmpsPerNode,
getPtr(devQubits), devQubits.size()
);

Expand Down Expand Up @@ -1631,19 +1625,19 @@ void gpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto

assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);

qreal norm = 1 / sqrt(prob);
qreal renorm = 1 / sqrt(prob);

#if COMPILE_CUQUANTUM

// cuQuantum disregards NumQubits template param
cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, norm);
cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, renorm);

#elif COMPILE_CUDA

thrust_statevec_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, norm);
thrust_statevec_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);

#else
(void) norm; // silence not-used
(void) renorm; // silence not-used
error_gpuSimButGpuNotCompiled();
#endif
}
Expand All @@ -1654,8 +1648,8 @@ void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto

#if COMPILE_CUDA || COMPILE_CUQUANTUM

qreal norm = 1 / prob;
thrust_densmatr_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, norm);
qreal renorm = 1 / prob;
thrust_densmatr_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);

#else
error_gpuSimButGpuNotCompiled();
Expand Down
40 changes: 29 additions & 11 deletions quest/src/gpu/gpu_thrust.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <thrust/complex.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <thrust/inner_product.h>
Expand All @@ -61,24 +62,39 @@

using devints = thrust::device_vector<int>;

int* getPtr(devints qubits) {
int* getPtr(devints& qubits) {

return thrust::raw_pointer_cast(qubits.data());
}


using devreals = thrust::device_vector<qreal>;

qreal* getPtr(devreals reals) {
qreal* getPtr(devreals& reals) {

return thrust::raw_pointer_cast(reals.data());
}

void copyFromDeviceVec(devreals reals, qreal* out) {
void copyFromDeviceVec(devreals& reals, qreal* out) {

thrust::copy(reals.begin(), reals.end(), out);
}

devreals getDeviceRealsVec(qindex dim) {

devreals out;

try {
out.resize(dim);
thrust::fill(out.begin(), out.end(), 0.);

} catch (thrust::system_error &e) {
error_thrustTempGpuAllocFailed();
}

return out;
}



/*
Expand Down Expand Up @@ -270,12 +286,13 @@ struct functor_getExpecDensMatrDiagMatrTerm : public thrust::unary_function<qind

__device__ cu_qcomp operator()(qindex n) {

qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);

cu_qcomp elem = elems[n];

if constexpr (HasPower)
elem = getCompPower(elem, expo);

qindex i = fast_getLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);

return amps[i] * elem;
}
};
Expand Down Expand Up @@ -508,14 +525,15 @@ template <int NumTargets>
struct functor_projectDensMatr : public thrust::binary_function<qindex,cu_qcomp,cu_qcomp> {

// this functor multiplies an amp with zero or a
// renormalisation codfficient, depending on whether
// renormalisation coefficient, depending on whether
// the basis state of the amp has qubits in a particular
// configuration. This is used to project density matrix
// qubits into a particular measurement outcome

int* targetsPtr;
int numTargets, rank, numQuregQubits;
qindex logNumAmpsPerNode, retainValue, renorm;
qindex logNumAmpsPerNode, retainValue;
qreal renorm;

functor_projectDensMatr(
int* targetsPtr, int numTargets, int rank, int numQuregQubits,
Expand Down Expand Up @@ -971,13 +989,13 @@ cu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDi


template <int NumQubits>
void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal norm) {
void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal renorm) {

devints devQubits = qubits;
qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
auto projFunctor = functor_projectStateVec<NumQubits>(
getPtr(devQubits), qubits.size(), qureg.rank,
qureg.logNumAmpsPerNode, retainValue, norm);
qureg.logNumAmpsPerNode, retainValue, renorm);

auto indIter = thrust::make_counting_iterator(0);
auto ampIter = getStartPtr(qureg);
Expand All @@ -988,13 +1006,13 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, ve


template <int NumQubits>
void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal norm) {
void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal renorm) {

devints devQubits = qubits;
qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
auto projFunctor = functor_projectDensMatr<NumQubits>(
getPtr(devQubits), qubits.size(), qureg.rank, qureg.numQubits,
qureg.logNumAmpsPerNode, retainValue, norm);
qureg.logNumAmpsPerNode, retainValue, renorm);

auto indIter = thrust::make_counting_iterator(0);
auto ampIter = getStartPtr(qureg);
Expand Down

0 comments on commit 2525681

Please sign in to comment.