
Commit

cudastf (examples): Fix compiler errors when enabling examples for CUDA STF (#3516)

* Make index types consistent in loops
* Add implementation of missing operator (a hypothetical sketch follows the diffs below)
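
A minimal sketch of the kind of error the first bullet fixes (hypothetical names, not taken from this commit's diff): an index declared as int but compared against a size_t bound triggers -Wsign-compare, which builds that treat warnings as errors reject.

    #include <cstddef>

    // Hypothetical helper for illustration; fill_blocks and nt are not
    // names from this commit.
    void fill_blocks(std::size_t nt)
    {
      // Before: "comparison of integer expressions of different signedness"
      // (int vs. std::size_t), which -Werror turns into a hard error:
      //   for (int colb = 0; colb < nt; colb++) { ... }

      // After: the index type matches the type of its bound.
      for (std::size_t colb = 0; colb < nt; colb++)
      {
        // ... fill block colb ...
      }
    }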

Co-authored-by: Jan Ciesko <[email protected]>
Co-authored-by: Cédric Augonnet <[email protected]>
3 people authored Jan 28, 2025
1 parent abfb7b4 commit 81b1af6
Showing 6 changed files with 98 additions and 90 deletions.
20 changes: 10 additions & 10 deletions cudax/examples/stf/linear_algebra/06-pdgemm.cu
@@ -160,9 +160,9 @@ public:
{
nvtxRangePushA("FILL");
// Fill blocks by blocks
-for (int colb = 0; colb < nt; colb++)
+for (size_t colb = 0; colb < nt; colb++)
{
-for (int rowb = 0; rowb < mt; rowb++)
+for (size_t rowb = 0; rowb < mt; rowb++)
{
// Each task fills a block
auto& h = get_handle(rowb, colb);
@@ -251,9 +251,9 @@ void PDGEMM(stream_ctx& ctx,
double beta,
matrix<double>& C)
{
-for (int m = 0; m < C.mt; m++)
+for (size_t m = 0; m < C.mt; m++)
{
-for (int n = 0; n < C.nt; n++)
+for (size_t n = 0; n < C.nt; n++)
{
//=========================================
// alpha*A*B does not contribute; scale C
@@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx,
if (transb == CUBLAS_OP_N)
{
assert(A.nt == B.mt);
-for (int k = 0; k < A.nt; k++)
+for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n);
@@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx,
//=====================================
else
{
-for (int k = 0; k < A.nt; k++)
+for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n);
@@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx,
//=====================================
if (transb == CUBLAS_OP_N)
{
-for (int k = 0; k < A.mt; k++)
+for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n);
@@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx,
//==========================================
else
{
-for (int k = 0; k < A.mt; k++)
+for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n);
@@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB)
cuda_safe_call(cudaGetDeviceCount(&ndevs));

/* Warm up allocators */
-for (size_t d = 0; d < ndevs; d++)
+for (int d = 0; d < ndevs; d++)
{
auto lX = ctx.logical_data(shape_of<slice<double>>(1));
ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {};
}

/* Initializes CUBLAS on all devices */
-for (size_t d = 0; d < ndevs; d++)
+for (int d = 0; d < ndevs; d++)
{
cuda_safe_call(cudaSetDevice(d));
get_cublas_handle();
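
Note that the device loops in run() above move in the opposite direction, from size_t to int: cudaGetDeviceCount reports the count through an int, so an int index is the type-consistent choice there. A minimal standalone sketch (error checking omitted):

    #include <cuda_runtime.h>

    int main()
    {
      int ndevs = 0;
      cudaGetDeviceCount(&ndevs); // the CUDA runtime reports the count as an int

      // Consistent: an int index compared against an int bound.
      for (int d = 0; d < ndevs; d++)
      {
        cudaSetDevice(d); // e.g. per-device setup such as warming allocators
      }
      return 0;
    }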
42 changes: 21 additions & 21 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu
@@ -91,10 +91,10 @@ public:

handles.resize(mt * nt);

-for (int colb = 0; colb < nt; colb++)
+for (size_t colb = 0; colb < nt; colb++)
{
int low_rowb = sym_matrix ? colb : 0;
-for (int rowb = low_rowb; rowb < mt; rowb++)
+for (size_t rowb = low_rowb; rowb < mt; rowb++)
{
T* addr_h = get_block_h(rowb, colb);
auto& h = handle(rowb, colb);
@@ -171,10 +171,10 @@ public:
{
nvtxRangePushA("FILL");
// Fill blocks by blocks
-for (int colb = 0; colb < nt; colb++)
+for (size_t colb = 0; colb < nt; colb++)
{
int low_rowb = sym_matrix ? colb : 0;
-for (int rowb = low_rowb; rowb < mt; rowb++)
+for (size_t rowb = low_rowb; rowb < mt; rowb++)
{
// Each task fills a block
auto& h = handle(rowb, colb);
@@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix<double>* A, double* result)
reserved::dot::set_current_color("red");
#endif

-for (int rowb = 0; rowb < A->mt; rowb++)
+for (size_t rowb = 0; rowb < A->mt; rowb++)
{
-for (int colb = 0; colb < A->nt; colb++)
+for (size_t colb = 0; colb < A->nt; colb++)
{
ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) {
double res2 = 0.0;
@@ -452,17 +452,17 @@ void PDTRSM(cublasSideMode_t side,
//===========================================
if (trans == CUBLAS_OP_N)
{
-for (int k = 0; k < B.mt; k++)
+for (size_t k = 0; k < B.mt; k++)
{
double lalpha = k == 0 ? alpha : 1.0;
-for (int n = 0; n < B.nt; n++)
+for (size_t n = 0; n < B.nt; n++)
{
cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k)));
DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n);
}
-for (int m = k + 1; m < B.mt; m++)
+for (size_t m = k + 1; m < B.mt; m++)
{
-for (int n = 0; n < B.nt; n++)
+for (size_t n = 0; n < B.nt; n++)
{
cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k)));
DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n);
@@ -475,17 +475,17 @@ void PDTRSM(cublasSideMode_t side,
//================================================
else
{
-for (int k = 0; k < B.mt; k++)
+for (size_t k = 0; k < B.mt; k++)
{
double lalpha = k == 0 ? alpha : 1.0;
-for (int n = 0; n < B.nt; n++)
+for (size_t n = 0; n < B.nt; n++)
{
cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1)));
DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n);
}
-for (int m = k + 1; m < B.mt; m++)
+for (size_t m = k + 1; m < B.mt; m++)
{
-for (int n = 0; n < B.nt; n++)
+for (size_t n = 0; n < B.nt; n++)
{
cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m)));
DGEMM(
Expand Down Expand Up @@ -543,9 +543,9 @@ void PDGEMM(cublasOperation_t transa,
reserved::dot::set_current_color("blue");
#endif

-for (int m = 0; m < C.mt; m++)
+for (size_t m = 0; m < C.mt; m++)
{
-for (int n = 0; n < C.nt; n++)
+for (size_t n = 0; n < C.nt; n++)
{
//=========================================
// alpha*A*B does not contribute; scale C
@@ -562,7 +562,7 @@ void PDGEMM(cublasOperation_t transa,
//================================
if (transb == CUBLAS_OP_N)
{
-for (int k = 0; k < A.nt; k++)
+for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n);
@@ -573,7 +573,7 @@ void PDGEMM(cublasOperation_t transa,
//=====================================
else
{
-for (int k = 0; k < A.nt; k++)
+for (size_t k = 0; k < A.nt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n);
@@ -587,7 +587,7 @@ void PDGEMM(cublasOperation_t transa,
//=====================================
if (transb == CUBLAS_OP_N)
{
-for (int k = 0; k < A.mt; k++)
+for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n);
@@ -598,7 +598,7 @@ void PDGEMM(cublasOperation_t transa,
//==========================================
else
{
-for (int k = 0; k < A.mt; k++)
+for (size_t k = 0; k < A.mt; k++)
{
double zbeta = k == 0 ? beta : 1.0;
DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n);
@@ -640,7 +640,7 @@ int main(int argc, char** argv)
int ndevs;
cuda_safe_call(cudaGetDeviceCount(&ndevs));

-for (size_t d = 0; d < ndevs; d++)
+for (int d = 0; d < ndevs; d++)
{
auto lX = ctx.logical_data(shape_of<slice<double>>(1));
ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {};
(The diffs for the remaining four changed files did not load.)
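
The second bullet's change, the missing operator, sits in one of the files whose diff did not load above. As a purely hypothetical illustration (block_index is an invented type, not the commit's code), the usual shape of such a fix before C++20 is an operator!= defined in terms of an existing operator==:

    #include <cstddef>

    struct block_index // invented type, for illustration only
    {
      std::size_t pos;

      bool operator==(const block_index& other) const
      {
        return pos == other.pos;
      }

      // Pre-C++20 code cannot synthesize != from ==, so expressions like
      // a != b fail to compile until this is added; defining it in terms
      // of == keeps the two consistent.
      bool operator!=(const block_index& other) const
      {
        return !(*this == other);
      }
    };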
