From 81b1af6b1c29c435d4db0fcef2ce8ae63f4ac46d Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 27 Jan 2025 23:53:43 -0700 Subject: [PATCH] cudastf (examples): Fix compiler errors when enabling examples for CUDA STF (#3516) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make index types consistent in loops * Add implementation of missing operator Co-authored-by: Jan Ciesko Co-authored-by: Cédric Augonnet <158148890+caugonnet@users.noreply.github.com> --- .../examples/stf/linear_algebra/06-pdgemm.cu | 20 ++-- .../stf/linear_algebra/07-cholesky.cu | 42 ++++---- cudax/examples/stf/linear_algebra/07-potri.cu | 96 +++++++++---------- cudax/examples/stf/linear_algebra/cg_csr.cu | 8 ++ .../stf/linear_algebra/cg_dense_2D.cu | 18 ++-- cudax/examples/stf/linear_algebra/strassen.cu | 4 +- 6 files changed, 98 insertions(+), 90 deletions(-) diff --git a/cudax/examples/stf/linear_algebra/06-pdgemm.cu b/cudax/examples/stf/linear_algebra/06-pdgemm.cu index 9df5bc3c260..ec131946adb 100644 --- a/cudax/examples/stf/linear_algebra/06-pdgemm.cu +++ b/cudax/examples/stf/linear_algebra/06-pdgemm.cu @@ -160,9 +160,9 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - for (int rowb = 0; rowb < mt; rowb++) + for (size_t rowb = 0; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -251,9 +251,9 @@ void PDGEMM(stream_ctx& ctx, double beta, matrix& C) { - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB) cuda_safe_call(cudaGetDeviceCount(&ndevs)); /* Warm up allocators */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; } /* Initializes CUBLAS on all devices */ - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { cuda_safe_call(cudaSetDevice(d)); get_cublas_handle(); diff --git a/cudax/examples/stf/linear_algebra/07-cholesky.cu b/cudax/examples/stf/linear_algebra/07-cholesky.cu index 02d90fdf74b..5c7947fc5ac 100644 --- a/cudax/examples/stf/linear_algebra/07-cholesky.cu +++ b/cudax/examples/stf/linear_algebra/07-cholesky.cu @@ -91,10 +91,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = handle(rowb, colb); @@ -171,10 +171,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = handle(rowb, colb); @@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix* A, double* result) reserved::dot::set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -452,17 +452,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -475,17 +475,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -543,9 +543,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { //========================================= // alpha*A*B does not contribute; scale C @@ -562,7 +562,7 @@ void PDGEMM(cublasOperation_t transa, //================================ if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -573,7 +573,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -587,7 +587,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -598,7 +598,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -640,7 +640,7 @@ int main(int argc, char** argv) int ndevs; cuda_safe_call(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto lX = ctx.logical_data(shape_of>(1)); ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {}; diff --git a/cudax/examples/stf/linear_algebra/07-potri.cu b/cudax/examples/stf/linear_algebra/07-potri.cu index b17dead1219..e80fbffa663 100644 --- a/cudax/examples/stf/linear_algebra/07-potri.cu +++ b/cudax/examples/stf/linear_algebra/07-potri.cu @@ -93,10 +93,10 @@ public: handles.resize(mt * nt); - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { - int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + size_t low_rowb = sym_matrix ? colb : 0; + for (size_t rowb = low_rowb; rowb < mt; rowb++) { T* addr_h = get_block_h(rowb, colb); auto& h = get_handle(rowb, colb); @@ -173,10 +173,10 @@ public: { nvtxRangePushA("FILL"); // Fill blocks by blocks - for (int colb = 0; colb < nt; colb++) + for (size_t colb = 0; colb < nt; colb++) { int low_rowb = sym_matrix ? colb : 0; - for (int rowb = low_rowb; rowb < mt; rowb++) + for (size_t rowb = low_rowb; rowb < mt; rowb++) { // Each task fills a block auto& h = get_handle(rowb, colb); @@ -804,9 +804,9 @@ void PDNRM2_HOST(matrix* A, double* result) ctx.get_dot()->set_current_color("red"); #endif - for (int rowb = 0; rowb < A->mt; rowb++) + for (size_t rowb = 0; rowb < A->mt; rowb++) { - for (int colb = 0; colb < A->nt; colb++) + for (size_t colb = 0; colb < A->nt; colb++) { ctx.host_launch(A->get_handle(rowb, colb).read())->*[=](auto sA) { double res2 = 0.0; @@ -888,17 +888,17 @@ void PDTRSM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, k))); DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n); @@ -911,17 +911,17 @@ void PDTRSM(cublasSideMode_t side, //================================================ else { - for (int k = 0; k < B.mt; k++) + for (size_t k = 0; k < B.mt; k++) { double lalpha = k == 0 ? alpha : 1.0; - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1))); DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n); } - for (int m = k + 1; m < B.mt; m++) + for (size_t m = k + 1; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m))); DGEMM( @@ -983,9 +983,9 @@ void PDGEMM(cublasOperation_t transa, reserved::dot::set_current_color("blue"); #endif - for (int m = 0; m < C.mt; m++) + for (size_t m = 0; m < C.mt; m++) { - for (int n = 0; n < C.nt; n++) + for (size_t n = 0; n < C.nt; n++) { cuda_try(cudaSetDevice(C.get_preferred_devid(m, n))); @@ -1005,7 +1005,7 @@ void PDGEMM(cublasOperation_t transa, if (transb == CUBLAS_OP_N) { assert(A.nt == B.mt); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n); @@ -1016,7 +1016,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== else { - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n); @@ -1030,7 +1030,7 @@ void PDGEMM(cublasOperation_t transa, //===================================== if (transb == CUBLAS_OP_N) { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n); @@ -1041,7 +1041,7 @@ void PDGEMM(cublasOperation_t transa, //========================================== else { - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { double zbeta = k == 0 ? beta : 1.0; DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n); @@ -1062,22 +1062,22 @@ void PDTRTRI(matrix& A, cublasFillMode_t uplo, cublasDiagType_t diag) nvtxRangePushA("SUBMIT_PDTRTRI"); - for (int k = 0; k < A.nt; k++) + for (size_t k = 0; k < A.nt; k++) { - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, k))); DTRSM(CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, -1.0, A, k, k, A, m, k); } - for (int m = k + 1; m < A.mt; m++) + for (size_t m = k + 1; m < A.mt; m++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, 1.0, A, m, k, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRSM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, diag, 1.0, A, k, k, A, k, n); @@ -1101,20 +1101,20 @@ void PDLAUUM(matrix& A, cublasFillMode_t uplo) nvtxRangePushA("SUBMIT_PDLAUUM"); - for (int k = 0; k < A.mt; k++) + for (size_t k = 0; k < A.mt; k++) { - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(n, n))); DSYRK(CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, 1.0, A, k, n, 1.0, A, n, n); - for (int m = n + 1; m < k; m++) + for (size_t m = n + 1; m < k; m++) { cuda_try(cudaSetDevice(A.get_preferred_devid(m, n))); DGEMM(CUBLAS_OP_T, CUBLAS_OP_N, 1.0, A, k, m, A, k, n, 1.0, A, m, n); } } - for (int n = 0; n < k; n++) + for (size_t n = 0; n < k; n++) { cuda_try(cudaSetDevice(A.get_preferred_devid(k, n))); DTRMM(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, 1.0, A, k, k, A, k, n); @@ -1136,7 +1136,7 @@ void PDSYMM(cublasSideMode_t side, double beta, matrix& C) { - int k, m, n; + size_t k, m, n; double zbeta; double zone = (double) 1.0; @@ -1272,15 +1272,15 @@ void PDTRMM(cublasSideMode_t side, //=========================================== if (trans == CUBLAS_OP_N) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, m, k, B, k, n, 1.0, B, m, n); } @@ -1294,7 +1294,7 @@ void PDTRMM(cublasSideMode_t side, { for (int m = B.mt - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1317,7 +1317,7 @@ void PDTRMM(cublasSideMode_t side, { for (int m = B.mt - 1; m > -1; m--) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1335,13 +1335,13 @@ void PDTRMM(cublasSideMode_t side, //================================================ else { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { DTRMM(side, uplo, trans, diag, alpha, A, m, m, B, m, n); - for (int k = m + 1; k < A.mt; k++) + for (size_t k = m + 1; k < A.mt; k++) { DGEMM(trans, CUBLAS_OP_N, alpha, A, k, m, B, k, n, 1.0, B, m, n); } @@ -1361,7 +1361,7 @@ void PDTRMM(cublasSideMode_t side, { for (int n = B.nt - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1379,15 +1379,15 @@ void PDTRMM(cublasSideMode_t side, //================================================= else { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, n, k, 1.0, B, m, n); } @@ -1402,15 +1402,15 @@ void PDTRMM(cublasSideMode_t side, //============================================ if (trans == CUBLAS_OP_N) { - for (int n = 0; n < B.nt; n++) + for (size_t n = 0; n < B.nt; n++) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); DTRMM(side, uplo, trans, diag, alpha, A, n, n, B, m, n); - for (int k = n + 1; k < A.mt; k++) + for (size_t k = n + 1; k < A.mt; k++) { DGEMM(CUBLAS_OP_N, trans, alpha, B, m, k, A, k, n, 1.0, B, m, n); } @@ -1424,7 +1424,7 @@ void PDTRMM(cublasSideMode_t side, { for (int n = B.nt - 1; n > -1; n--) { - for (int m = 0; m < B.mt; m++) + for (size_t m = 0; m < B.mt; m++) { cuda_try(cudaSetDevice(B.get_preferred_devid(m, n))); @@ -1462,7 +1462,7 @@ void run(int N, int NB) int ndevs; cuda_try(cudaGetDeviceCount(&ndevs)); - for (size_t d = 0; d < ndevs; d++) + for (int d = 0; d < ndevs; d++) { auto ldummy = ctx.logical_data(shape_of>(1)); ctx.task(exec_place::device(d), ldummy.write())->*[](cudaStream_t, auto) { diff --git a/cudax/examples/stf/linear_algebra/cg_csr.cu b/cudax/examples/stf/linear_algebra/cg_csr.cu index 8d99f237050..ec0d3f1c9ff 100644 --- a/cudax/examples/stf/linear_algebra/cg_csr.cu +++ b/cudax/examples/stf/linear_algebra/cg_csr.cu @@ -100,6 +100,14 @@ public: }; } + // Assign constructor + scalar& operator=(scalar const& rhs) + { + ctx = rhs.ctx; + handle = ctx.logical_data(rhs.handle.shape()); + return *this; + } + // Copy constructor scalar(const scalar& a) { diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu index ed2840dd00f..8bd2f957380 100644 --- a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu +++ b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu @@ -65,7 +65,7 @@ public: if (is_tmp) { // There is no physical backing for this temporary vector - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -91,7 +91,7 @@ public: { handles.resize(nblocks); - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { size_t bs = std::min(N - block_size * b, block_size); handles[b] = to_shared(ctx.logical_data(shape_of>(bs))); @@ -107,12 +107,12 @@ public: void fill(const std::function& f) { size_t bs = block_size; - for (int b = 0; b < nblocks; b++) + for (size_t b = 0; b < nblocks; b++) { ctx.task(exec_place::host, handles[b]->write())->*[&f, b, bs](cudaStream_t stream, auto ds) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_row = 0; local_row < ds.extent(0); local_row++) + for (size_t local_row = 0; local_row < ds.extent(0); local_row++) { ds(local_row) = f(local_row + b * bs); } @@ -234,7 +234,7 @@ class scalar DOT(vector& a, class vector& b) scalar global_res(true); // Loop over all blocks, - for (int bid = 0; bid < a.nblocks; bid++) + for (size_t bid = 0; bid < a.nblocks; bid++) { scalar res(true); @@ -267,7 +267,7 @@ void AXPY(const class scalar& alpha, class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->* @@ -286,7 +286,7 @@ void SCALE_AXPY(const scalar& alpha, const class vector& x, class vector& y) assert(x.N == y.N); assert(x.nblocks == y.nblocks); - for (int b = 0; b < x.nblocks; b++) + for (size_t b = 0; b < x.nblocks; b++) { ctx.task(alpha.handle->read(), x.handles[b]->read(), y.handles[b]->rw()) ->*[](cudaStream_t stream, auto dalpha, auto dx, auto dy) { @@ -315,9 +315,9 @@ void GEMV(double alpha, class matrix& a, class vector& x, double beta, class vec size_t block_size = x.block_size; assert(block_size == y.block_size); - for (int row_y = 0; row_y < y.nblocks; row_y++) + for (size_t row_y = 0; row_y < y.nblocks; row_y++) { - for (int row_x = 0; row_x < x.nblocks; row_x++) + for (size_t row_x = 0; row_x < x.nblocks; row_x++) { double local_beta = (row_x == 0) ? beta : 1.0; diff --git a/cudax/examples/stf/linear_algebra/strassen.cu b/cudax/examples/stf/linear_algebra/strassen.cu index 0ceaa26a422..b28167cfcf8 100644 --- a/cudax/examples/stf/linear_algebra/strassen.cu +++ b/cudax/examples/stf/linear_algebra/strassen.cu @@ -417,9 +417,9 @@ void strassen_test(context& ctx, size_t N) cuda_safe_call(cudaHostRegister(B, N * N * sizeof(double), cudaHostRegisterPortable)); cuda_safe_call(cudaHostRegister(C, N * N * sizeof(double), cudaHostRegisterPortable)); - for (int col = 0; col < N; col++) + for (size_t col = 0; col < N; col++) { - for (int row = 0; row < N; row++) + for (size_t row = 0; row < N; row++) { A[row + N * col] = 1.0; B[row + N * col] = -1.0;