From d4773bcf5ffeda79619d8028ead83e14c703c2ed Mon Sep 17 00:00:00 2001 From: Maximilian Behr Date: Wed, 21 Feb 2024 09:32:11 +0100 Subject: [PATCH] added leadning dimension to interface --- CMakeLists.txt | 2 +- README.md | 11 +++---- cuexpm.cu | 71 ++++++++++++++++++++++++---------------------- cuexpm.h | 8 +++--- example_cuexpmc.cu | 2 +- example_cuexpmd.cu | 2 +- example_cuexpms.cu | 2 +- example_cuexpmz.cu | 2 +- 8 files changed, 52 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35fbaa3..b2d3f89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ # SOFTWARE. cmake_minimum_required(VERSION 3.23) -project(cuexpm LANGUAGES C CUDA VERSION 1.0.2) +project(cuexpm LANGUAGES C CUDA VERSION 2.0.0) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/README.md b/README.md index fc3299c..d11ca02 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ![GitHub Release](https://img.shields.io/github/v/release/maximilianbehr/cuexpm?display_name=release&style=flat) ![GitHub Downloads (all assets, all releases)](https://img.shields.io/github/downloads/maximilianbehr/cuexpm/total) -**Version:** 1.0.2 +**Version:** 2.0.0 **Copyright:** Maximilian Behr @@ -27,21 +27,22 @@ Available functions: ```C int cuexpms_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); -int cuexpms(const float *d_A, const int n, void *d_buffer, void *h_buffer, float *d_expmA); +int cuexpms(const int n, const float *d_A, const int ldA, void *d_buffer, void *h_buffer, float *d_expmA, const int ldexpmA); ``` ```C int cuexpmd_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); -int cuexpmd(const double *d_A, const int n, void *d_buffer, void *h_buffer, double *d_expmA); +int cuexpmd(const int n, const double *d_A, const int ldA, void *d_buffer, void *h_buffer, double *d_expmA, const int ldexpmA); ``` ```C int cuexpmc_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); -int cuexpmc(const cuComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuComplex *d_expmA); +int cuexpmc(const int n, const cuComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuComplex *d_expmA, const int ldexpmA); ``` ```C int cuexpmz_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); -int cuexpmz(const cuDoubleComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA); +int cuexpmz(const int n, const cuDoubleComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA, const int ldexpmA); ``` + ## Algorithm `cuexpm` implements the scaling and squaring method for the matrix exponential approximation. diff --git a/cuexpm.cu b/cuexpm.cu index 7047a1e..4866347 100644 --- a/cuexpm.cu +++ b/cuexpm.cu @@ -292,22 +292,22 @@ struct cuexpm_traits { }; template -__global__ static void cuexpm_absrowsums(const T *__restrict__ d_A, const int n, double *__restrict__ buffer) { +__global__ static void cuexpm_absrowsums(const int n, const T *__restrict__ d_A, const int ldA, double *__restrict__ buffer) { int i = threadIdx.x + blockIdx.x * blockDim.x; for (int j = i; j < n; j += blockDim.x * gridDim.x) { double tmp = 0.; for (int k = 0; k < n; ++k) { - tmp += cuexpm_traits::abs(d_A[k + j * n]); + tmp += cuexpm_traits::abs(d_A[k + j * ldA]); } buffer[j] = tmp; } } template -static int cuexpm_matrix1norm(const T *__restrict__ d_A, const int n, void *d_buffer, double *__restrict__ d_nrmA1) { +static int cuexpm_matrix1norm(const int n, const T *__restrict__ d_A, const int ldA, void *d_buffer, double *__restrict__ d_nrmA1) { *d_nrmA1 = 0.; double *buffer = reinterpret_cast(d_buffer); - cuexpm_absrowsums<<<(n + 255) / 256, 256>>>(d_A, n, buffer); + cuexpm_absrowsums<<<(n + 255) / 256, 256>>>(n, d_A, ldA, buffer); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUDA(cudaDeviceSynchronize()); *d_nrmA1 = *(thrust::max_element(thrust::device_pointer_cast(buffer), thrust::device_pointer_cast(buffer + n))); @@ -315,9 +315,9 @@ static int cuexpm_matrix1norm(const T *__restrict__ d_A, const int n, void *d_bu } template -static int cuexpm_parameters(const T *__restrict__ d_A, const int n, void *d_buffer, int *m, int *s) { +static int cuexpm_parameters(const int n, const T *__restrict__ d_A, const int ldA, void *d_buffer, int *m, int *s) { double eta1 = 0.; - CHECK_CUEXPM(cuexpm_matrix1norm(d_A, n, d_buffer, &eta1)); + CHECK_CUEXPM(cuexpm_matrix1norm(n, d_A, ldA, d_buffer, &eta1)); if constexpr (std::is_same::value || std::is_same::value) { const double theta[] = {1.495585217958292e-002, 2.539398330063230e-001, 9.504178996162932e-001, 2.97847961257068e+000, 5.371920351148152e+000}; *s = 0; @@ -363,15 +363,15 @@ static int cuexpm_parameters(const T *__restrict__ d_A, const int n, void *d_buf } template -__global__ static void setDiag(T *d_A, const int n, const T alpha) { +__global__ static void setDiag(const int n, T *d_A, const int ldA, const T alpha) { int i0 = threadIdx.x + blockIdx.x * blockDim.x; int j0 = threadIdx.y + blockIdx.y * blockDim.y; for (int i = i0; i < n; i += blockDim.x * gridDim.x) { for (int j = j0; j < n; j += blockDim.y * gridDim.y) { if (i == j) { - d_A[i + j * n] = alpha; + d_A[i + j * ldA] = alpha; } else { - d_A[i + j * n] = cuexpm_traits::zero; + d_A[i + j * ldA] = cuexpm_traits::zero; } } } @@ -390,10 +390,10 @@ __device__ static cuDoubleComplex &operator+=(cuDoubleComplex &a, const cuDouble } template -__global__ static void addDiag(T *d_A, const int n, const T alpha) { +__global__ static void addDiag(const int n, T *d_A, const int ldA, const T alpha) { int i = threadIdx.x + blockIdx.x * blockDim.x; for (int j = i; j < n; j += blockDim.x * gridDim.x) { - d_A[j + j * n] += alpha; + d_A[j + j * ldA] += alpha; } } @@ -454,7 +454,7 @@ int cuexpmz_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize) } template -static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T *d_F) { +static int cuexpm(const int n, const T *d_A, const int ldA, void *d_buffer, void *h_buffer, T *d_F, const int ldF) { /*----------------------------------------------------------------------------- * kernel launch parameters *-----------------------------------------------------------------------------*/ @@ -467,7 +467,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * * compute the scaling parameter and Pade approximant degree *-----------------------------------------------------------------------------*/ int m, s; - CHECK_CUEXPM(cuexpm_parameters(d_A, n, d_buffer, &m, &s)); + CHECK_CUEXPM(cuexpm_parameters(n, d_A, ldA, d_buffer, &m, &s)); // printf("m = %d, s = %d\n", m, s); /*----------------------------------------------------------------------------- @@ -492,7 +492,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * /*----------------------------------------------------------------------------- * rescale T, T = T / 2^s *-----------------------------------------------------------------------------*/ - CHECK_CUDA(cudaMemcpy(T1, d_A, sizeof(*d_A) * n * n, cudaMemcpyDeviceToDevice)); + CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, d_A, ldA, &cuexpm_traits::zero, T1, n, T1, n)); typename cuexpm_traits::S alpha = 1. / (1 << s); if (s != 0) { @@ -521,33 +521,33 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUDA(cudaStreamCreate(&streamV)); if (m == 3) { // U = U + c(3)*T2 + c(1)*I - setDiag<<>>(U, n, cuexpm_traits::Pade3[1]); + setDiag<<>>(n, U, n, cuexpm_traits::Pade3[1]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamU)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade3[3], T2, n, U, n)); // V = V + c(2)*T2 + c(0)*I - setDiag<<>>(V, n, cuexpm_traits::Pade3[0]); + setDiag<<>>(n, V, n, cuexpm_traits::Pade3[0]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamV)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade3[2], T2, n, V, n)); } else if (m == 5) { // U = U + c(5)*T4 + c(3)*T2 + c(1)*I - setDiag<<>>(U, n, cuexpm_traits::Pade5[1]); + setDiag<<>>(n, U, n, cuexpm_traits::Pade5[1]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamU)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade5[3], T2, n, U, n)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade5[5], T4, n, U, n)); // V = V + c(4)*T4 + c(2)*T2 + c(0)*I - setDiag<<>>(V, n, cuexpm_traits::Pade5[0]); + setDiag<<>>(n, V, n, cuexpm_traits::Pade5[0]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamV)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade5[2], T2, n, V, n)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade5[4], T4, n, V, n)); } else if (m == 7) { // U = U + c(7)*T6 + c(5)*T4 + c(3)*T2 + c(1)*I - setDiag<<>>(U, n, cuexpm_traits::Pade7[1]); + setDiag<<>>(n, U, n, cuexpm_traits::Pade7[1]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamU)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade7[3], T2, n, U, n)); @@ -555,7 +555,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade7[7], T6, n, U, n)); // V = V + c(6)*T6 + c(4)*T4 + c(2)*T2 + c(0)*I - setDiag<<>>(V, n, cuexpm_traits::Pade7[0]); + setDiag<<>>(n, V, n, cuexpm_traits::Pade7[0]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamV)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade7[2], T2, n, V, n)); @@ -563,7 +563,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade7[6], T6, n, V, n)); } else if (m == 9) { // U = U + c(9)*T8 + c(7)*T6 + c(5)*T4 + c(3)*T2 + c(1)*I - setDiag<<>>(U, n, cuexpm_traits::Pade9[1]); + setDiag<<>>(n, U, n, cuexpm_traits::Pade9[1]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamU)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade9[3], T2, n, U, n)); @@ -572,7 +572,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade9[9], T8, n, U, n)); // V = V + c(6)*T6 + c(4)*T4 + c(2)*T2 + c(0)*I - setDiag<<>>(V, n, cuexpm_traits::Pade9[0]); + setDiag<<>>(n, V, n, cuexpm_traits::Pade9[0]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamV)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade9[2], T2, n, V, n)); @@ -581,7 +581,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade9[8], T8, n, V, n)); } else if (m == 13) { // U = T6*(c(13)*T6 + c(11)*T4 + c(9)*T2) + c(7)*T6 + c(5)*T4 + c(3)*T2 + c(1)*I; - setDiag<<>>(U, n, cuexpm_traits::Pade13[1]); + setDiag<<>>(n, U, n, cuexpm_traits::Pade13[1]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamU)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, U, n, &cuexpm_traits::Pade13[3], T2, n, U, n)); @@ -592,7 +592,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUBLAS(cuexpm_traits::cublasXgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &cuexpm_traits::one, T6, n, T8, n, &cuexpm_traits::one, U, n)); // V = T6*(c(12)*T6 + c(10)*T4 + c(8)*T2) + c(6)*T6 + c(4)*T4 + c(2)*T2 + c(0)*I; - setDiag<<>>(V, n, cuexpm_traits::Pade13[0]); + setDiag<<>>(n, V, n, cuexpm_traits::Pade13[0]); CHECK_CUDA(cudaPeekAtLastError()); CHECK_CUBLAS(cublasSetStream(cublasH, streamV)); CHECK_CUBLAS(cuexpm_traits::cublasXgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, &cuexpm_traits::one, V, n, &cuexpm_traits::Pade13[2], T2, n, V, n)); @@ -657,7 +657,7 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * CHECK_CUSOLVER(cusolverDnDestroyParams(params)); // add identity - addDiag<<>>(U, n, cuexpm_traits::one); + addDiag<<>>(n, U, n, cuexpm_traits::one); CHECK_CUDA(cudaPeekAtLastError()); /*----------------------------------------------------------------------------- @@ -669,9 +669,12 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * s--; } + int ldU = n; + int ldF_ = ldF; // save original ldF for (int k = 0; k < s; ++k) { - CHECK_CUBLAS(cuexpm_traits::cublasXgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &cuexpm_traits::one, U, n, U, n, &cuexpm_traits::zero, d_F, n)); + CHECK_CUBLAS(cuexpm_traits::cublasXgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &cuexpm_traits::one, U, ldU, U, ldU, &cuexpm_traits::zero, d_F, ldF_)); std::swap(U, d_F); + std::swap(ldU, ldF_); } CHECK_CUDA(cudaDeviceSynchronize()); @@ -683,18 +686,18 @@ static int cuexpm(const T *d_A, const int n, void *d_buffer, void *h_buffer, T * return 0; } -int cuexpms(const float *d_A, const int n, void *d_buffer, void *h_buffer, float *d_expmA) { - return cuexpm(d_A, n, d_buffer, h_buffer, d_expmA); +int cuexpms(const int n, const float *d_A, const int ldA, void *d_buffer, void *h_buffer, float *d_expmA, const int ldexpmA) { + return cuexpm(n, d_A, ldA, d_buffer, h_buffer, d_expmA, ldexpmA); } -int cuexpmd(const double *d_A, const int n, void *d_buffer, void *h_buffer, double *d_expmA) { - return cuexpm(d_A, n, d_buffer, h_buffer, d_expmA); +int cuexpmd(const int n, const double *d_A, const int ldA, void *d_buffer, void *h_buffer, double *d_expmA, const int ldexpmA) { + return cuexpm(n, d_A, ldA, d_buffer, h_buffer, d_expmA, ldexpmA); } -int cuexpmc(const cuComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuComplex *d_expmA) { - return cuexpm(d_A, n, d_buffer, h_buffer, d_expmA); +int cuexpmc(const int n, const cuComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuComplex *d_expmA, const int ldexpmA) { + return cuexpm(n, d_A, ldA, d_buffer, h_buffer, d_expmA, ldexpmA); } -int cuexpmz(const cuDoubleComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA) { - return cuexpm(d_A, n, d_buffer, h_buffer, d_expmA); +int cuexpmz(const int n, const cuDoubleComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA, const int ldexpmA) { + return cuexpm(n, d_A, ldA, d_buffer, h_buffer, d_expmA, ldexpmA); } diff --git a/cuexpm.h b/cuexpm.h index 7398c9e..4c7ddc4 100644 --- a/cuexpm.h +++ b/cuexpm.h @@ -32,10 +32,10 @@ int cuexpms_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); int cuexpmd_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); int cuexpmc_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); int cuexpmz_bufferSize(const int n, size_t *d_bufferSize, size_t *h_bufferSize); -int cuexpms(const float *d_A, const int n, void *d_buffer, void *h_buffer, float *d_expmA); -int cuexpmd(const double *d_A, const int n, void *d_buffer, void *h_buffer, double *d_expmA); -int cuexpmc(const cuComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuComplex *d_expmA); -int cuexpmz(const cuDoubleComplex *d_A, const int n, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA); +int cuexpms(const int n, const float *d_A, const int ldA, void *d_buffer, void *h_buffer, float *d_expmA, const int ldexpmA); +int cuexpmd(const int n, const double *d_A, const int ldA, void *d_buffer, void *h_buffer, double *d_expmA, const int ldexpmA); +int cuexpmc(const int n, const cuComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuComplex *d_expmA, const int ldexpmA); +int cuexpmz(const int n, const cuDoubleComplex *d_A, const int ldA, void *d_buffer, void *h_buffer, cuDoubleComplex *d_expmA, const int ldexpmA); #ifdef __cplusplus } #endif diff --git a/example_cuexpmc.cu b/example_cuexpmc.cu index 0982015..9376006 100644 --- a/example_cuexpmc.cu +++ b/example_cuexpmc.cu @@ -90,7 +90,7 @@ int main(void) { * compute the approximation of the matrix exponential of A and measure the time *-----------------------------------------------------------------------------*/ auto t0 = std::chrono::high_resolution_clock::now(); - ret = cuexpmc(d_A, n, d_buffer, h_buffer, d_expmA); + ret = cuexpmc(n, d_A, n, d_buffer, h_buffer, d_expmA, n); if (ret) { fprintf(stderr, "cuexpmc failed with error %d\n", ret); fflush(stderr); diff --git a/example_cuexpmd.cu b/example_cuexpmd.cu index 1ec7e51..f173be7 100644 --- a/example_cuexpmd.cu +++ b/example_cuexpmd.cu @@ -90,7 +90,7 @@ int main(void) { * compute the approximation of the matrix exponential of A and measure the time *-----------------------------------------------------------------------------*/ auto t0 = std::chrono::high_resolution_clock::now(); - ret = cuexpmd(d_A, n, d_buffer, h_buffer, d_expmA); + ret = cuexpmd(n, d_A, n, d_buffer, h_buffer, d_expmA, n); if (ret) { fprintf(stderr, "cuexpmd failed with error %d\n", ret); fflush(stderr); diff --git a/example_cuexpms.cu b/example_cuexpms.cu index aef066f..a1907e3 100644 --- a/example_cuexpms.cu +++ b/example_cuexpms.cu @@ -90,7 +90,7 @@ int main(void) { * compute the approximation of the matrix exponential of A and measure the time *-----------------------------------------------------------------------------*/ auto t0 = std::chrono::high_resolution_clock::now(); - ret = cuexpms(d_A, n, d_buffer, h_buffer, d_expmA); + ret = cuexpms(n, d_A, n, d_buffer, h_buffer, d_expmA, n); if (ret) { fprintf(stderr, "cuexpms failed with error %d\n", ret); fflush(stderr); diff --git a/example_cuexpmz.cu b/example_cuexpmz.cu index 0bb9ef5..3811dc6 100644 --- a/example_cuexpmz.cu +++ b/example_cuexpmz.cu @@ -90,7 +90,7 @@ int main(void) { * compute the approximation of the matrix exponential of A and measure the time *-----------------------------------------------------------------------------*/ auto t0 = std::chrono::high_resolution_clock::now(); - ret = cuexpmz(d_A, n, d_buffer, h_buffer, d_expmA); + ret = cuexpmz(n, d_A, n, d_buffer, h_buffer, d_expmA, n); if (ret) { fprintf(stderr, "cuexpmz failed with error %d\n", ret); fflush(stderr);