From 1ff0a80b97ff27f4b8c117164d83b11ebd36c500 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Fri, 8 Apr 2022 19:13:30 +0800 Subject: [PATCH 1/4] replace for with CUDA_KERNEL_LOOP for index_select_grad_init() in index_select op --- paddle/phi/kernels/funcs/gather.cu.h | 15 +++++---------- paddle/phi/kernels/funcs/scatter.cu.h | 18 ++++++++---------- .../kernels/gpu/index_select_grad_kernel.cu | 5 ++--- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index 59c8c9f3b8f0e..617d249308cda 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memcpy.h" // TODO(paddle-dev): move gpu_primitives.h to phi +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" @@ -110,11 +111,8 @@ void GPUGather(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherCUDAKernel<<>>( p_src, p_index, p_output, index_size, slice_size); @@ -155,11 +153,8 @@ void GPUGatherNd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherNdCUDAKernel<<>>(p_input, g_input_dims, diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 254dd45edb596..5ae93ca250771 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -155,9 +156,8 @@ void GPUScatterAssign(const phi::GPUContext& ctx, // set block and grid num int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? maxGridDimX : grid; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); // if not overwrite mode, init data if (!overwrite) { @@ -188,9 +188,8 @@ void GPUScatterGradForX(const phi::GPUContext& ctx, int64_t block = 512; int64_t n = slice_size * index_size; int64_t height = (n + block - 1) / block; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); ScatterInitCUDAKernel<<>>( p_index, p_output, index_size, slice_size); @@ -230,10 +229,9 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? maxGridDimX : grid; - + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); + ScatterNdCUDAKernel<<>>( p_update, p_index, diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 75ae1bbcd0a08..bdcd6de881802 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -48,10 +48,9 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, template __global__ void index_select_grad_init(T* input_grad, int64_t N) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; + CUDA_KERNEL_LOOP(idx, N) { + input_grad[idx] = 0.0; } - input_grad[idx] = 0.0; } template From ed80e9fd04f148ff569086cfe795b9b2eab51d19 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Fri, 8 Apr 2022 19:26:51 +0800 Subject: [PATCH 2/4] use CUDA_KERNEL_LOOP_TYPE --- paddle/phi/kernels/gpu/index_select_grad_kernel.cu | 5 ++--- paddle/phi/kernels/gpu/index_select_kernel.cu | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index bdcd6de881802..133e428983cdd 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -35,7 +35,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; @@ -47,8 +47,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, template __global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { input_grad[idx] = 0.0; } } diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 38a6582d790f8..0a6ac69cef098 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -32,7 +32,7 @@ __global__ void index_select_cuda_kernel(const T* input, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; From bd32facb3db7222d4308d0c7d5f655dbdee8ff52 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Sun, 10 Apr 2022 11:41:50 +0800 Subject: [PATCH 3/4] fix code style --- paddle/phi/kernels/funcs/scatter.cu.h | 2 +- paddle/phi/kernels/gpu/index_select_grad_kernel.cu | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 5ae93ca250771..87083af3bc6a2 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -231,7 +231,7 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, int64_t n = slice_size * remain_numel; dim3 grid = dim3((n + block - 1) / block); paddle::platform::LimitGridDim(ctx, &grid); - + ScatterNdCUDAKernel<<>>( p_update, p_index, diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 133e428983cdd..972d757811566 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -47,9 +47,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, template __global__ void index_select_grad_init(T* input_grad, int64_t N) { - CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { - input_grad[idx] = 0.0; - } + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { input_grad[idx] = 0.0; } } template From 5a9adccc8e8cae23a4fcddef0273c3cce26aaa88 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Mon, 11 Apr 2022 18:10:31 +0800 Subject: [PATCH 4/4] replace index_select_grad_init with SetConstant --- paddle/phi/kernels/gpu/index_select_grad_kernel.cu | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 972d757811566..84094f4c1ee5a 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_deterministic); @@ -45,11 +46,6 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, } } -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { input_grad[idx] = 0.0; } -} - template void IndexSelectGradKernel(const Context& ctx, const DenseTensor& x, @@ -93,8 +89,8 @@ void IndexSelectGradKernel(const Context& ctx, dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); paddle::platform::LimitGridDim(ctx, &grid_dim); - index_select_grad_init<<>>(in_grad_data, - numel); + phi::funcs::SetConstant index_select_grad_init; + index_select_grad_init(ctx, x_grad, static_cast(0)); if (FLAGS_cudnn_deterministic) { VLOG(2) << "Run grad kernel of index_select with single thread.";