From ee7a30ef4ffb1a00b6993b2bd475c7240c3a8e7d Mon Sep 17 00:00:00 2001 From: rigaya Date: Tue, 12 Mar 2024 22:11:11 +0900 Subject: [PATCH] =?UTF-8?q?denoise-dct,=20smooth=E3=81=AEdct/idct=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=81=A7=E5=90=8C=E6=9C=9F=E3=81=8C=E4=B8=8D=E8=B6=B3?= =?UTF-8?q?=E3=81=97=E3=81=A6=E3=81=84=E3=81=9F=E3=81=AE=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mppcore/rgy_filter_denoise_dct.cl | 60 ++++++++++++++++++------------ mppcore/rgy_filter_denoise_dct.cpp | 10 +++-- mppcore/rgy_filter_smooth.cl | 2 + 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/mppcore/rgy_filter_denoise_dct.cl b/mppcore/rgy_filter_denoise_dct.cl index c1f8427..52bf7f7 100644 --- a/mppcore/rgy_filter_denoise_dct.cl +++ b/mppcore/rgy_filter_denoise_dct.cl @@ -7,6 +7,7 @@ // DENOISE_SHARED_BLOCK_NUM_X // DENOISE_SHARED_BLOCK_NUM_Y // DENOISE_LOOP_COUNT_BLOCK +// DCT_IDCT_BARRIER //#define DENOISE_BLOCK_SIZE_X (8) //ひとつのスレッドブロックの担当するx方向の8x8ブロックの数 // @@ -322,25 +323,37 @@ void CUDAsubroutineInplaceIDCT16vector(__local TypeTmp *Vect00, const int Step) (*Vect15) = 0.176776695296637f * (x1d + x1f) - 0.25f*x1e; } -void dctBlock(__local TypeTmp shared_tmp[BLOCK_SIZE][BLOCK_SIZE + 1], const int thWorker) { +//こうしたバリアには全スレッドが通るようにしないとRX5500などでは正常に動作しない (他の箇所でbarrierしても意味がない) +//なので、計算の有無はenableフラグで切り替える +void dctBlock(const bool enable, __local TypeTmp shared_tmp[BLOCK_SIZE][BLOCK_SIZE + 1], const int thWorker) { //static_assert(BLOCK_SIZE == 8 || BLOCK_SIZE == 16, "BLOCK_SIZE must be 8 or 16"); if (BLOCK_SIZE == 8) { - CUDAsubroutineInplaceDCT8vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row - CUDAsubroutineInplaceDCT8vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (enable) CUDAsubroutineInplaceDCT8vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceDCT8vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); } else if (BLOCK_SIZE == 16) { - CUDAsubroutineInplaceDCT16vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row - CUDAsubroutineInplaceDCT16vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (enable) CUDAsubroutineInplaceDCT16vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceDCT16vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); } } -void idctBlock(__local TypeTmp shared_tmp[BLOCK_SIZE][BLOCK_SIZE + 1], const int thWorker) { +void idctBlock(const bool enable, __local TypeTmp shared_tmp[BLOCK_SIZE][BLOCK_SIZE + 1], const int thWorker) { //static_assert(BLOCK_SIZE == 8 || BLOCK_SIZE == 16, "BLOCK_SIZE must be 8 or 16"); if (BLOCK_SIZE == 8) { - CUDAsubroutineInplaceIDCT8vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column - CUDAsubroutineInplaceIDCT8vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceIDCT8vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceIDCT8vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); } else if (BLOCK_SIZE == 16) { - CUDAsubroutineInplaceIDCT16vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column - CUDAsubroutineInplaceIDCT16vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceIDCT16vector((__local TypeTmp *)&shared_tmp[0][thWorker], BLOCK_SIZE + 1); // column + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); + if (enable) CUDAsubroutineInplaceIDCT16vector((__local TypeTmp *)&shared_tmp[thWorker][0], 1); // row + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); } } @@ -431,6 +444,7 @@ void directAddBlock( } void filter_block( + const bool enable, const __global char *const __restrict__ ptrSrc, const int srcPitch, SHARED_TMP, SHARED_OUT, @@ -440,13 +454,13 @@ void filter_block( const int width, const int height, const float threshold) { #if 1 - loadBlocktmp(shared_tmp, local_bx, thWorker, ptrSrc, srcPitch, block_x, block_y, width, height); - dctBlock(shared_tmp[local_bx], thWorker); + if (enable) loadBlocktmp(shared_tmp, local_bx, thWorker, ptrSrc, srcPitch, block_x, block_y, width, height); + dctBlock(enable, shared_tmp[local_bx], thWorker); thresholdBlock(shared_tmp[local_bx], thWorker, threshold); - idctBlock(shared_tmp[local_bx], thWorker); - addBlocktmp(shared_out, shared_block_x, shared_block_y, shared_tmp, local_bx, thWorker); + idctBlock(enable, shared_tmp[local_bx], thWorker); + if (enable) addBlocktmp(shared_out, shared_block_x, shared_block_y, shared_tmp, local_bx, thWorker); #else - directAddBlock(shared_out, shared_block_x, shared_block_y, thWorker, ptrSrc, srcPitch, block_x, block_y, width, height); + if (enable) directAddBlock(shared_out, shared_block_x, shared_block_y, thWorker, ptrSrc, srcPitch, block_x, block_y, width, height); #endif } @@ -490,8 +504,8 @@ __kernel void kernel_denoise_dct( SHARED_TMP; SHARED_OUT; - #define FILTER_BLOCK(SHARED_X, SHARED_Y, X, Y) \ - { filter_block(ptrSrc, srcPitch, shared_tmp, shared_out, local_bx, thWorker, (SHARED_X), (SHARED_Y), (X), (Y), width, height, threshold); } + #define FILTER_BLOCK(enable, SHARED_X, SHARED_Y, X, Y) \ + { filter_block((enable), ptrSrc, srcPitch, shared_tmp, shared_out, local_bx, thWorker, (SHARED_X), (SHARED_Y), (X), (Y), width, height, threshold); } { // SHARED_OUTの初期化 clearSharedOut(shared_out, local_bx, thWorker); @@ -505,10 +519,10 @@ __kernel void kernel_denoise_dct( for (int ix_loop = 0; ix_loop < BLOCK_SIZE; ix_loop += STEP) { const int x = block_x + ix_loop; const int shared_x = local_bx * BLOCK_SIZE + ix_loop; - if (local_bx < 1) { // x方向の事前計算 - FILTER_BLOCK(shared_x, shared_y, x - BLOCK_SIZE, y); + { // local_bx < 1 のときのみ実行、enable引数で切りかえる + FILTER_BLOCK(local_bx < 1, shared_x, shared_y, x - BLOCK_SIZE, y); // x方向の事前計算 } - FILTER_BLOCK(shared_x + BLOCK_SIZE, shared_y, x, y); + FILTER_BLOCK(true, shared_x + BLOCK_SIZE, shared_y, x, y); barrier(CLK_LOCAL_MEM_FENCE); } } @@ -521,10 +535,10 @@ __kernel void kernel_denoise_dct( for (int ix_loop = 0; ix_loop < BLOCK_SIZE; ix_loop += STEP) { const int x = block_x + ix_loop; const int shared_x = local_bx * BLOCK_SIZE + ix_loop; - if (local_bx < 1) { // x方向の事前計算 - FILTER_BLOCK(shared_x, shared_y, x - BLOCK_SIZE, y); + { // local_bx < 1 のときのみ実行、enable引数で切りかえる + FILTER_BLOCK(local_bx < 1, shared_x, shared_y, x - BLOCK_SIZE, y); } - FILTER_BLOCK(shared_x + BLOCK_SIZE, shared_y, x, y); + FILTER_BLOCK(true, shared_x + BLOCK_SIZE, shared_y, x, y); barrier(CLK_LOCAL_MEM_FENCE); } for (int iy = 0; iy < STEP; iy++) { diff --git a/mppcore/rgy_filter_denoise_dct.cpp b/mppcore/rgy_filter_denoise_dct.cpp index 081a784..572f106 100644 --- a/mppcore/rgy_filter_denoise_dct.cpp +++ b/mppcore/rgy_filter_denoise_dct.cpp @@ -38,6 +38,8 @@ #define DENOISE_LOOP_COUNT_BLOCK (8) +#define DCT_IDCT_BARRIER (1) + RGY_ERR RGYFilterDenoiseDct::denoiseDct(RGYFrameInfo *pOutputFrame, const RGYFrameInfo *pInputFrame, RGYOpenCLQueue &queue) { auto prm = std::dynamic_pointer_cast(m_param); if (!prm) { @@ -91,7 +93,7 @@ RGY_ERR RGYFilterDenoiseDct::colorDecorrelation(RGYFrameInfo *pOutputFrame, cons } { const char *kernel_name = "kernel_color_decorrelation"; - RGYWorkSize local(64, 8); + RGYWorkSize local(64, 4); RGYWorkSize global(planeInputR.width, planeInputR.height); auto err = m_dct.get()->kernel(kernel_name).config(queue, local, global).launch( (cl_mem)planeOutputR.ptr[0], (cl_mem)planeOutputG.ptr[0], (cl_mem)planeOutputB.ptr[0], planeOutputR.pitch[0], @@ -126,7 +128,7 @@ RGY_ERR RGYFilterDenoiseDct::colorCorrelation(RGYFrameInfo *pOutputFrame, const } { const char *kernel_name = "kernel_color_correlation"; - RGYWorkSize local(64, 8); + RGYWorkSize local(64, 4); RGYWorkSize global(planeInputR.width, planeInputR.height); auto err = m_dct.get()->kernel(kernel_name).config(queue, local, global).launch( (cl_mem)planeOutputR.ptr[0], (cl_mem)planeOutputG.ptr[0], (cl_mem)planeOutputB.ptr[0], planeOutputR.pitch[0], @@ -289,9 +291,9 @@ RGY_ERR RGYFilterDenoiseDct::init(shared_ptr pParam, shared_ptr< } } const auto options = strsprintf("-D TypePixel=float -D bit_depth=32 -D TypeTmp=float -D BLOCK_SIZE=%d -D STEP=%d" - " -D DENOISE_BLOCK_SIZE_X=%d -D DENOISE_SHARED_BLOCK_NUM_X=%d -D DENOISE_SHARED_BLOCK_NUM_Y=%d -D DENOISE_LOOP_COUNT_BLOCK=%d", + " -D DENOISE_BLOCK_SIZE_X=%d -D DENOISE_SHARED_BLOCK_NUM_X=%d -D DENOISE_SHARED_BLOCK_NUM_Y=%d -D DENOISE_LOOP_COUNT_BLOCK=%d -D DCT_IDCT_BARRIER=%d", prm->dct.block_size, prm->dct.step, - DENOISE_BLOCK_SIZE_X, DENOISE_SHARED_BLOCK_NUM_X, DENOISE_SHARED_BLOCK_NUM_Y, DENOISE_LOOP_COUNT_BLOCK); + DENOISE_BLOCK_SIZE_X, DENOISE_SHARED_BLOCK_NUM_X, DENOISE_SHARED_BLOCK_NUM_Y, DENOISE_LOOP_COUNT_BLOCK, DCT_IDCT_BARRIER); m_dct.set(m_cl->buildResourceAsync(_T("RGY_FILTER_DENOISE_DCT_CL"), _T("EXE_DATA"), options.c_str())); auto err = AllocFrameBuf(prm->frameOut, 1); diff --git a/mppcore/rgy_filter_smooth.cl b/mppcore/rgy_filter_smooth.cl index cedd62e..b9ea6be 100644 --- a/mppcore/rgy_filter_smooth.cl +++ b/mppcore/rgy_filter_smooth.cl @@ -117,6 +117,7 @@ void CUDAsubroutineInplaceIDCTvector(__local TypeDct *Vect0, const int Step) { //こうしたバリアには全スレッドが通るようにしないとRX5500などでは正常に動作しない (他の箇所でbarrierしても意味がない) //なので、計算の有無はenableフラグで切り替える void dct8x8(bool enable, __local TypeDct shared_tmp[8][9], int thWorker) { + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); if (enable) CUDAsubroutineInplaceDCTvector((__local TypeDct *)&shared_tmp[thWorker][0], 1); // row if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); if (enable) CUDAsubroutineInplaceDCTvector((__local TypeDct *)&shared_tmp[0][thWorker], 9); // column @@ -124,6 +125,7 @@ void dct8x8(bool enable, __local TypeDct shared_tmp[8][9], int thWorker) { } void idct8x8(bool enable, __local TypeDct shared_tmp[8][9], int thWorker) { + if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); if (enable) CUDAsubroutineInplaceIDCTvector((__local TypeDct *)&shared_tmp[0][thWorker], 9); // column if (DCT_IDCT_BARRIER) barrier(CLK_LOCAL_MEM_FENCE); if (enable) CUDAsubroutineInplaceIDCTvector((__local TypeDct *)&shared_tmp[thWorker][0], 1); // row