From 6ea0341505da833f978ec7a5dd75ad73f1a3ab14 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Thu, 4 Mar 2021 03:29:31 +0000 Subject: [PATCH 1/2] optimize conv3x3 when group==1 test=develop --- .../cl_kernel/image/conv2d_3x3_opt_kernel.cl | 228 +++++++----------- lite/kernels/opencl/conv_image_compute.cc | 41 +++- 2 files changed, 119 insertions(+), 150 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index ebd0e021b99..ec8bfb66c74 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include - __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_w, __private const int item_h, @@ -36,9 +35,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, const int item_w_id = get_global_id(1); const int item_h_id = get_global_id(2); - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; + // out_width_id_per_blk + int out_w_base_id = mul24(item_ch_id, out_w); int out_w_id0 = item_w_id; int out_w_id1 = out_w_id0 + item_w; int out_w_id2 = out_w_id1 + item_w; @@ -46,12 +44,12 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, int out_w_id4 = out_w_id3 + item_w; // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + int in_h_id = mad24((item_h_id % out_h), stride, (-pad)); + int in_w_id0 = mad24(item_w_id, stride, (-pad)); + int in_w_id1 = mad24(item_w, stride, in_w_id0); + int in_w_id2 = mad24(item_w, stride, in_w_id1); + int in_w_id3 = mad24(item_w, stride, in_w_id2); + int in_w_id4 = mad24(item_w, stride, in_w_id3); #ifdef BIASE_CH @@ -99,81 +97,56 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, #endif CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 filter_trans[4] = {0.0f}; CL_DTYPE4 input[5] = {0.0f}; - int filter_h_val0 = item_ch_id * 4 * 3; - int filter_h_val1 = filter_h_val0 + 3; - int filter_h_val2 = filter_h_val1 + 3; - int filter_h_val3 = filter_h_val2 + 3; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { + int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0; const int in_w_base_id = mul24(ch, in_w); - int filter_w_val = ch * 3; + int filter_w_val = ch << 2; + int filter_h_val = mul24(item_ch_id, 9); for (int h = 0; h < 3; h++) { - int in_h_val = select(out_batch_id * in_h + in_h_id + h, - -1, - (out_batch_id * in_h + in_h_id + h < 0 || - out_batch_id * in_h + in_h_id + h >= in_h)); - + int in_h_val = in_h_id + h; + for (int w = 0; w < 3; w++) { int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + (in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + (in_w_id2 + w < 0 | in_w_id2 + w >= in_w)); int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + (in_w_id3 + w < 0 | in_w_id3 + w >= in_w)); int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + (in_w_id4 + w < 0 | in_w_id4 + w >= in_w)); filter[0] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + (int2)(filter_w_val, filter_h_val)); filter[1] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + (int2)(filter_w_val + 1, filter_h_val)); filter[2] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + (int2)(filter_w_val + 2, filter_h_val)); filter[3] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (CL_DTYPE4)(filter[0].x, - filter[1].x, - filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, - filter[1].y, - filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, - filter[1].z, - filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, - filter[1].w, - filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 + (int2)(filter_w_val + 3, filter_h_val++)); input[0] = READ_IMG_TYPE( CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); @@ -186,37 +159,36 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, input[4] = READ_IMG_TYPE( CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); + output[0] = mad(input[0].x, filter[0], output[0]); + output[1] = mad(input[1].x, filter[0], output[1]); + output[2] = mad(input[2].x, filter[0], output[2]); + output[3] = mad(input[3].x, filter[0], output[3]); + output[4] = mad(input[4].x, filter[0], output[4]); if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); + output[0] = mad(input[0].y, filter[1], output[0]); + output[1] = mad(input[1].y, filter[1], output[1]); + output[2] = mad(input[2].y, filter[1], output[2]); + output[3] = mad(input[3].y, filter[1], output[3]); + output[4] = mad(input[4].y, filter[1], output[4]); } if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); + output[0] = mad(input[0].z, filter[2], output[0]); + output[1] = mad(input[1].z, filter[2], output[1]); + output[2] = mad(input[2].z, filter[2], output[2]); + output[3] = mad(input[3].z, filter[2], output[3]); + output[4] = mad(input[4].z, filter[2], output[4]); } if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); + output[0] = mad(input[0].w, filter[3], output[0]); + output[1] = mad(input[1].w, filter[3], output[1]); + output[2] = mad(input[2].w, filter[3], output[2]); + output[3] = mad(input[3].w, filter[3], output[3]); + output[4] = mad(input[4].w, filter[3], output[4]); } } } } - CL_DTYPE4 alpha[5]; #ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); @@ -324,14 +296,14 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int out_w, __private const int out_h, __read_only image2d_t prelu_alpha) { - // item_id + // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); const int item_h_id = get_global_id(2); - // out_width_id_per_blk and out_batch_id + // out_width_id_per_blk int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; + int out_w_base_id = mul24(item_ch_id, out_w); int out_w_id0 = item_w_id; int out_w_id1 = out_w_id0 + item_w; int out_w_id2 = out_w_id1 + item_w; @@ -339,12 +311,12 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, int out_w_id4 = out_w_id3 + item_w; // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + int in_h_id = mad24((item_h_id % out_h), stride, (-pad)); + int in_w_id0 = mad24(item_w_id, stride, (-pad)); + int in_w_id1 = mad24(item_w, stride, in_w_id0); + int in_w_id2 = mad24(item_w, stride, in_w_id1); + int in_w_id3 = mad24(item_w, stride, in_w_id2); + int in_w_id4 = mad24(item_w, stride, in_w_id3); #ifdef BIASE_CH @@ -392,82 +364,60 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, #endif CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 filter_trans[4] = {0.0f}; CL_DTYPE4 input[5] = {0.0f}; - int filter_h_val0 = item_ch_id * 4 * 3; - int filter_h_val1 = filter_h_val0 + 3; - int filter_h_val2 = filter_h_val1 + 3; - int filter_h_val3 = filter_h_val2 + 3; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { + int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0; const int in_w_base_id = mul24(ch, in_w); - int filter_w_val = ch * 3; + int filter_w_val = ch << 2; + int filter_h_val = mul24(item_ch_id, 9); for (int h = 0; h < 3; h++) { int in_h_val = select( - out_batch_id * in_h + in_h_id + h, + mad24(out_batch_id, in_h, (in_h_id + h)), -1, - (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || - out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); - + (mad24(out_batch_id, in_h, (in_h_id + h)) < mul24(out_batch_id, in_h) || + mad24(out_batch_id, in_h, (in_h_id + h)) >= mul24((out_batch_id + 1), in_h))); + for (int w = 0; w < 3; w++) { int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + (in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + (in_w_id2 + w < 0 | in_w_id2 + w >= in_w)); int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + (in_w_id3 + w < 0 | in_w_id3 + w >= in_w)); int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + (in_w_id4 + w < 0 | in_w_id4 + w >= in_w)); filter[0] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + (int2)(filter_w_val, filter_h_val)); filter[1] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + (int2)(filter_w_val + 1, filter_h_val)); filter[2] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + (int2)(filter_w_val + 2, filter_h_val)); filter[3] = READ_IMG_TYPE( CL_DTYPE_CHAR, filter_image, SAMPLER, - (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (CL_DTYPE4)(filter[0].x, - filter[1].x, - filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, - filter[1].y, - filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, - filter[1].z, - filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, - filter[1].w, - filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 + (int2)(filter_w_val + 3, filter_h_val++)); input[0] = READ_IMG_TYPE( CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); @@ -480,32 +430,32 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, input[4] = READ_IMG_TYPE( CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); + output[0] = mad(input[0].x, filter[0], output[0]); + output[1] = mad(input[1].x, filter[0], output[1]); + output[2] = mad(input[2].x, filter[0], output[2]); + output[3] = mad(input[3].x, filter[0], output[3]); + output[4] = mad(input[4].x, filter[0], output[4]); if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); + output[0] = mad(input[0].y, filter[1], output[0]); + output[1] = mad(input[1].y, filter[1], output[1]); + output[2] = mad(input[2].y, filter[1], output[2]); + output[3] = mad(input[3].y, filter[1], output[3]); + output[4] = mad(input[4].y, filter[1], output[4]); } if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); + output[0] = mad(input[0].z, filter[2], output[0]); + output[1] = mad(input[1].z, filter[2], output[1]); + output[2] = mad(input[2].z, filter[2], output[2]); + output[3] = mad(input[3].z, filter[2], output[3]); + output[4] = mad(input[4].z, filter[2], output[4]); } if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); + output[0] = mad(input[0].w, filter[3], output[0]); + output[1] = mad(input[1].w, filter[3], output[1]); + output[2] = mad(input[2].w, filter[3], output[2]); + output[3] = mad(input[3].w, filter[3], output[3]); + output[4] = mad(input[4].w, filter[3], output[4]); } } } diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index b0b09e5cf71..bb84935b119 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -172,22 +172,41 @@ void ConvImageCompute::PrepareForRun() { input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); impl_ = &ConvImageCompute::Conv2d3x3opt; + + CLImageConverterNBlock converter; + const DDim& filter_image_dims = + converter.InitImageDimInfoWith(filter_dims); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize( + {1, filter_image_w_, filter_image_h_, 4}); + auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_); + + converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); + MUTABLE_DATA_GPU(filter_gpu_image_, + filter_image_w_, + filter_image_h_, + filter_image_data); } else { // groups_ > 1 kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); impl_ = &ConvImageCompute::Conv2d3x3; - } - CLImageConverterFolder converter; - const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - filter_image_h_ = filter_image_dims[1]; - filter_image_w_ = filter_image_dims[0]; - tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_); - - converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); - MUTABLE_DATA_GPU( - filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data); + CLImageConverterFolder converter; + const DDim& filter_image_dims = + converter.InitImageDimInfoWith(filter_dims); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize( + {1, filter_image_w_, filter_image_h_, 4}); + auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_); + + converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); + MUTABLE_DATA_GPU(filter_gpu_image_, + filter_image_w_, + filter_image_h_, + filter_image_data); + } } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { CHECK(pad_equal && stride_equal && dilation_equal); #define CONV_5x5_OPT From bd653eca342f1d7a67aa2e455a1d602c256aeed2 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Fri, 26 Mar 2021 06:57:29 +0000 Subject: [PATCH 2/2] test=develop