From ab667a575cac18f01d987ce1380d5e04b43ee26d Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 25 Nov 2022 21:07:32 +0000 Subject: [PATCH] Re-use common_includes to propagate shared functions The packing definitions aren't implemented as ACLE intrinsics nor is there a simple way to convince a C compiler to generate them. --- .../mprofile/dsp/micro_kernel/common.py | 23 +++++++++++++++++-- .../micro_kernel/multi_channel_convolve.py | 11 ++++----- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py index 014f0b44b199..e89bf7c1b4fc 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py @@ -29,8 +29,8 @@ #include -#ifndef ARM_CPU_ROR_EXISTS -#define ARM_CPU_ROR_EXISTS +#ifndef ARM_CPU_INTRINSICS_EXIST +#define ARM_CPU_INTRINSICS_EXIST __attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2) { op2 %= 32U; @@ -40,6 +40,25 @@ } return (op1 >> op2) | (op1 << (32U - op2)); } + +#define __pkhbt(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) + +#define __pkhtb(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __asm("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \ + else \ + __asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) #endif """ diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py index 91d76b7bd5e5..25588964eeaf 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py @@ -23,7 +23,7 @@ import textwrap from tvm import te, tir -from .common import num_simd_lanes_per_word +from .common import num_simd_lanes_per_word, common_includes def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix): @@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str: def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix): return textwrap.dedent( ( - f""" - #include - #include - + common_includes + + f""" // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \ @@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix): return textwrap.dedent( ( - f""" + common_includes + + f""" #include /* We do four channels at once to get this speed boost. */