From ab667a575cac18f01d987ce1380d5e04b43ee26d Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Fri, 25 Nov 2022 21:07:32 +0000
Subject: [PATCH] Re-use common_includes to propagate shared functions

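The packing definitions (`__pkhbt` and `__pkhtb`) aren't implemented as ACLE intrinsics, nor is there a simple way to convince a C compiler to generate them, so they are defined with inline assembly in the shared common includes and re-used by the multi-channel convolve kernels.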
---
 .../mprofile/dsp/micro_kernel/common.py       | 23 +++++++++++++++++--
 .../micro_kernel/multi_channel_convolve.py    | 11 ++++-----
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
index 014f0b44b199..e89bf7c1b4fc 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
@@ -29,8 +29,8 @@
 #include <tvm/runtime/crt/error_codes.h>
 
 
-#ifndef ARM_CPU_ROR_EXISTS
-#define ARM_CPU_ROR_EXISTS
+#ifndef ARM_CPU_INTRINSICS_EXIST
+#define ARM_CPU_INTRINSICS_EXIST
 __attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
 {
   op2 %= 32U;
@@ -40,6 +40,25 @@
   }
   return (op1 >> op2) | (op1 << (32U - op2));
 }
+
+#define __pkhbt(ARG1,ARG2,ARG3) \
+__extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  __asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+
+#define __pkhtb(ARG1,ARG2,ARG3) \
+__extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  if (ARG3 == 0) \
+    __asm("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
+  else \
+    __asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
 #endif
 """
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
index 91d76b7bd5e5..25588964eeaf 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
@@ -23,7 +23,7 @@
 import textwrap
 
 from tvm import te, tir
-from .common import num_simd_lanes_per_word
+from .common import num_simd_lanes_per_word, common_includes
 
 
 def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
@@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str:
 def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
     return textwrap.dedent(
         (
-            f"""
-        #include <stdint.h>
-        #include <arm_acle.h>
-
+            common_includes
+            + f"""
         // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
 
         #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
@@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
 def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
     return textwrap.dedent(
         (
-            f"""
+            common_includes
+            + f"""
         #include <stdint.h>
 
         /* We do four channels at once to get this speed boost. */