Skip to content

Commit

Permalink
Re-use common_includes to propagate shared functions
Browse files Browse the repository at this point in the history
The packing definitions aren't implemented as ACLE intrinsics, nor is there a simple way to convince a C compiler to generate them.
  • Loading branch information
Mousius committed Dec 28, 2022
1 parent 4d330af commit ab667a5
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
23 changes: 21 additions & 2 deletions python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
#include <tvm/runtime/crt/error_codes.h>
#ifndef ARM_CPU_ROR_EXISTS
#define ARM_CPU_ROR_EXISTS
#ifndef ARM_CPU_INTRINSICS_EXIST
#define ARM_CPU_INTRINSICS_EXIST
__attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2)
{
op2 %= 32U;
Expand All @@ -40,6 +40,25 @@
}
return (op1 >> op2) | (op1 << (32U - op2));
}
#define __pkhbt(ARG1,ARG2,ARG3) \
__extension__ \
({ \
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
__asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
__RES; \
})
#define __pkhtb(ARG1,ARG2,ARG3) \
__extension__ \
({ \
uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
if (ARG3 == 0) \
__asm("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \
else \
__asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
__RES; \
})
#endif
"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import textwrap

from tvm import te, tir
from .common import num_simd_lanes_per_word
from .common import num_simd_lanes_per_word, common_includes


def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix):
Expand Down Expand Up @@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str:
def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
return textwrap.dedent(
(
f"""
#include <stdint.h>
#include <arm_acle.h>
common_includes
+ f"""
// __SXTB16(_ROR(X, Y)) is combined into one assembly instruction
#define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \
Expand Down Expand Up @@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix):
return textwrap.dedent(
(
f"""
common_includes
+ f"""
#include <stdint.h>
/* We do four channels at once to get this speed boost. */
Expand Down

0 comments on commit ab667a5

Please sign in to comment.