Skip to content

Commit

Permalink
Detect ARM CPU features for host target and in runtime (#8298)
Browse files Browse the repository at this point in the history
Adds feature detection for ARM CPUs to the runtime library and to
the host target feature computation. Supports Windows, macOS,
Linux, iOS, and Android.

Also fix bug in Type::max() and Type::min() for float16.

Fixes #4727
Fixes #6106
Fixes #7901
Fixes #7979
Fixes #8340
  • Loading branch information
alexreinking authored and steven-johnson committed Jul 15, 2024
1 parent 41bc134 commit 23ce3eb
Show file tree
Hide file tree
Showing 14 changed files with 422 additions and 42 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,8 @@ RUNTIME_CPP_COMPONENTS = \
hexagon_dma_pool \
hexagon_host \
ios_io \
linux_aarch64_cpu_features \
linux_arm_cpu_features \
linux_clock \
linux_host_cpu_count \
linux_yield \
Expand All @@ -839,6 +841,8 @@ RUNTIME_CPP_COMPONENTS = \
msan \
msan_stubs \
opencl \
osx_aarch64_cpu_features \
osx_arm_cpu_features \
osx_clock \
osx_get_symbol \
osx_host_cpu_count \
Expand Down Expand Up @@ -873,6 +877,7 @@ RUNTIME_CPP_COMPONENTS = \
wasm_cpu_features \
webgpu_dawn \
webgpu_emscripten \
windows_aarch64_cpu_features_arm \
windows_clock \
windows_cuda \
windows_d3d12compute_arm \
Expand Down
64 changes: 52 additions & 12 deletions src/LLVM_Runtime_Linker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,31 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
return std::unique_ptr<llvm::Module>(); \
}

// Expands to a statement that returns the `bits`-bit initmod for `mod`:
// the `_debug` flavor when `debug` is true, the plain flavor otherwise.
// `debug` and `context` must be in scope where the macro is expanded.
#define DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, bits)                 \
    do {                                                           \
        return debug ? get_initmod_##mod##_##bits##_debug(context) \
                     : get_initmod_##mod##_##bits(context);        \
    } while (0)

#define DECLARE_CPP_INITMOD_LOOKUP(mod) \
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
if (bits_64) { \
if (debug) { \
return get_initmod_##mod##_64_debug(context); \
} else { \
return get_initmod_##mod##_64(context); \
} \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
} else { \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 32); \
} \
}

#define DECLARE_CPP_INITMOD_LOOKUP_64(mod) \
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
if (bits_64) { \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
} else { \
if (debug) { \
return get_initmod_##mod##_32_debug(context); \
} else { \
return get_initmod_##mod##_32(context); \
} \
internal_error << "No support for 32-bit initmod: " #mod; \
return nullptr; /* appease warnings */ \
} \
}

Expand All @@ -70,6 +81,11 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP(mod)

// Declares the mod##_64_debug and mod##_64 initmods and then defines
// get_initmod_<mod> through DECLARE_CPP_INITMOD_LOOKUP_64. No 32-bit
// initmods are declared by this macro.
#define DECLARE_CPP_INITMOD_64(mod) \
DECLARE_INITMOD(mod##_64_debug) \
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP_64(mod)

#define DECLARE_LL_INITMOD(mod) \
DECLARE_INITMOD(mod##_ll)

Expand Down Expand Up @@ -183,18 +199,28 @@ DECLARE_NO_INITMOD(metal_objc_x86)
DECLARE_LL_INITMOD(arm)
DECLARE_LL_INITMOD(arm_no_neon)
DECLARE_CPP_INITMOD(arm_cpu_features)
DECLARE_CPP_INITMOD(linux_arm_cpu_features)
DECLARE_CPP_INITMOD(osx_arm_cpu_features)
#else
DECLARE_NO_INITMOD(arm)
DECLARE_NO_INITMOD(arm_no_neon)
DECLARE_NO_INITMOD(arm_cpu_features)
DECLARE_NO_INITMOD(linux_arm_cpu_features)
DECLARE_NO_INITMOD(osx_arm_cpu_features)
#endif // WITH_ARM

#ifdef WITH_AARCH64
DECLARE_LL_INITMOD(aarch64)
DECLARE_CPP_INITMOD(aarch64_cpu_features)
DECLARE_CPP_INITMOD(linux_aarch64_cpu_features)
DECLARE_CPP_INITMOD(osx_aarch64_cpu_features)
DECLARE_CPP_INITMOD_64(windows_aarch64_cpu_features_arm)
#else
DECLARE_NO_INITMOD(aarch64)
DECLARE_NO_INITMOD(aarch64_cpu_features)
DECLARE_NO_INITMOD(linux_aarch64_cpu_features)
DECLARE_NO_INITMOD(osx_aarch64_cpu_features)
DECLARE_NO_INITMOD(windows_aarch64_cpu_features_arm)
#endif // WITH_AARCH64

#ifdef WITH_NVPTX
Expand Down Expand Up @@ -1206,9 +1232,23 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
}
if (t.arch == Target::ARM) {
if (t.bits == 64) {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::Windows) {
modules.push_back(get_initmod_windows_aarch64_cpu_features_arm(c, bits_64, debug));
} else {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
}
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_arm_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_arm_cpu_features(c, bits_64, debug));
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
}
}
}
if (t.arch == Target::POWERPC) {
Expand Down
122 changes: 115 additions & 7 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,50 @@
#endif

#ifdef _MSC_VER
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <intrin.h>
#include <windows.h>
#endif // _MSC_VER

#ifdef __APPLE__
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif

#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_ASIMDHP
#define HWCAP_ASIMDHP 0
#endif
#ifndef HWCAP_ASIMDDP
#define HWCAP_ASIMDDP 0
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE 0
#endif
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 0
#endif
#endif

namespace Halide {

using std::string;
using std::vector;

namespace {

#ifdef _MSC_VER
static void cpuid(int info[4], int infoType, int extra) {
#if defined(_M_IX86) || defined(_M_AMD64)

void cpuid(int info[4], int infoType, int extra) {
__cpuidex(info, infoType, extra);
}
#else

#if defined(__x86_64__) || defined(__i386__)
#elif defined(__x86_64__) || defined(__i386__)

// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)

Expand All @@ -47,10 +74,10 @@ void cpuid(int info[4], int infoType, int extra) {
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
: "0"(infoType), "2"(extra));
}
#endif

#endif

#if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER)
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

enum class VendorSignatures {
Unknown,
Expand Down Expand Up @@ -143,6 +170,29 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s

#endif // defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

#ifdef __APPLE__

/** Read the sysctl entry called `name` into a value of type T.
 *
 * Returns std::nullopt when sysctlbyname reports failure (non-zero
 * return code); otherwise returns the value that was read. */
template<typename T>
std::optional<T> getsysctl(const char *name) {
    // Value-initialize the buffer so that no indeterminate bytes are
    // returned if the call succeeds but writes fewer than sizeof(T) bytes.
    T value{};
    size_t size = sizeof(value);
    if (sysctlbyname(name, &value, &size, nullptr, 0) != 0) {
        return std::nullopt;
    }
    return value;
}

/** True when the integer sysctl `name` can be read and is non-zero;
 * false when it is zero or the lookup yields no value. */
bool sysctl_is_set(const char *name) {
    const std::optional<int> flag = getsysctl<int>(name);
    return flag.has_value() && *flag != 0;
}

bool is_armv7s() {
return getsysctl<cpu_type_t>("hw.cputype") == CPU_TYPE_ARM &&
getsysctl<cpu_subtype_t>("hw.cpusubtype") == CPU_SUBTYPE_ARM_V7S;
}

#endif // __APPLE__

Target calculate_host_target() {
Target::OS os = Target::OSUnknown;
#ifdef __linux__
Expand All @@ -164,8 +214,66 @@ Target calculate_host_target() {
#if __riscv
Target::Arch arch = Target::RISCV;
#else
#if defined(__arm__) || defined(__aarch64__)
#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
Target::Arch arch = Target::ARM;

#ifdef __APPLE__
if (is_armv7s()) {
initial_features.push_back(Target::ARMv7s);
}

if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
initial_features.push_back(Target::ARMDotProd);
}

if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
initial_features.push_back(Target::ARMFp16);
}
#endif

#ifdef __linux__
unsigned long hwcaps = getauxval(AT_HWCAP);
unsigned long hwcaps2 = getauxval(AT_HWCAP2);

if (hwcaps & HWCAP_ASIMDDP) {
initial_features.push_back(Target::ARMDotProd);
}

if (hwcaps & HWCAP_ASIMDHP) {
initial_features.push_back(Target::ARMFp16);
}

if (hwcaps & HWCAP_SVE) {
initial_features.push_back(Target::SVE);
}

if (hwcaps2 & HWCAP2_SVE2) {
initial_features.push_back(Target::SVE2);
}
#endif

#ifdef _MSC_VER

// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)

// This is the strategy used by Google's cpuinfo library for
// detecting fp16 arithmetic support on Windows.
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMFp16);
}

if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMDotProd);
}

if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::SVE);
}

#endif

#else
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
Target::Arch arch = Target::POWERPC;
Expand Down
4 changes: 2 additions & 2 deletions src/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Halide::Expr Type::max() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, 65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand All @@ -59,7 +59,7 @@ Halide::Expr Type::min() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, -65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_negative_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, -std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand Down
9 changes: 8 additions & 1 deletion src/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,14 @@ void run_with_large_stack(const std::function<void()> &action) {
// Portable bit-counting methods
int popcount64(uint64_t x) {
#ifdef _MSC_VER
#if defined(_WIN64)
// Note: MSVC spells the ARM64EC macro _M_ARM64EC (no underscore before EC).
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
    // Portable fallback: each iteration clears the lowest set bit of x,
    // so the loop runs once per set bit.
    int popcnt = 0;
    while (x) {
        x &= x - 1;
        popcnt++;
    }
    return popcnt;
#elif defined(_WIN64)
return __popcnt64(x);
#else
return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff));
Expand Down
5 changes: 5 additions & 0 deletions src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ set(RUNTIME_CPP
hexagon_dma_pool
hexagon_host
ios_io
linux_aarch64_cpu_features
linux_arm_cpu_features
linux_clock
linux_host_cpu_count
linux_yield
Expand All @@ -43,6 +45,8 @@ set(RUNTIME_CPP
msan
msan_stubs
opencl
osx_aarch64_cpu_features
osx_arm_cpu_features
osx_clock
osx_get_symbol
osx_host_cpu_count
Expand Down Expand Up @@ -80,6 +84,7 @@ set(RUNTIME_CPP
# webgpu
webgpu_dawn
webgpu_emscripten
windows_aarch64_cpu_features_arm
windows_clock
windows_cuda
windows_d3d12compute_arm
Expand Down
Loading

0 comments on commit 23ce3eb

Please sign in to comment.