Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect ARM CPU features for host target and in runtime (Backport to release/18.x) #8343

Merged
merged 1 commit into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,8 @@ RUNTIME_CPP_COMPONENTS = \
hexagon_dma_pool \
hexagon_host \
ios_io \
linux_aarch64_cpu_features \
linux_arm_cpu_features \
linux_clock \
linux_host_cpu_count \
linux_yield \
Expand All @@ -839,6 +841,8 @@ RUNTIME_CPP_COMPONENTS = \
msan \
msan_stubs \
opencl \
osx_aarch64_cpu_features \
osx_arm_cpu_features \
osx_clock \
osx_get_symbol \
osx_host_cpu_count \
Expand Down Expand Up @@ -873,6 +877,7 @@ RUNTIME_CPP_COMPONENTS = \
wasm_cpu_features \
webgpu_dawn \
webgpu_emscripten \
windows_aarch64_cpu_features_arm \
windows_clock \
windows_cuda \
windows_d3d12compute_arm \
Expand Down
64 changes: 52 additions & 12 deletions src/LLVM_Runtime_Linker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,31 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
return std::unique_ptr<llvm::Module>(); \
}

// Expands to a statement that returns the initmod getter result for `mod`
// at the given bit width, picking the `_debug` getter when `debug` is true.
// Relies on `debug` and `context` being in scope at the expansion site.
#define DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, bits)                 \
    do {                                                           \
        return debug ? get_initmod_##mod##_##bits##_debug(context) \
                     : get_initmod_##mod##_##bits(context);        \
    } while (0)

#define DECLARE_CPP_INITMOD_LOOKUP(mod) \
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
if (bits_64) { \
if (debug) { \
return get_initmod_##mod##_64_debug(context); \
} else { \
return get_initmod_##mod##_64(context); \
} \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
} else { \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 32); \
} \
}

#define DECLARE_CPP_INITMOD_LOOKUP_64(mod) \
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
if (bits_64) { \
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
} else { \
if (debug) { \
return get_initmod_##mod##_32_debug(context); \
} else { \
return get_initmod_##mod##_32(context); \
} \
internal_error << "No support for 32-bit initmod: " #mod; \
return nullptr; /* appease warnings */ \
} \
}

Expand All @@ -70,6 +81,11 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP(mod)

// 64-bit-only variant of DECLARE_CPP_INITMOD: declares the debug and
// non-debug 64-bit initmods for `mod`, then emits the lookup through
// DECLARE_CPP_INITMOD_LOOKUP_64. No `_32` initmods are declared here.
#define DECLARE_CPP_INITMOD_64(mod) \
DECLARE_INITMOD(mod##_64_debug) \
DECLARE_INITMOD(mod##_64) \
DECLARE_CPP_INITMOD_LOOKUP_64(mod)

// Declares the initmod for the `_ll` flavour of `mod` by forwarding
// `mod##_ll` to DECLARE_INITMOD.
#define DECLARE_LL_INITMOD(mod) \
DECLARE_INITMOD(mod##_ll)

Expand Down Expand Up @@ -183,18 +199,28 @@ DECLARE_NO_INITMOD(metal_objc_x86)
DECLARE_LL_INITMOD(arm)
DECLARE_LL_INITMOD(arm_no_neon)
DECLARE_CPP_INITMOD(arm_cpu_features)
DECLARE_CPP_INITMOD(linux_arm_cpu_features)
DECLARE_CPP_INITMOD(osx_arm_cpu_features)
#else
DECLARE_NO_INITMOD(arm)
DECLARE_NO_INITMOD(arm_no_neon)
DECLARE_NO_INITMOD(arm_cpu_features)
DECLARE_NO_INITMOD(linux_arm_cpu_features)
DECLARE_NO_INITMOD(osx_arm_cpu_features)
#endif // WITH_ARM

// AArch64 runtime modules. When WITH_AARCH64 is defined the initmods are
// declared for real; otherwise every name below is declared through
// DECLARE_NO_INITMOD instead.
#ifdef WITH_AARCH64
DECLARE_LL_INITMOD(aarch64)
DECLARE_CPP_INITMOD(aarch64_cpu_features)
DECLARE_CPP_INITMOD(linux_aarch64_cpu_features)
DECLARE_CPP_INITMOD(osx_aarch64_cpu_features)
// Declared with the 64-bit-only macro, unlike the entries above.
DECLARE_CPP_INITMOD_64(windows_aarch64_cpu_features_arm)
#else
DECLARE_NO_INITMOD(aarch64)
DECLARE_NO_INITMOD(aarch64_cpu_features)
DECLARE_NO_INITMOD(linux_aarch64_cpu_features)
DECLARE_NO_INITMOD(osx_aarch64_cpu_features)
DECLARE_NO_INITMOD(windows_aarch64_cpu_features_arm)
#endif // WITH_AARCH64

#ifdef WITH_NVPTX
Expand Down Expand Up @@ -1206,9 +1232,23 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
}
if (t.arch == Target::ARM) {
if (t.bits == 64) {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_aarch64_cpu_features(c, bits_64, debug));
} else if (t.os == Target::Windows) {
modules.push_back(get_initmod_windows_aarch64_cpu_features_arm(c, bits_64, debug));
} else {
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
}
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
if (t.os == Target::Android || t.os == Target::Linux) {
modules.push_back(get_initmod_linux_arm_cpu_features(c, bits_64, debug));
} else if (t.os == Target::OSX || t.os == Target::IOS) {
modules.push_back(get_initmod_osx_arm_cpu_features(c, bits_64, debug));
} else {
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
}
}
}
if (t.arch == Target::POWERPC) {
Expand Down
122 changes: 115 additions & 7 deletions src/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,50 @@
#endif

#ifdef _MSC_VER
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <intrin.h>
#include <windows.h>
#endif // _MSC_VER

#ifdef __APPLE__
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif

#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_ASIMDHP
#define HWCAP_ASIMDHP 0
#endif
#ifndef HWCAP_ASIMDDP
#define HWCAP_ASIMDDP 0
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE 0
#endif
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 0
#endif
#endif

namespace Halide {

using std::string;
using std::vector;

namespace {

#ifdef _MSC_VER
static void cpuid(int info[4], int infoType, int extra) {
#if defined(_M_IX86) || defined(_M_AMD64)

void cpuid(int info[4], int infoType, int extra) {
__cpuidex(info, infoType, extra);
}
#else

#if defined(__x86_64__) || defined(__i386__)
#elif defined(__x86_64__) || defined(__i386__)

// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)

Expand All @@ -47,10 +74,10 @@ void cpuid(int info[4], int infoType, int extra) {
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
: "0"(infoType), "2"(extra));
}
#endif

#endif

#if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER)
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

enum class VendorSignatures {
Unknown,
Expand Down Expand Up @@ -143,6 +170,29 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s

#endif // defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)

#ifdef __APPLE__

// Reads the sysctl called `name` into a value of type T.
// Returns std::nullopt when sysctlbyname() reports failure (non-zero result),
// otherwise the value that was read.
template<typename T>
std::optional<T> getsysctl(const char *name) {
    // Value-initialize the destination: if the call succeeds but writes fewer
    // than sizeof(T) bytes, the remaining bytes are zero rather than
    // indeterminate.
    T value{};
    size_t size = sizeof(value);
    if (sysctlbyname(name, &value, &size, nullptr, 0) != 0) {
        return std::nullopt;
    }
    return value;
}

// True when the named sysctl can be read as an int and that int is non-zero.
// A sysctl that cannot be read is treated as not set.
bool sysctl_is_set(const char *name) {
    const std::optional<int> flag = getsysctl<int>(name);
    return flag.has_value() && *flag != 0;
}

// Reports whether the host identifies as an ARMv7s CPU: "hw.cputype" must
// read back as CPU_TYPE_ARM and "hw.cpusubtype" as CPU_SUBTYPE_ARM_V7S.
bool is_armv7s() {
    // The subtype is only queried once the CPU type has matched.
    if (getsysctl<cpu_type_t>("hw.cputype") != CPU_TYPE_ARM) {
        return false;
    }
    return getsysctl<cpu_subtype_t>("hw.cpusubtype") == CPU_SUBTYPE_ARM_V7S;
}

#endif // __APPLE__

Target calculate_host_target() {
Target::OS os = Target::OSUnknown;
#ifdef __linux__
Expand All @@ -164,8 +214,66 @@ Target calculate_host_target() {
#if __riscv
Target::Arch arch = Target::RISCV;
#else
#if defined(__arm__) || defined(__aarch64__)
#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
Target::Arch arch = Target::ARM;

#ifdef __APPLE__
if (is_armv7s()) {
initial_features.push_back(Target::ARMv7s);
}

if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
initial_features.push_back(Target::ARMDotProd);
}

if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
initial_features.push_back(Target::ARMFp16);
}
#endif

#ifdef __linux__
unsigned long hwcaps = getauxval(AT_HWCAP);
unsigned long hwcaps2 = getauxval(AT_HWCAP2);

if (hwcaps & HWCAP_ASIMDDP) {
initial_features.push_back(Target::ARMDotProd);
}

if (hwcaps & HWCAP_ASIMDHP) {
initial_features.push_back(Target::ARMFp16);
}

if (hwcaps & HWCAP_SVE) {
initial_features.push_back(Target::SVE);
}

if (hwcaps2 & HWCAP2_SVE2) {
initial_features.push_back(Target::SVE2);
}
#endif

#ifdef _MSC_VER

// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)

// This is the strategy used by Google's cpuinfo library for
// detecting fp16 arithmetic support on Windows.
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMFp16);
}

if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::ARMDotProd);
}

if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
initial_features.push_back(Target::SVE);
}

#endif

#else
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
Target::Arch arch = Target::POWERPC;
Expand Down
4 changes: 2 additions & 2 deletions src/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Halide::Expr Type::max() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, 65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand All @@ -59,7 +59,7 @@ Halide::Expr Type::min() const {
} else {
internal_assert(is_float());
if (bits() == 16) {
return Internal::FloatImm::make(*this, -65504.0);
return Internal::FloatImm::make(*this, (double)float16_t::make_negative_infinity());
} else if (bits() == 32) {
return Internal::FloatImm::make(*this, -std::numeric_limits<float>::infinity());
} else if (bits() == 64) {
Expand Down
9 changes: 8 additions & 1 deletion src/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,14 @@ void run_with_large_stack(const std::function<void()> &action) {
// Portable bit-counting methods
int popcount64(uint64_t x) {
#ifdef _MSC_VER
#if defined(_WIN64)
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64_EC)
int popcnt = 0;
while (x) {
x &= x - 1;
popcnt++;
}
return popcnt;
#elif defined(_WIN64)
return __popcnt64(x);
#else
return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff));
Expand Down
5 changes: 5 additions & 0 deletions src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ set(RUNTIME_CPP
hexagon_dma_pool
hexagon_host
ios_io
linux_aarch64_cpu_features
linux_arm_cpu_features
linux_clock
linux_host_cpu_count
linux_yield
Expand All @@ -43,6 +45,8 @@ set(RUNTIME_CPP
msan
msan_stubs
opencl
osx_aarch64_cpu_features
osx_arm_cpu_features
osx_clock
osx_get_symbol
osx_host_cpu_count
Expand Down Expand Up @@ -80,6 +84,7 @@ set(RUNTIME_CPP
# webgpu
webgpu_dawn
webgpu_emscripten
windows_aarch64_cpu_features_arm
windows_clock
windows_cuda
windows_d3d12compute_arm
Expand Down
Loading
Loading