Skip to content
This repository has been archived by the owner on Oct 29, 2024. It is now read-only.

Commit

Permalink
shader_jit_a64: Compact host executable memory (#230)
Browse files Browse the repository at this point in the history
* common/aarch64: Allow generic code generator types

Use the templated `BasicCodeGenerator` type rather than the specialized
`CodeGenerator` type.
Allows `VectorCodeGenerator` to work with these functions.

* common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction`

`VectorCodeGenerator` will always do far-calls since we cannot resolve any absolute addresses here.

* shader_jit_a64: Implement position-independent VectorCodeGenerator

Generates position-independent assembly so that code can be emitted into
a resizable vector and then copied into executable memory. This allows
for compact, exactly-sized memory allocations instead of a statically
defined worst-case allocation for every shader.

`VectorCodeGenerator` needs to generate position-independent code
rather than use absolute addresses. All far function calls made through
`VectorCodeGenerator` are assumed to use absolute addresses rather than
a relative `BL` branch, which may no longer be valid after the code is relocated.
  • Loading branch information
Wunkolo authored Sep 1, 2024
1 parent 82faf2e commit 3e5bbac
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 41 deletions.
6 changes: 4 additions & 2 deletions src/common/aarch64/oaknut_abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra
return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)};
}

inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
template <typename Policy>
inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
std::size_t frame_size = 0) {
using namespace oaknut;
using namespace oaknut::util;
Expand Down Expand Up @@ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
}
}

inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs,
template <typename Policy>
inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs,
std::size_t frame_size = 0) {
using namespace oaknut;
using namespace oaknut::util;
Expand Down
10 changes: 10 additions & 0 deletions src/common/aarch64/oaknut_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) {
}
}

/// Emits a call to the function pointer `f` through an intermediate register.
/// Overload for `oaknut::VectorCodeGenerator`: the address of `f` is loaded
/// into X16 with MOVP2R and the call is made with BLR.
template <typename T>
inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) {
    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");

    // X16 (IP0) and X17 (IP1) are the standard veneer registers;
    // LR is also available as an intermediate register.
    // https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
    const auto veneer_reg = oaknut::util::X16;
    const void* const target = reinterpret_cast<const void*>(f);

    code.MOVP2R(veneer_reg, target);
    code.BLR(veneer_reg);
}

} // namespace Common::A64

#endif // CITRA_ARCH(arm64)
89 changes: 55 additions & 34 deletions src/video_core/shader/shader_jit_a64_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
swizzle_data = swizzle_data_;

// Reset flow control state
program = xptr<CompiledShader*>();
const std::uintptr_t program_offset = offset();
program_counter = 0;
loop_depth = 0;
instruction_labels.fill(Label());
Expand Down Expand Up @@ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
return_offsets.clear();
return_offsets.shrink_to_fit();

// Memory is ready to execute
protect();
invalidate_all();
// Copy to executable memory
const size_t code_size = code_vec.size() * sizeof(u32);

code_mem = std::make_unique<oaknut::CodeBlock>(code_size);
code_mem->unprotect();

program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) +
program_offset);

const std::size_t code_size = static_cast<std::size_t>(offset());
// Copy to executable memory
std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32));

// Memory is ready to execute
code_mem->protect();
code_mem->invalidate_all();

ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size);
// code_vec is no longer needed
code_vec.clear();
code_vec.shrink_to_fit();
}

JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) {
unprotect();
// Constructs the JIT with its VectorCodeGenerator base bound to the `code_vec`
// member, then immediately runs CompilePrelude().
// NOTE(review): `code_vec` is a data member, so it is constructed after the
// base-class initializer runs; this relies on the base only storing the
// reference during its own construction — confirm against oaknut.
JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {
CompilePrelude();
}

Expand All @@ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() {
// range. Coefficients for the minimax polynomial.
// f(x) computes approximately log2(x) / (x - 1).
// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)).
align(16);
const void* c0 = xptr<const void*>();
oaknut::Label c0;
// align(16);
l(c0);
dw(0x3d74552f);

align(16);
const void* c14 = xptr<const void*>();
// align(16);
oaknut::Label c14;
l(c14);
dw(0xbeee7397);
dw(0x3fbd96dd);
dw(0xc02153f6);
dw(0x4038d96c);

align(16);
const void* negative_infinity_vector = xptr<const void*>();
// align(16);
oaknut::Label negative_infinity_vector;
l(negative_infinity_vector);
dw(0xff800000);
dw(0xff800000);
dw(0xff800000);
Expand All @@ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() {

Label input_is_nan, input_is_zero, input_out_of_range;

align(16);
// align(16);
l(input_out_of_range);
B(Cond::EQ, input_is_zero);
MOVP2R(XSCRATCH0, default_qnan_vector);
ADR(XSCRATCH0, default_qnan_vector);
LDR(SRC1, XSCRATCH0);
RET();

l(input_is_zero);
MOVP2R(XSCRATCH0, negative_infinity_vector);
ADR(XSCRATCH0, negative_infinity_vector);
LDR(SRC1, XSCRATCH0);
RET();

align(16);
// align(16);
l(subroutine);

// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
Expand Down Expand Up @@ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() {
UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS());
// VSCRATCH1 now contains the exponent of the input.

MOVP2R(XSCRATCH0, c0);
ADR(XSCRATCH0, c0);
LDR(XSCRATCH0.toW(), XSCRATCH0);
MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW());

// Complete computation of polynomial
// Load C1,C2,C3,C4 into a single scratch register
const QReg C14 = SRC2;
MOVP2R(XSCRATCH0, c14);
ADR(XSCRATCH0, c14);
LDR(C14, XSCRATCH0);
FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS());
FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]);
Expand Down Expand Up @@ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() {
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
// result into the appropriate range.

align(16);
const void* input_max = xptr<const void*>();
// align(16);
Label input_max;
l(input_max);
dw(0x43010000);
const void* input_min = xptr<const void*>();
Label input_min;
l(input_min);
dw(0xc2fdffff);
const void* c0 = xptr<const void*>();
Label c0;
l(c0);
dw(0x3c5dbe69);
const void* half = xptr<const void*>();
Label half;
l(half);
dw(0x3f000000);
const void* c1 = xptr<const void*>();
Label c1;
l(c1);
dw(0x3d5509f9);
const void* c2 = xptr<const void*>();
Label c2;
l(c2);
dw(0x3e773cc5);
const void* c3 = xptr<const void*>();
Label c3;
l(c3);
dw(0x3f3168b3);
const void* c4 = xptr<const void*>();
Label c4;
l(c4);
dw(0x3f800016);

Label ret_label;

align(16);
// align(16);
l(subroutine);

// Handle edge cases
Expand All @@ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() {
// VSCRATCH0=2^round(input)
// SRC1=input-round(input) [-0.5, 0.5)
// Clamp to maximum range since we shift the value directly into the exponent.
MOVP2R(XSCRATCH0, input_max);
ADR(XSCRATCH0, input_max);
LDR(VSCRATCH0.toS(), XSCRATCH0);
FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());

MOVP2R(XSCRATCH0, input_min);
ADR(XSCRATCH0, input_min);
LDR(VSCRATCH0.toS(), XSCRATCH0);
FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS());

MOVP2R(XSCRATCH0, half);
ADR(XSCRATCH0, half);
LDR(VSCRATCH0.toS(), XSCRATCH0);
FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS());

Expand Down
10 changes: 5 additions & 5 deletions src/video_core/shader/shader_jit_a64_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,17 @@ struct ShaderUnit;

namespace Pica::Shader {

/// Memory allocated for each compiled shader
constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256;

/**
* This class implements the shader JIT compiler. It recompiles a Pica shader program into AArch64
* code that can be executed on the host machine directly.
*/
class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator {
class JitShader : public oaknut::VectorCodeGenerator {
public:
JitShader();

void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const {
program(&setup.uniforms, &state,
reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) +
reinterpret_cast<const std::byte*>(code_mem->ptr()) +
instruction_labels[offset].offset());
}

Expand Down Expand Up @@ -81,6 +78,9 @@ class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator {
void Compile_SETE(Instruction instr);

private:
std::vector<u32> code_vec;
std::unique_ptr<oaknut::CodeBlock> code_mem;

void Compile_Block(u32 end);
void Compile_NextInstr();

Expand Down

0 comments on commit 3e5bbac

Please sign in to comment.