Merge pull request #4095 from alyssarosenzweig/opt/jit-time
speed up the JIT
Sonicadvance1 authored Oct 4, 2024
2 parents 9716fc7 + 44484a7 · commit e82d2c7
Showing 6 changed files with 120 additions and 201 deletions.
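
The change itself is mechanical: call sites in the opcode dispatcher that built small constants with _Constant now use _InlineConstant for values that can travel as an immediate on the consuming instruction. The toy builder below is a minimal sketch of why that can save translation time, assuming _InlineConstant spares the JIT from allocating a separate constant node and later proving it foldable; the types and helpers are hypothetical and are not the FEXCore IR API.

// Toy IR builder, *not* the FEXCore API: it only illustrates the difference
// between materializing a constant as its own node (_Constant-style) and
// attaching the value to the consumer as an immediate (_InlineConstant-style).
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>
#include <vector>

struct Node {
  enum class Kind { Constant, Add } kind;
  uint64_t value = 0;                  // payload when kind == Constant
  Node* lhs = nullptr;
  Node* rhs = nullptr;                 // node operand, or...
  std::optional<uint64_t> rhs_imm;     // ...an inline immediate instead
};

struct Builder {
  std::vector<std::unique_ptr<Node>> nodes;

  Node* emit(Node n) {
    nodes.push_back(std::make_unique<Node>(std::move(n)));
    return nodes.back().get();
  }

  // Materialized constant: occupies an IR slot and, absent a folding pass,
  // a register at code-generation time.
  Node* Constant(uint64_t v) { return emit({Node::Kind::Constant, v}); }

  // Consumer with two node operands.
  Node* Add(Node* a, Node* b) { return emit({Node::Kind::Add, 0, a, b, std::nullopt}); }

  // Consumer carrying the constant inline: no extra node, nothing to fold.
  Node* AddImm(Node* a, uint64_t imm) { return emit({Node::Kind::Add, 0, a, nullptr, imm}); }
};

int main() {
  Builder Old, New;

  Node* x0 = Old.Constant(42);         // stand-in for a loaded GPR
  Old.Add(x0, Old.Constant(1));        // old pattern: constant node + add

  Node* x1 = New.Constant(42);
  New.AddImm(x1, 1);                   // new pattern: add with immediate

  std::cout << "old pattern: " << Old.nodes.size() << " nodes, "
            << "new pattern: " << New.nodes.size() << " nodes\n";
}

In the real IR an inline constant presumably still exists as an op that the backend encodes directly; either way, emitting it up front at the call site means less work per instruction for the constant-handling passes, which matches the commit's stated goal of speeding up the JIT.
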
45 changes: 20 additions & 25 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -348,7 +348,7 @@ void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) {
void OpDispatchBuilder::SALCOp(OpcodeArgs) {
CalculateDeferredFlags();

- auto Result = NZCVSelect(OpSize::i32Bit, {COND_UGE} /* CF = 1 */, _Constant(0xffffffff), _Constant(0));
+ auto Result = NZCVSelect(OpSize::i32Bit, {COND_UGE} /* CF = 1 */, _InlineConstant(0xffffffff), _InlineConstant(0));

StoreResult(GPRClass, Op, Result, -1);
}
@@ -493,7 +493,7 @@ void OpDispatchBuilder::POPAOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size);

// Skip loading RSP because it'll be correct at the end
- SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _Constant(Size)));
+ SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(Size)));

StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size);
StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size);
@@ -611,7 +611,7 @@ Ref OpDispatchBuilder::SelectPF(bool Invert, IR::OpSize ResultSize, Ref TrueValu

// Because we're only clobbering NZCV internally, we ignore all carry flag
// shenanigans and just use the raw test and raw select.
- _TestNZ(OpSize::i32Bit, Cmp, _Constant(1));
+ _TestNZ(OpSize::i32Bit, Cmp, _InlineConstant(1));
return _NZCVSelect(ResultSize, {COND_NEQ}, TrueValue, FalseValue);
}

@@ -803,11 +803,6 @@ void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) {
uint8_t JcxGPRSize = CTX->GetGPRSize();
JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize;

- IRPair<IROp_Constant> TakeBranch;
- IRPair<IROp_Constant> DoNotTakeBranch;
- TakeBranch = _Constant(1);
- DoNotTakeBranch = _Constant(0);
-
uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal();

Ref CondReg = LoadGPRRegister(X86State::REG_RCX, JcxGPRSize);
@@ -876,15 +871,15 @@ void OpDispatchBuilder::LoopOp(OpcodeArgs) {
uint64_t Target = Op->PC + Op->InstSize + Op->Src[1].Literal();

Ref CondReg = LoadSource_WithOpSize(GPRClass, Op, Op->Src[0], SrcSize, Op->Flags);
- CondReg = _Sub(OpSize, CondReg, _Constant(SrcSize * 8, 1));
+ CondReg = _Sub(OpSize, CondReg, _InlineConstant(1));
StoreResult(GPRClass, Op, Op->Src[0], CondReg, -1);

// If LOOPE then jumps to target if RCX != 0 && ZF == 1
// If LOOPNE then jumps to target if RCX != 0 && ZF == 0
//
// To handle efficiently, smash RCX to zero if ZF is wrong (1 csel).
if (CheckZF) {
- CondReg = NZCVSelect(OpSize, {ZFTrue ? COND_EQ : COND_NEQ}, CondReg, _Constant(0));
+ CondReg = NZCVSelect(OpSize, {ZFTrue ? COND_EQ : COND_NEQ}, CondReg, _InlineConstant(0));
}

CalculateDeferredFlags();
@@ -1154,7 +1149,7 @@ void OpDispatchBuilder::SAHFOp(OpcodeArgs) {
Src = _Andn(OpSize::i64Bit, Src, _Constant(0b101000));

// Set the bit that is always set here
- Src = _Or(OpSize::i64Bit, Src, _Constant(0b10));
+ Src = _Or(OpSize::i64Bit, Src, _InlineConstant(0b10));

// Store the lower 8 bits in to RFLAGS
SetPackedRFLAG(true, Src);
@@ -1437,9 +1432,9 @@ void OpDispatchBuilder::SHLDOp(OpcodeArgs) {

// x86 masks the shift by 0x3F or 0x1F depending on size of op.
if (Size == 64) {
- Shift = _And(OpSize::i64Bit, Shift, _Constant(0x3F));
+ Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F));
} else {
- Shift = _And(OpSize::i64Bit, Shift, _Constant(0x1F));
+ Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F));
}

// a64 masks the bottom bits, so if we're using a native 32/64-bit shift, we
@@ -1510,9 +1505,9 @@ void OpDispatchBuilder::SHRDOp(OpcodeArgs) {

// x86 masks the shift by 0x3F or 0x1F depending on size of op
if (Size == 64) {
- Shift = _And(OpSize::i64Bit, Shift, _Constant(0x3F));
+ Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F));
} else {
- Shift = _And(OpSize::i64Bit, Shift, _Constant(0x1F));
+ Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F));
}

auto ShiftLeft = _Sub(OpSize::i64Bit, _Constant(Size), Shift);
@@ -1608,7 +1603,7 @@ void OpDispatchBuilder::RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool I
Src = _Constant(UnmaskedConst & Mask);
} else {
UnmaskedSrc = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
- Src = _And(OpSize::i64Bit, UnmaskedSrc, _Constant(Mask));
+ Src = _And(OpSize::i64Bit, UnmaskedSrc, _InlineConstant(Mask));
}

// We fill the upper bits so we allow garbage on load.
@@ -1643,7 +1638,7 @@ void OpDispatchBuilder::RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool I

// We deferred the masking for 8-bit to the flag section, do it here.
if (Size == 8) {
- Src = _And(OpSize::i64Bit, UnmaskedSrc, _Constant(0x1F));
+ Src = _And(OpSize::i64Bit, UnmaskedSrc, _InlineConstant(0x1F));
}

_RotateFlags(OpSizeFromSrc(Op), Res, Src, Left);
@@ -1730,7 +1725,7 @@ void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) {
auto Size = OpSizeFromSrc(Op);

auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
- auto Result = _Xor(Size, _Sub(Size, Src, _Constant(1)), Src);
+ auto Result = _Xor(Size, _Sub(Size, Src, _InlineConstant(1)), Src);

StoreResult(GPRClass, Op, Result, -1);
InvalidatePF_AF();
@@ -1752,7 +1747,7 @@ void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) {
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Size = OpSizeFromSrc(Op);

- auto Result = _And(Size, _Sub(Size, Src, _Constant(1)), Src);
+ auto Result = _And(Size, _Sub(Size, Src, _InlineConstant(1)), Src);
StoreResult(GPRClass, Op, Result, -1);

auto Zero = _Constant(0);
@@ -1817,8 +1812,8 @@ void OpDispatchBuilder::BZHI(OpcodeArgs) {
auto Result = _NZCVSelect(IR::SizeToOpSize(Size), {COND_NEQ}, Src, MaskResult);
StoreResult(GPRClass, Op, Result, -1);

- auto Zero = _Constant(0);
- auto One = _Constant(1);
+ auto Zero = _InlineConstant(0);
+ auto One = _InlineConstant(1);
auto CFInv = _NZCVSelect(OpSize::i32Bit, {COND_EQ}, One, Zero);

InvalidatePF_AF();
@@ -1849,7 +1844,7 @@ void OpDispatchBuilder::RORX(OpcodeArgs) {
auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Result = Src;
if (DoRotation) [[likely]] {
- Result = _Ror(OpSizeFromSrc(Op), Src, _Constant(Amount));
+ Result = _Ror(OpSizeFromSrc(Op), Src, _InlineConstant(Amount));
}

StoreResult(GPRClass, Op, Result, -1);
@@ -2057,7 +2052,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {
return;
}

- Ref SrcMasked = _And(OpSize, Src, _Constant(Size, Mask));
+ Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask));
Calculate_ShiftVariable(
Op, SrcMasked,
[this, Op, Size, OpSize]() {
@@ -2282,7 +2277,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
return;
}

- Ref SrcMasked = _And(OpSize, Src, _Constant(Size, Mask));
+ Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask));
Calculate_ShiftVariable(
Op, SrcMasked,
[this, Op, Size, OpSize]() {
@@ -2305,7 +2300,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
SetCFDirect(NewCF, 0, true);

// Since Shift != 0 we can inject the CF. Shift absorbs the masking.
- Ref CFShl = _Sub(OpSize, Src, _Constant(Size, 1));
+ Ref CFShl = _Sub(OpSize, Src, _InlineConstant(1));
auto TmpCF = _Lshl(OpSize, CF, CFShl);
Res = _Or(OpSize, Res, TmpCF);

27 changes: 12 additions & 15 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
@@ -76,16 +76,13 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
// Calculate flags early.
CalculateDeferredFlags();

- Ref Original = _Constant(0);
-
// SF/ZF and N/Z are together on both arm64 and x86_64, so we special case that.
bool GetNZ = (FlagsMask & (1 << FEXCore::X86State::RFLAG_SF_RAW_LOC)) && (FlagsMask & (1 << FEXCore::X86State::RFLAG_ZF_RAW_LOC));

// Handle CF first, since it's at bit 0 and hence doesn't need shift or OR.
- if (FlagsMask & (1 << FEXCore::X86State::RFLAG_CF_RAW_LOC)) {
- static_assert(FEXCore::X86State::RFLAG_CF_RAW_LOC == 0);
- Original = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);
- }
+ LOGMAN_THROW_A_FMT(FlagsMask & (1 << FEXCore::X86State::RFLAG_CF_RAW_LOC), "CF always handled");
+ static_assert(FEXCore::X86State::RFLAG_CF_RAW_LOC == 0);
+ Ref Original = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

for (size_t i = 0; i < FlagOffsets.size(); ++i) {
const auto FlagOffset = FlagOffsets[i];
@@ -116,7 +113,7 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
// instead.
if (FlagsMask & (1 << FEXCore::X86State::RFLAG_PF_RAW_LOC)) {
// Set every bit except the bottommost.
- auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false, false), _Constant(~1ull));
+ auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false, false), _InlineConstant(~1ull));

// Rotate the bottom bit to the appropriate location for PF, so we get
// something like 111P1111. Then invert that to get 000p0000. Then OR that
@@ -129,13 +126,13 @@ if (GetNZ) {
if (GetNZ) {
static_assert(FEXCore::X86State::RFLAG_SF_RAW_LOC == (FEXCore::X86State::RFLAG_ZF_RAW_LOC + 1));
auto NZCV = GetNZCV();
- auto NZ = _And(OpSize::i64Bit, NZCV, _Constant(0b11u << 30));
+ auto NZ = _And(OpSize::i64Bit, NZCV, _InlineConstant(0b11u << 30));
Original = _Orlshr(OpSize::i64Bit, Original, NZ, 31 - FEXCore::X86State::RFLAG_SF_RAW_LOC);
}

// The constant is OR'ed in at the end, to avoid a pointless or xzr, #2.
if ((1U << X86State::RFLAG_RESERVED_LOC) & FlagsMask) {
- Original = _Or(OpSize::i64Bit, Original, _Constant(2));
+ Original = _Or(OpSize::i64Bit, Original, _InlineConstant(2));
}

return Original;
@@ -266,8 +263,8 @@ Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) {
}

Ref OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, Ref Src1, Ref Src2) {
- auto Zero = _Constant(0);
- auto One = _Constant(1);
+ auto Zero = _InlineConstant(0);
+ auto One = _InlineConstant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
Ref Res;

@@ -303,8 +300,8 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, Ref Src1, Ref Src2) {
}

Ref OpDispatchBuilder::CalculateFlags_SBB(uint8_t SrcSize, Ref Src1, Ref Src2) {
- auto Zero = _Constant(0);
- auto One = _Constant(1);
+ auto Zero = _InlineConstant(0);
+ auto One = _InlineConstant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;

CalculateAF(Src1, Src2);
@@ -408,7 +405,7 @@ void OpDispatchBuilder::CalculateFlags_MUL(uint8_t SrcSize, Ref Res, Ref High) {

// If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF
// undefined, this does what we need after inverting carry.
- auto Zero = _Constant(0);
+ auto Zero = _InlineConstant(0);
_CondSubNZCV(OpSize::i64Bit, Zero, Zero, CondClassType {COND_EQ}, 0x1 /* nzcV */);
CFInverted = true;
}
@@ -417,7 +414,7 @@ void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) {
HandleNZCVWrite();
InvalidatePF_AF();

- auto Zero = _Constant(0);
+ auto Zero = _InlineConstant(0);
OpSize Size = IR::SizeToOpSize(GetOpSize(High));

// CF and OF are set if the result of the operation can't be fit in to the destination register
(Diffs for the remaining 4 changed files are not shown here.)
