From da880118053787851bd6588603f9a5a9c5bc1674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 2 Jun 2024 00:08:41 +0200 Subject: [PATCH 1/4] Specialize a few arithmetic instructions for the interpreter. --- Core/MIPS/IR/IRFrontend.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 15 +++++++++- Core/MIPS/IR/IRInst.h | 7 +++++ Core/MIPS/IR/IRInterpreter.cpp | 13 ++++++++ Core/MIPS/IR/IRPassSimplify.cpp | 53 +++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 2 ++ 6 files changed, 90 insertions(+), 2 deletions(-) diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 36efd6ad722c..72be918765d5 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -291,7 +291,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, u32 &m if (opts.optimizeForInterpreter) { // Add special passes here. - // passes.push_back(&ReorderLoadStore); + passes.push_back(&OptimizeForInterpreter); } if (IRApplyPasses(passes.data(), passes.size(), ir, simplified, opts)) logBlocks = 1; diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 7a1453f2f5cb..c4a152e0b5aa 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -1,4 +1,5 @@ #include "Common/CommonFuncs.h" +#include "Common/Log.h" #include "Core/MIPS/IR/IRInst.h" #include "Core/MIPS/MIPSDebugInterface.h" #include "Core/HLE/ReplaceTables.h" @@ -8,6 +9,7 @@ // _ = ignore // G = GPR register // C = 32-bit constant from array +// c = 8-bit constant from array // I = immediate value from instruction // F = FPR register, single // V = FPR register, Vec4. Reg number always divisible by 4. @@ -29,10 +31,13 @@ static const IRMeta irMeta[] = { { IROp::Or, "Or", "GGG" }, { IROp::Xor, "Xor", "GGG" }, { IROp::AddConst, "AddConst", "GGC" }, + { IROp::OptAddConst, "OptAddConst", "GC" }, { IROp::SubConst, "SubConst", "GGC" }, { IROp::AndConst, "AndConst", "GGC" }, { IROp::OrConst, "OrConst", "GGC" }, { IROp::XorConst, "XorConst", "GGC" }, + { IROp::OptAndConst, "OptAndConst", "GC" }, + { IROp::OptOrConst, "OptOrConst", "GC" }, { IROp::Shl, "Shl", "GGG" }, { IROp::Shr, "Shr", "GGG" }, { IROp::Sar, "Sar", "GGG" }, @@ -128,7 +133,7 @@ static const IRMeta irMeta[] = { { IROp::FCmpVfpuAggregate, "FCmpVfpuAggregate", "I" }, { IROp::Vec4Init, "Vec4Init", "Vv" }, { IROp::Vec4Shuffle, "Vec4Shuffle", "VVs" }, - { IROp::Vec4Blend, "Vec4Blend", "VVVC" }, + { IROp::Vec4Blend, "Vec4Blend", "VVVc" }, { IROp::Vec4Mov, "Vec4Mov", "VV" }, { IROp::Vec4Add, "Vec4Add", "VVV" }, { IROp::Vec4Sub, "Vec4Sub", "VVV" }, @@ -218,6 +223,11 @@ int IRWriter::AddConstantFloat(float value) { return AddConstant(val); } +void IRWriter::ReplaceConstant(size_t instNumber, u32 newConstant) { + _dbg_assert_(instNumber < insts_.size()); + insts_[instNumber].constant = newConstant; +} + static std::string GetGPRName(int r) { if (r < 32) { return currentDebugMIPS->GetRegName(0, r); @@ -295,6 +305,9 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, u32 constant) case 'C': snprintf(buf, bufSize, "%08x", constant); break; + case 'c': + snprintf(buf, bufSize, "%02x", constant); + break; case 'I': snprintf(buf, bufSize, "%02x", param); break; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index e2d855a3e032..473600994441 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -17,6 +17,9 @@ // even be directly JIT-ed, but the gains will probably be tiny over our older direct // MIPS->target JITs. +// Ops beginning with "OI" are specialized for IR Interpreter use. These will not be produced +// for the IR JITs. + enum class IROp : uint8_t { SetConst, SetConstF, @@ -33,11 +36,14 @@ enum class IROp : uint8_t { Xor, AddConst, + OptAddConst, SubConst, AndConst, OrConst, XorConst, + OptAndConst, + OptOrConst, Shl, Shr, @@ -391,6 +397,7 @@ class IRWriter { void Clear() { insts_.clear(); } + void ReplaceConstant(size_t instNumber, u32 newConstant); const std::vector &GetInstructions() const { return insts_; } diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index e73aed8640da..1b2a26819cdc 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -120,15 +120,24 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::AddConst: mips->r[inst->dest] = mips->r[inst->src1] + inst->constant; break; + case IROp::OptAddConst: // For this one, it's worth having a "unary" variant of the above that only needs to read one register param. + mips->r[inst->dest] += inst->constant; + break; case IROp::SubConst: mips->r[inst->dest] = mips->r[inst->src1] - inst->constant; break; case IROp::AndConst: mips->r[inst->dest] = mips->r[inst->src1] & inst->constant; break; + case IROp::OptAndConst: // For this one, it's worth having a "unary" variant of the above that only needs to read one register param. + mips->r[inst->dest] &= inst->constant; + break; case IROp::OrConst: mips->r[inst->dest] = mips->r[inst->src1] | inst->constant; break; + case IROp::OptOrConst: + mips->r[inst->dest] |= inst->constant; + break; case IROp::XorConst: mips->r[inst->dest] = mips->r[inst->src1] ^ inst->constant; break; @@ -431,6 +440,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::Vec2Pack31To16: { + // Used in Tekken 6 + u32 val = (mips->fi[inst->src1] >> 15) & 0xFFFF; val |= (mips->fi[inst->src1 + 1] << 1) & 0xFFFF0000; mips->fi[inst->dest] = val; @@ -451,6 +462,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::Vec4Pack31To8: { + // Used in Tekken 6 + // Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2. // pshufb or SSE4 instructions can be used instead. u32 val = (mips->fi[inst->src1] >> 23) & 0xFF; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 16ac30cb1003..25f4871fe663 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -2149,3 +2149,56 @@ bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) { } return logBlocks; } + +bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) { + CONDITIONAL_DISABLE; + // This tells us to skip an AND op that has been optimized out. + // Maybe we could skip multiple, but that'd slow things down and is pretty uncommon. + int nextSkip = -1; + + bool logBlocks = false; + // We also move the downcount to the top so the interpreter can assume that it's there. + bool foundDowncount = false; + out.Write(IROp::Downcount); + + for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { + IRInst inst = in.GetInstructions()[i]; + + // Specialize some instructions. + switch (inst.op) { + case IROp::Downcount: + if (!foundDowncount) { + // Move the value into the initial Downcount. + foundDowncount = true; + out.ReplaceConstant(0, inst.constant); + } else { + // Already had a downcount. Let's just re-emit it. + out.Write(inst); + } + break; + case IROp::AddConst: + if (inst.src1 == inst.dest) { + inst.op = IROp::OptAddConst; + } + out.Write(inst); + break; + case IROp::AndConst: + if (inst.src1 == inst.dest) { + inst.op = IROp::OptAndConst; + } + out.Write(inst); + break; + case IROp::OrConst: + if (inst.src1 == inst.dest) { + inst.op = IROp::OptOrConst; + } + out.Write(inst); + break; + default: + out.Write(inst); + break; + } + } + + return logBlocks; +} diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index c7c644351eee..f3f484a3a7d3 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -16,3 +16,5 @@ bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts); bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts); bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts); bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts); + +bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts); From bd0beb68a4dc42ab715f3dea16da8e15646d2a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 7 Jun 2024 11:07:21 +0200 Subject: [PATCH 2/4] Add new IR optimization pass, OptimizeLoadsAfterStores --- Core/MIPS/IR/IRFrontend.cpp | 1 + Core/MIPS/IR/IRJit.cpp | 8 +++++-- Core/MIPS/IR/IRPassSimplify.cpp | 39 +++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 1 + 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 72be918765d5..4af6d9a753f4 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -284,6 +284,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, u32 &m &PropagateConstants, &PurgeTemps, &ReduceVec4Flush, + &OptimizeLoadsAfterStores, // &ReorderLoadStore, // &MergeLoadStore, // &ThreeOpToTwoOp, diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index f962e9518020..942c801936d9 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -255,15 +255,19 @@ void IRJit::RunLoopUntil(u64 globalticks) { u32 opcode = inst & 0xFF000000; if (opcode == MIPS_EMUHACK_OPCODE) { u32 offset = inst & 0x00FFFFFF; // Alternatively, inst - opcode + const IRInst *instPtr = blocks_.GetArenaPtr() + offset; + _dbg_assert_(instPtr->op == IROp::Downcount); + mips->downcount -= instPtr->constant; + instPtr++; #ifdef IR_PROFILING IRBlock *block = blocks_.GetBlock(blocks_.GetBlockNumFromOffset(offset)); TimeSpan span; - mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset); + mips->pc = IRInterpret(mips, instPtr); int64_t elapsedNanos = span.ElapsedNanos(); block->profileStats_.executions += 1; block->profileStats_.totalNanos += elapsedNanos; #else - mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset); + mips->pc = IRInterpret(mips, instPtr); #endif // Note: this will "jump to zero" on a badly constructed block missing exits. if (!Memory::IsValid4AlignedAddress(mips->pc)) { diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 25f4871fe663..7032017a5dc6 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -2150,6 +2150,45 @@ bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) { return logBlocks; } +// This optimizes away redundant loads-after-stores, which are surprisingly not that uncommon. +bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) { + CONDITIONAL_DISABLE; + // This tells us to skip an AND op that has been optimized out. + // Maybe we could skip multiple, but that'd slow things down and is pretty uncommon. + int nextSkip = -1; + + bool logBlocks = false; + for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { + IRInst inst = in.GetInstructions()[i]; + + // Just copy the last instruction. + if (i == n - 1) { + out.Write(inst); + break; + } + + out.Write(inst); + + IRInst next = in.GetInstructions()[i + 1]; + switch (inst.op) { + case IROp::Store32: + if (next.op == IROp::Load32 && + next.constant == inst.constant && + next.dest == inst.src3 && + next.src1 == inst.src1) { + // The upcoming load is completely redundant. + // Skip it. + i++; + } + break; + default: + break; + } + } + + return logBlocks; +} + bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; // This tells us to skip an AND op that has been optimized out. diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index f3f484a3a7d3..5fbd2a8fba02 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -17,4 +17,5 @@ bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts); bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts); bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts); +bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts); bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts); From d1e0384b2f75cd947069b439766c4ac4b14672b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 7 Jun 2024 11:26:10 +0200 Subject: [PATCH 3/4] Improve disasm --- Core/MIPS/IR/IRInst.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index c4a152e0b5aa..a4edebecab97 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -303,13 +303,13 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, u32 constant) } break; case 'C': - snprintf(buf, bufSize, "%08x", constant); + snprintf(buf, bufSize, "0x%08x", constant); break; case 'c': - snprintf(buf, bufSize, "%02x", constant); + snprintf(buf, bufSize, "0x%02x", constant); break; case 'I': - snprintf(buf, bufSize, "%02x", param); + snprintf(buf, bufSize, "0x%02x", param); break; case 'm': snprintf(buf, bufSize, "%d", param); From 0c246297d2be483620c05f30a86b156301ab4511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 7 Jun 2024 21:25:39 +0200 Subject: [PATCH 4/4] Create an IR op for a FPRtoGPR + shift-right-8, very common --- Core/MIPS/IR/IRInst.cpp | 1 + Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRInterpreter.cpp | 8 +++++++- Core/MIPS/IR/IRPassSimplify.cpp | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index a4edebecab97..083b6acba333 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -120,6 +120,7 @@ static const IRMeta irMeta[] = { { IROp::FSatMinus1_1, "FSat(-1 - 1)", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, + { IROp::OptFMovToGPRShr8, "OptFMovToGPRShr8", "GF" }, { IROp::FpCondFromReg, "FpCondFromReg", "_G" }, { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::FpCtrlFromReg, "FpCtrlFromReg", "_G" }, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 473600994441..6cdd3b88bada 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -139,6 +139,7 @@ enum class IROp : uint8_t { FMovFromGPR, FMovToGPR, + OptFMovToGPRShr8, FSat0_1, FSatMinus1_1, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 1b2a26819cdc..7fe2990ff474 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -1000,7 +1000,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::FMovToGPR: memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); break; - + case IROp::OptFMovToGPRShr8: + { + u32 temp; + memcpy(&temp, &mips->f[inst->src1], 4); + mips->r[inst->dest] = temp >> 8; + break; + } case IROp::ExitToConst: return inst->constant; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 7032017a5dc6..7f23846cea1e 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -2203,6 +2203,8 @@ bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions & for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { IRInst inst = in.GetInstructions()[i]; + bool last = i == n - 1; + // Specialize some instructions. switch (inst.op) { case IROp::Downcount: @@ -2233,6 +2235,19 @@ bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions & } out.Write(inst); break; + case IROp::FMovToGPR: + if (!last) { + IRInst next = in.GetInstructions()[i + 1]; + if (next.op == IROp::ShrImm && next.src2 == 8 && next.src1 == next.dest && next.src1 == inst.dest) { + // Heavily used when writing display lists. + inst.op = IROp::OptFMovToGPRShr8; + i++; // Skip the next instruction. + } + out.Write(inst); + } else { + out.Write(inst); + } + break; default: out.Write(inst); break;