Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x86jit: Perform vector transfers instead of flushing to memory #18234

Merged
merged 5 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Common/x64Emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar

void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only

// THESE TWO ARE UNTESTED.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}

Expand Down Expand Up @@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}

// SSE4.1 INSERTPS: emits opcode 0x66 / 0x3A21 followed by one immediate byte.
// The immediate is assembled as: srcsubreg in bits 7:6, dstsubreg in bits 5:4,
// zmask in the low bits. Nothing is masked here, so callers must keep
// dstsubreg/srcsubreg within 0..3 and zmask within 0..15 or the fields overlap.
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) {
	// Pure computation, so building it before the opcode is emitted changes nothing.
	const int imm8 = (srcsubreg << 6) | (dstsubreg << 4) | zmask;

	// The trailing 1 presumably accounts for the immediate byte written next - confirm in WriteSSE41Op.
	WriteSSE41Op(0x66, 0x3A21, dest, arg, 1);
	Write8(imm8);
}
// SSE4.1 EXTRACTPS: emits opcode 0x66 / 0x3A17 followed by the lane selector byte.
// Note the operand order: the XMM source (arg) is passed in the register slot and
// the destination OpArg in the second operand slot of WriteSSE41Op.
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) {
	// The trailing 1 presumably accounts for the immediate byte written next - confirm in WriteSSE41Op.
	WriteSSE41Op(0x66, 0x3A17, arg, dest, 1);
	// Immediate: which lane of the source to extract.
	Write8(subreg);
}

void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
Expand Down Expand Up @@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }
Expand Down
12 changes: 7 additions & 5 deletions Common/x64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -684,12 +684,14 @@ class XEmitter

// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
void DPPD(X64Reg dest, OpArg src, u8 arg);

// These are probably useful for VFPU emulation.
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif

// SSE4: Insert and extract for floats.
// Note: insert from memory or an XMM.
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// Extract to memory or GPR.
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);

// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
void HADDPS(X64Reg dest, OpArg src);

Expand Down Expand Up @@ -1040,7 +1042,7 @@ class XEmitter
// Can only extract from the low 128 bits.
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/ARM64/Arm64IRRegCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ void Arm64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}

bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags, skip the check for a little speed.
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/ARM64/Arm64IRRegCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class Arm64IRRegCache : public IRNativeRegCacheBase {
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;

bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
Expand Down
67 changes: 50 additions & 17 deletions Core/MIPS/IR/IRRegCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -406,12 +406,12 @@ IRNativeReg IRNativeRegCacheBase::FindFreeReg(MIPSLoc type, MIPSMap flags) const

bool IRNativeRegCacheBase::IsGPRClobbered(IRReg gpr) const {
_dbg_assert_(IsValidGPR(gpr));
return IsRegClobbered(MIPSLoc::REG, MIPSMap::INIT, gpr);
return IsRegClobbered(MIPSLoc::REG, gpr);
}

bool IRNativeRegCacheBase::IsFPRClobbered(IRReg fpr) const {
_dbg_assert_(IsValidFPR(fpr));
return IsRegClobbered(MIPSLoc::FREG, MIPSMap::INIT, fpr + 32);
return IsRegClobbered(MIPSLoc::FREG, fpr + 32);
}

IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const {
Expand All @@ -423,7 +423,7 @@ IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc t
return IRUsage::UNKNOWN;
}

bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const {
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, IRReg r) const {
static const int UNUSED_LOOKAHEAD_OPS = 30;

IRSituation info;
Expand All @@ -450,6 +450,21 @@ bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r)
return false;
}

// Returns true only when GetNextRegUsage() reports IRUsage::READ for `first`,
// scanning from the op after the current one with a lookahead of UNUSED_LOOKAHEAD_OPS.
bool IRNativeRegCacheBase::IsRegRead(MIPSLoc type, IRReg first) const {
	static const int UNUSED_LOOKAHEAD_OPS = 30;

	IRSituation situation;
	situation.instructions = irBlock_->GetInstructions();
	situation.numInstructions = irBlock_->GetNumInstructions();
	// Unlike the spill heuristics, the scan begins one op past the current index.
	situation.currentIndex = irIndex_ + 1;
	situation.lookaheadCount = UNUSED_LOOKAHEAD_OPS;

	// Deliberately checks just this one lane rather than the full register.
	return GetNextRegUsage(situation, type, first) == IRUsage::READ;
}

IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
Expand Down Expand Up @@ -501,7 +516,7 @@ IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, b
return -1;
}

bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);

Expand All @@ -514,6 +529,11 @@ bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type,
return false;
}

// Hook for moving the contents of native register nreg directly into another native
// register instead of flushing to memory. This base implementation never transfers.
//
// Returns true if the transfer was performed, false otherwise. The MapNativeReg callers
// in this file fall back to FlushNativeReg() when this returns false, and after a
// successful call with dest == -1 they read the new register from mr[first].nReg.
// NOTE(review): dest == -1 presumably means "no specific destination" - confirm against
// the backend overrides.
bool IRNativeRegCacheBase::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
// To be overridden if the backend supports transfers.
return false;
}

void IRNativeRegCacheBase::DiscardNativeReg(IRNativeReg nreg) {
_assert_msg_(nreg >= 0 && nreg < config_.totalNativeRegs, "DiscardNativeReg on invalid register %d", nreg);
if (nr[nreg].mipsReg != IRREG_INVALID) {
Expand Down Expand Up @@ -930,21 +950,28 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
case MIPSLoc::REG:
if (type != MIPSLoc::REG) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
// If it's not compatible, we'll need to reallocate.
// TODO: Could do a transfer and avoid memory flush.
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;

case MIPSLoc::FREG:
case MIPSLoc::VREG:
if (type != mr[first].loc) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;

Expand Down Expand Up @@ -981,10 +1008,13 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
_assert_msg_(!mreg.isStatic, "Cannot MapNativeReg a static reg mismatch");
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// If we need init, we have to flush mismatches.
// TODO: Do a shuffle if interior only?
// TODO: We may also be motivated to have multiple read-only "views" or an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags)) {
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
}
// The mismatch has been "resolved" now.
mismatch = false;
} else if (oldlanes != 1) {
// Even if we don't care about the current contents, we can't discard outside.
bool extendsBefore = oldlane > i;
Expand Down Expand Up @@ -1017,6 +1047,9 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
DiscardNativeReg(mreg.nReg);
else
FlushNativeReg(mreg.nReg);

// That took care of the mismatch, either by clobber or flush.
mismatch = false;
}
}
}
Expand All @@ -1027,8 +1060,8 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// We better not be trying to map to a different nreg if it's in one now.
// This might happen on some sort of transfer...
// TODO: Make a direct transfer, i.e. FREG -> VREG?
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags))
FlushNativeReg(mreg.nReg);
} else {
DiscardNativeReg(mreg.nReg);
}
Expand Down
6 changes: 4 additions & 2 deletions Core/MIPS/IR/IRRegCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,14 @@ class IRNativeRegCacheBase {
IRNativeReg AllocateReg(MIPSLoc type, MIPSMap flags);
IRNativeReg FindFreeReg(MIPSLoc type, MIPSMap flags) const;
IRNativeReg FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const;
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags);
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes);
virtual void DiscardNativeReg(IRNativeReg nreg);
virtual void FlushNativeReg(IRNativeReg nreg);
virtual void DiscardReg(IRReg mreg);
virtual void FlushReg(IRReg mreg);
virtual void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state);
virtual void MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg first, int lanes, MIPSMap flags);
virtual bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
virtual IRNativeReg MapNativeReg(MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
IRNativeReg MapNativeRegAsPointer(IRReg gpr);

Expand All @@ -238,7 +239,8 @@ class IRNativeRegCacheBase {
void SetSpillLockIRIndex(IRReg reg, int index);
int GetMipsRegOffset(IRReg r);

bool IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const;
bool IsRegClobbered(MIPSLoc type, IRReg r) const;
bool IsRegRead(MIPSLoc type, IRReg r) const;
IRUsage GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const;

bool IsValidGPR(IRReg r) const;
Expand Down
4 changes: 2 additions & 2 deletions Core/MIPS/RiscV/RiscVRegCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,11 @@ void RiscVRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}

bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags except VREG, skip the check for a little speed.
if (type != MIPSLoc::VREG)
return true;
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags);
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags, lanes);
}

void RiscVRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/RiscV/RiscVRegCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class RiscVRegCache : public IRNativeRegCacheBase {
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;

bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
Expand Down
Loading