Skip to content

Commit

Permalink
[NativeAOT/ARM64] Generate frames compatible with Apple compact unwinding (dotnet#107766)
Browse files Browse the repository at this point in the history

* JIT/ARM64: Add ability to generate frames compatible with Apple compact
unwinding format.

For NativeAOT/ARM64/Apple ABI do the following:
- Save callee registers in opposite order and in pairs.
- Prefer saving FP/LR on the top of the frame. Heuristics are used to
  avoid worse code quality outside of prolog/epilog due to addressing
  range limits of the ARM64 instruction set.
- Added an optimization to lvaFrameAddress to rewrite FP-x references to
  SP+y when possible. This allows efficient addressing using positive
  offsets when FP points to the top of the frame. It mimics a similar
  optimization on ARM32.

* ObjWriter: For Mach-O ARM64 try to convert the DWARF CFI unwinding codes
into compact unwinding code

* Disable lvaFrameAddress FP->SP optimization for OSR methods
  • Loading branch information
filipnavara committed Jan 15, 2025
1 parent 5a395ed commit 35d6c08
Show file tree
Hide file tree
Showing 7 changed files with 329 additions and 86 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ class CodeGen final : public CodeGenInterface
virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const;
bool genSaveFpLrWithAllCalleeSavedRegisters;
bool genForceFuncletFrameType5;
bool genReverseAndPairCalleeSavedRegisters;
#endif // TARGET_ARM64

//-------------------------------------------------------------------------
Expand Down
46 changes: 38 additions & 8 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,12 +845,19 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i

for (int i = 0; i < regStack.Height(); ++i)
{
RegPair regPair = regStack.Bottom(i);
RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Top(i) : regStack.Bottom(i);
if (regPair.reg2 != REG_NA)
{
// We can use a STP instruction.
genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
nullptr);
if (genReverseAndPairCalleeSavedRegisters)
{
genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, REG_IP0, nullptr);
}
else
{
genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
nullptr);
}

spOffset += 2 * slotSize;
}
Expand Down Expand Up @@ -926,8 +933,9 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe

// Save integer registers at higher addresses than floating-point registers.

regMaskTP maskSaveRegsFrame = regsToSaveMask & (RBM_FP | RBM_LR);
regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT;
regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat;
regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat & ~maskSaveRegsFrame;

if (maskSaveRegsFloat != RBM_NONE)
{
Expand All @@ -939,6 +947,13 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
if (maskSaveRegsInt != RBM_NONE)
{
genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset);
spDelta = 0;
lowestCalleeSavedOffset += genCountBits(maskSaveRegsInt) * FPSAVE_REGSIZE_BYTES;
}

if (maskSaveRegsFrame != RBM_NONE)
{
genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, nullptr);
// No need to update spDelta, lowestCalleeSavedOffset since they're not used after this.
}
}
Expand Down Expand Up @@ -970,13 +985,20 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
stackDelta = spDelta;
}

RegPair regPair = regStack.Top(i);
RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Bottom(i) : regStack.Top(i);
if (regPair.reg2 != REG_NA)
{
spOffset -= 2 * slotSize;

genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, REG_IP1,
nullptr);
if (genReverseAndPairCalleeSavedRegisters)
{
genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, REG_IP1, nullptr);
}
else
{
genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair,
REG_IP1, nullptr);
}
}
else
{
Expand Down Expand Up @@ -1043,11 +1065,19 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in

// Save integer registers at higher addresses than floating-point registers.

regMaskTP maskRestoreRegsFrame = regsToRestoreMask & (RBM_FP | RBM_LR);
regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT;
regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat;
regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat & ~maskRestoreRegsFrame;

// Restore in the opposite order of saving.

if (maskRestoreRegsFrame != RBM_NONE)
{
int spFrameDelta = (maskRestoreRegsFloat != RBM_NONE || maskRestoreRegsInt != RBM_NONE) ? 0 : spDelta;
spOffset -= 2 * REGSIZE_BYTES;
genEpilogRestoreRegPair(REG_FP, REG_LR, spOffset, spFrameDelta, false, REG_IP1, nullptr);
}

if (maskRestoreRegsInt != RBM_NONE)
{
int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment?
Expand Down
24 changes: 24 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ CodeGen::CodeGen(Compiler* theCompiler)
#ifdef TARGET_ARM64
genSaveFpLrWithAllCalleeSavedRegisters = false;
genForceFuncletFrameType5 = false;
genReverseAndPairCalleeSavedRegisters = false;
#endif // TARGET_ARM64
}

Expand Down Expand Up @@ -4812,6 +4813,29 @@ void CodeGen::genFinalizeFrame()
}
#endif // TARGET_ARM

#ifdef TARGET_ARM64
if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform)
{
JITDUMP("Setting genReverseAndPairCalleeSavedRegisters = true");

genReverseAndPairCalleeSavedRegisters = true;

// Make sure we push the registers in pairs if possible. If we only allocate a contiguous
// block of registers this should add at most one integer and at most one floating point
// register to the list. The stack has to be 16-byte aligned, so in worst case it results
// in allocating 16 bytes more space on stack if odd number of integer and odd number of
// FP registers were occupied. Same number of instructions will be generated, just the
// STR instructions are replaced with STP (store pair).
regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask();
regMaskTP maskPairRegs = ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) |
((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1);
if (maskPairRegs != RBM_NONE)
{
regSet.rsSetRegsModified(maskPairRegs);
}
}
#endif

#ifdef DEBUG
if (verbose)
{
Expand Down
10 changes: 10 additions & 0 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2808,6 +2808,16 @@ inline
{
*pBaseReg = REG_SPBASE;
}
#elif defined(TARGET_ARM64)
if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 && !opts.IsOSR() &&
lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
{
int spVarOffset = varOffset + codeGen->genSPtoFPdelta();
JITDUMP("lvaFrameAddress optimization for V%02u: [FP-%d] -> [SP+%d]\n", varNum, -varOffset, spVarOffset);
FPbased = false;
varOffset = spVarOffset;
}
*pFPbased = FPbased;
#else
*pFPbased = FPbased;
#endif
Expand Down
139 changes: 85 additions & 54 deletions src/coreclr/jit/lclvars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5641,7 +5641,9 @@ void Compiler::lvaFixVirtualFrameOffsets()
#endif

// The delta to be added to virtual offset to adjust it relative to frame pointer or SP
int delta = 0;
int delta = 0;
int frameLocalsDelta = 0;
int frameBoundary = 0;

#ifdef TARGET_XARCH
delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64
Expand All @@ -5666,7 +5668,25 @@ void Compiler::lvaFixVirtualFrameOffsets()
// We set FP to be after LR, FP
delta += 2 * REGSIZE_BYTES;
}
#elif defined(TARGET_AMD64) || defined(TARGET_ARM64)
#elif defined(TARGET_ARM64)
else
{
// FP is used.
delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta();

// If we placed FP/LR at the bottom of the frame we need to shift all the variables
// on the new frame to account for it. See lvaAssignVirtualFrameOffsetsToLocals.
if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
{
// We set FP to be after LR, FP
frameLocalsDelta = 2 * REGSIZE_BYTES;
frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0;
if (info.compIsVarArgs)
frameBoundary -= MAX_REG_ARG * REGSIZE_BYTES;
}
JITDUMP("--- delta bump %d for FP frame, %d inside frame for FP/LR relocation\n", delta, frameLocalsDelta);
}
#elif defined(TARGET_AMD64)
else
{
// FP is used.
Expand Down Expand Up @@ -5734,7 +5754,7 @@ void Compiler::lvaFixVirtualFrameOffsets()

#if defined(TARGET_X86)
// On x86, we set the stack offset for a promoted field
// to match a struct parameter in lvAssignFrameOffsetsToPromotedStructs.
// to match a struct parameter in lvaAssignFrameOffsetsToPromotedStructs.
if ((!varDsc->lvIsParam || parentvarDsc->lvIsParam) && promotionType == PROMOTION_TYPE_DEPENDENT)
#else
if (!varDsc->lvIsParam && promotionType == PROMOTION_TYPE_DEPENDENT)
Expand All @@ -5754,15 +5774,23 @@ void Compiler::lvaFixVirtualFrameOffsets()

if (doAssignStkOffs)
{
JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + delta);
varDsc->SetStackOffset(varDsc->GetStackOffset() + delta);
int localDelta = delta;

if (frameLocalsDelta != 0 && varDsc->GetStackOffset() < frameBoundary)
{
localDelta += frameLocalsDelta;
}

JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(),
varDsc->GetStackOffset() + localDelta);
varDsc->SetStackOffset(varDsc->GetStackOffset() + localDelta);

#if DOUBLE_ALIGN
if (genDoubleAlign() && !codeGen->isFramePointerUsed())
{
if (varDsc->lvFramePointerBased)
{
varDsc->SetStackOffset(varDsc->GetStackOffset() - delta);
varDsc->SetStackOffset(varDsc->GetStackOffset() - localDelta);

// We need to re-adjust the offsets of the parameters so they are EBP
// relative rather than stack/frame pointer relative
Expand All @@ -5784,9 +5812,13 @@ void Compiler::lvaFixVirtualFrameOffsets()
assert(codeGen->regSet.tmpAllFree());
for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp))
{
temp->tdAdjustTempOffs(delta);
temp->tdAdjustTempOffs(delta + frameLocalsDelta);
}

if (lvaCachedGenericContextArgOffs < frameBoundary)
{
lvaCachedGenericContextArgOffs += frameLocalsDelta;
}
lvaCachedGenericContextArgOffs += delta;

#if FEATURE_FIXED_OUT_ARGS
Expand Down Expand Up @@ -6042,30 +6074,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
codeGen->setFramePointerUsed(codeGen->isFramePointerRequired());
}

#ifdef TARGET_ARM64
// Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
// a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
// need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
// and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
// frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
// (It should be legal to use these frame types for every frame).

if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
{
// Default configuration
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20));
}
else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
{
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
}
else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
{
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
}
#endif // TARGET_ARM64

#ifdef TARGET_XARCH
// On x86/amd64, the return address has already been pushed by the call instruction in the caller.
stkOffs -= TARGET_POINTER_SIZE; // return address;
Expand Down Expand Up @@ -6114,9 +6122,13 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
#endif // !TARGET_ARM

#ifdef TARGET_ARM64
// If the frame pointer is used, then we'll save FP/LR at the bottom of the stack.
// Otherwise, we won't store FP, and we'll store LR at the top, with the other callee-save
// registers (if any).
// If the frame pointer is used, then we'll save FP/LR either at the bottom of the stack
// or at the top of the stack depending on frame type. We make the decision after assigning
// the variables on the frame and then fix up the offsets in lvaFixVirtualFrameOffsets.
// For now, we proceed as if FP/LR were saved with the callee registers. If we later
// decide to move the FP/LR to the bottom of the frame it shifts all the assigned
// variables and temporaries by 16 bytes. The largest alignment we currently make is 16
// bytes for SIMD.

int initialStkOffs = 0;
if (info.compIsVarArgs)
Expand All @@ -6127,17 +6139,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
stkOffs -= initialStkOffs;
}

if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || !isFramePointerUsed()) // Note that currently we always have
// a frame pointer
{
stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;
}
else
{
// Subtract off FP and LR.
assert(compCalleeRegsPushed >= 2);
stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES;
}
stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;

#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)

Expand Down Expand Up @@ -6807,15 +6809,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
}
#endif // TARGET_AMD64

#ifdef TARGET_ARM64
if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have
// a frame pointer
{
// Create space for saving FP and LR.
stkOffs -= 2 * REGSIZE_BYTES;
}
#endif // TARGET_ARM64

#if FEATURE_FIXED_OUT_ARGS
if (lvaOutgoingArgSpaceSize > 0)
{
Expand Down Expand Up @@ -6853,6 +6846,44 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()

noway_assert(compLclFrameSize + originalFrameSize ==
(unsigned)-(stkOffs + (pushedCount * (int)TARGET_POINTER_SIZE)));

#ifdef TARGET_ARM64
// Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
// a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
// need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
// and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
// frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
// (It should be legal to use these frame types for every frame).
//
// For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout that can
// be represented with compact unwinding information. In order to maintain code quality we only do it when
// we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress optimization, or if the
// whole frame is small enough that the negative FP-based addressing can address the whole frame.

if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
{
if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform &&
(!codeGen->isFramePointerRequired() || codeGen->genTotalFrameSize() < 0x100))
{
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true);
}
else
{
// Default configuration
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
opts.compDbgEnC ||
compStressCompile(Compiler::STRESS_GENERIC_VARN, 20));
}
}
else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
{
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
}
else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
{
codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
}
#endif // TARGET_ARM64
}

//------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,18 @@ internal static class MachNative
public const uint PLATFORM_TVOSSIMULATOR = 8;
public const uint PLATFORM_WATCHOSSIMULATOR = 9;
public const uint PLATFORM_DRIVERKIT = 10;

public const uint UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
public const uint UNWIND_ARM64_MODE_DWARF = 0x03000000;
public const uint UNWIND_ARM64_MODE_FRAME = 0x04000000;
public const uint UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001;
public const uint UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002;
public const uint UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004;
public const uint UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008;
public const uint UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010;
public const uint UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100;
public const uint UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200;
public const uint UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400;
public const uint UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800;
}
}
Loading

0 comments on commit 35d6c08

Please sign in to comment.