From c5cefd3f196cc2b2d0d2189aecad4e35d86b23ac Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 4 Feb 2025 11:13:25 -0800 Subject: [PATCH 01/11] proof of concept --- src/coreclr/jit/codegenarm.cpp | 6 +- src/coreclr/jit/codegenarm64.cpp | 12 ++-- src/coreclr/jit/codegenloongarch64.cpp | 14 ++-- src/coreclr/jit/codegenriscv64.cpp | 13 ++-- src/coreclr/jit/codegenxarch.cpp | 10 +-- src/coreclr/jit/gentree.cpp | 3 + src/coreclr/jit/gentree.h | 3 + src/coreclr/jit/helperexpansion.cpp | 68 ++++++++++++++++-- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/objectalloc.cpp | 95 ++++++++++++++++++++------ src/coreclr/jit/objectalloc.h | 44 +++++++----- 11 files changed, 201 insertions(+), 68 deletions(-) diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 92d6bc8635224e..fc7cfc33f1c2e2 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -385,6 +385,8 @@ void CodeGen::genLclHeap(GenTree* tree) GenTree* size = tree->AsOp()->gtOp1; noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL)); + bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT); + // Result of localloc will be returned in regCnt. // Also it used as temporary register in code generation // for storing allocation size @@ -470,7 +472,7 @@ void CodeGen::genLclHeap(GenTree* tree) goto ALLOC_DONE; } - else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <= + else if (!initMem && (amount < compiler->eeGetPageSize())) // must be < not <= { // Since the size is less than a page, simply adjust the SP value. // The SP might already be in the guard page, must touch it BEFORE @@ -494,7 +496,7 @@ void CodeGen::genLclHeap(GenTree* tree) } // Allocation - if (compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to localloc. 
// Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index ec01f356e194ed..d1de02600551b0 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3155,6 +3155,8 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI()) @@ -3184,7 +3186,7 @@ void CodeGen::genLclHeap(GenTree* tree) // Compute the size of the block to allocate and perform alignment. // If compInitMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -3232,7 +3234,7 @@ void CodeGen::genLclHeap(GenTree* tree) static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes); assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time - if (compiler->info.compInitMem) + if (initMem) { if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset)) { @@ -3303,10 +3305,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. assert(regCnt == REG_NA); - if (compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -3318,7 +3320,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? 
EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (compiler->info.compInitMem) + if (initMem) { BasicBlock* loop = genCreateTempLabel(); diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp index f8e26e956209aa..51602c0d9e14fa 100644 --- a/src/coreclr/jit/codegenloongarch64.cpp +++ b/src/coreclr/jit/codegenloongarch64.cpp @@ -1600,6 +1600,8 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI()) @@ -1626,9 +1628,9 @@ void CodeGen::genLclHeap(GenTree* tree) emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0); // Compute the size of the block to allocate and perform alignment. - // If compInitMem=true, we can reuse targetReg as regcnt, + // If initMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -1680,7 +1682,7 @@ void CodeGen::genLclHeap(GenTree* tree) static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time size_t stpCount = amount / (REGSIZE_BYTES * 2); - if (compiler->info.compInitMem) + if (initMem) { if (stpCount <= 4) { @@ -1727,10 +1729,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. 
assert(regCnt == REG_NA); - if (compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -1742,7 +1744,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to locAlloc. // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 8efe6e0827125c..b04d138992aca1 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -1508,6 +1508,7 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT); const target_size_t pageSize = compiler->eeGetPageSize(); // According to RISC-V Privileged ISA page size is 4KiB @@ -1539,9 +1540,9 @@ void CodeGen::genLclHeap(GenTree* tree) emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0); // Compute the size of the block to allocate and perform alignment. - // If compInitMem=true, we can reuse targetReg as regcnt, + // If initMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. 
- if (compiler->info.compInitMem) + if (initMem) { regCnt = targetReg; } @@ -1592,7 +1593,7 @@ void CodeGen::genLclHeap(GenTree* tree) static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time size_t stpCount = amount / (REGSIZE_BYTES * 2); - if (compiler->info.compInitMem) + if (initMem) { if (stpCount <= 4) { @@ -1641,10 +1642,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. assert(regCnt == REG_NA); - if (compiler->info.compInitMem) + if (initMem) { regCnt = targetReg; } @@ -1655,7 +1656,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to locAlloc. // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6cffd104de9814..e0f0a4f65ffe5c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2870,6 +2870,8 @@ void CodeGen::genLclHeap(GenTree* tree) target_size_t stackAdjustment = 0; target_size_t locAllocStackOffset = 0; + bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI() && size->isContained()) @@ -2893,7 +2895,7 @@ void CodeGen::genLclHeap(GenTree* tree) // Compute the size of the block to allocate and perform alignment. // If compInitMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. 
- if (compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -2918,7 +2920,7 @@ void CodeGen::genLclHeap(GenTree* tree) inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type)); - if (compiler->info.compInitMem) + if (initMem) { // Convert the count from a count of bytes to a loop count. We will loop once per // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes @@ -2939,7 +2941,7 @@ void CodeGen::genLclHeap(GenTree* tree) } bool initMemOrLargeAlloc; // Declaration must be separate from initialization to avoid clang compiler error. - initMemOrLargeAlloc = compiler->info.compInitMem || (amount >= compiler->eeGetPageSize()); // must be >= not > + initMemOrLargeAlloc = initMem || (amount >= compiler->eeGetPageSize()); // must be >= not > #if FEATURE_FIXED_OUT_ARGS // If we have an outgoing arg area then we must adjust the SP by popping off the @@ -3013,7 +3015,7 @@ void CodeGen::genLclHeap(GenTree* tree) // We should not have any temp registers at this point. assert(internalRegisters.Count(tree) == 0); - if (compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the number of loop iterations for this loop, if each // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes. 
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ea02025450b39a..7cbb57bd907429 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -13233,6 +13233,9 @@ const char* Compiler::gtGetWellKnownArgNameForArgMsg(WellKnownArg arg) return "tail call"; case WellKnownArg::StackArrayLocal: return "&lcl arr"; + case WellKnownArg::StackArrayElemSize: + return "arr elemsz"; + default: return nullptr; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 641f0b05e1f61a..13ab784d9b66b9 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -550,6 +550,8 @@ enum GenTreeFlags : unsigned int GTF_ALLOCOBJ_EMPTY_STATIC = 0x80000000, // GT_ALLOCOBJ -- allocation site is part of an empty static pattern + GTF_LCLHEAP_MUSTINIT = 0x80000000, // GT_LCLHEAP -- allocation must be zeroed + #ifdef FEATURE_HW_INTRINSICS GTF_HW_EM_OP = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask GTF_HW_USER_CALL = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call @@ -4568,6 +4570,7 @@ enum class WellKnownArg : unsigned SwiftSelf, X86TailCallSpecialArg, StackArrayLocal, + StackArrayElemSize, }; #ifdef DEBUG diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp index ff4eb6f8433294..c7f4bb3a7f7c54 100644 --- a/src/coreclr/jit/helperexpansion.cpp +++ b/src/coreclr/jit/helperexpansion.cpp @@ -2832,17 +2832,28 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, return false; } - // If this is a local array, the new helper will have an arg for the array's address + // If this is a local array, the new helper will have an arg for the array's address or an arg + // for the array element size // CallArg* const stackLocalAddressArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayLocal); + CallArg* const elemSizeArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayElemSize); - if (stackLocalAddressArg 
== nullptr) + if ((stackLocalAddressArg == nullptr) && (elemSizeArg == nullptr)) { return false; } - JITDUMP("Expanding new array helper for stack allocated array at [%06d] in " FMT_BB ":\n", dspTreeID(call), - block->bbNum); + // If we have an elem size arg, this is intended to be a localloc + // + // Note we may have figured out the array length after we did the + // escape analysis (that is, lengthArg might be a constant), so we + // could change this from a localloc to a fixed alloc, if we + // introduced a new block lcl var. + // + bool const isLocAlloc = (elemSizeArg != nullptr); + + JITDUMP("Expanding new array helper for stack allocated array at [%06d] %sin " FMT_BB ":\n", dspTreeID(call), + isLocAlloc ? " into localloc " : "", block->bbNum); DISPTREE(call); JITDUMP("\n"); @@ -2859,7 +2870,53 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, } } - GenTree* const stackLocalAddress = stackLocalAddressArg->GetNode(); + GenTree* const lengthArg = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode(); + GenTree* stackLocalAddress = nullptr; + + // Todo -- clone and leave option to make a helper call under some runtime check + // for sufficient stack. 
+ // + if (isLocAlloc) + { + assert(elemSizeArg != nullptr); + assert(stackLocalAddressArg == nullptr); + GenTree* const elemSize = elemSizeArg->GetNode(); + assert(elemSize->IsCnsIntOrI()); + + unsigned const locallocTemp = lvaGrabTemp(true DEBUGARG("localloc stack address")); + lvaTable[locallocTemp].lvType = TYP_I_IMPL; + + GenTree* const arrayLength = gtCloneExpr(lengthArg); + GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL); + GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); + GenTree* const totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); + GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize); + GenTree* const locallocStore = gtNewStoreLclVarNode(locallocTemp, locallocNode); + Statement* const locallocStmt = fgNewStmtFromTree(locallocStore); + + gtUpdateStmtSideEffects(locallocStmt); + fgInsertStmtBefore(block, stmt, locallocStmt); + + // Array address is the result of the localloc + // + stackLocalAddress = gtNewLclVarNode(locallocTemp); + compLocallocUsed = true; + + // Codegen must zero out the new allocation. + // + locallocNode->gtFlags &= GTF_LCLHEAP_MUSTINIT; + + codeGen->setFramePointerRequired(true); + } + else + { + assert(elemSizeArg == nullptr); + assert(stackLocalAddressArg != nullptr); + + // Array address is the block local we created earlier + // + stackLocalAddress = stackLocalAddressArg->GetNode(); + } // Initialize the array method table pointer. // @@ -2873,7 +2930,6 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, // Initialize the array length. 
// - GenTree* const lengthArg = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode(); GenTree* const lengthArgInt = fgOptimizeCast(gtNewCastNode(TYP_INT, lengthArg, false, TYP_INT)); GenTree* const lengthAddress = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(stackLocalAddress), gtNewIconNode(OFFSETOF__CORINFO_Array__length, TYP_I_IMPL)); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 2d622cc33b7f5b..ba900848a10227 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -674,6 +674,7 @@ RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStac CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAllocationConditionalEscapeRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528) +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 0); RELEASE_CONFIG_INTEGER(JitEECallTimingInfo, "JitEECallTimingInfo", 0) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 8c84d47cf75536..0e9dd0a5db3f44 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -513,8 +513,7 @@ bool ObjectAllocator::MorphAllocObjNodes() case CORINFO_HELP_NEWARR_1_DIRECT: case CORINFO_HELP_NEWARR_1_ALIGN8: { - if ((data->AsCall()->gtArgs.CountUserArgs() == 2) && - data->AsCall()->gtArgs.GetUserArgByIndex(1)->GetNode()->IsCnsIntOrI()) + if (data->AsCall()->gtArgs.CountUserArgs() == 2) { allocType = OAT_NEWARR; } @@ -533,6 +532,7 @@ bool ObjectAllocator::MorphAllocObjNodes() { bool canStack = false; bool bashCall = false; + bool useLocalloc = false; const char* onHeapReason = nullptr; unsigned int lclNum = stmtExpr->AsLclVar()->GetLclNum(); @@ -578,9 +578,9 @@ bool ObjectAllocator::MorphAllocObjNodes() CORINFO_CLASS_HANDLE clsHnd = 
comp->gtGetHelperCallClassHandle(data->AsCall(), &isExact, &isNonNull); GenTree* const len = data->AsCall()->gtArgs.GetUserArgByIndex(1)->GetNode(); - assert(len != nullptr); + ssize_t arraySize = len->IsCnsIntOrI() ? len->AsIntCon()->IconValue() : -1; unsigned int blockSize = 0; comp->Metrics.NewArrayHelperCalls++; @@ -589,27 +589,36 @@ bool ObjectAllocator::MorphAllocObjNodes() onHeapReason = "[array type is either non-exact or null]"; canStack = false; } - else if (!len->IsCnsIntOrI()) + else if (!len->IsCnsIntOrI() && !m_UseLocalloc) { - onHeapReason = "[non-constant size]"; + onHeapReason = "[unknown size]"; canStack = false; } - else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, len->AsIntCon()->IconValue(), - &blockSize, &onHeapReason)) + else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, arraySize, &blockSize, + &onHeapReason)) { // reason set by the call canStack = false; } else { - JITDUMP("Allocating V%02u on the stack\n", lclNum); + useLocalloc = !len->IsCnsIntOrI(); + JITDUMP("Allocating V%02u on the stack%s\n", lclNum, + useLocalloc ? " [via localloc]" : " [via block local]"); canStack = true; - const unsigned int stackLclNum = + + if (useLocalloc) + { + MorphNewArrNodeIntoLocAlloc(data->AsCall(), clsHnd, len, block, stmt); + } + else + { MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, (unsigned int)len->AsIntCon()->IconValue(), blockSize, block, stmt); + } - // Note we do not want to rewrite uses of the array temp, so we + // Note we do not want to rewrite uses of lclNum, so we // do not update m_HeapLocalToStackLocalMap. // comp->Metrics.StackAllocatedArrays++; @@ -679,7 +688,11 @@ bool ObjectAllocator::MorphAllocObjNodes() // We keep the set of possibly-stack-pointing pointers as a superset of the set of // definitely-stack-pointing pointers. All definitely-stack-pointing pointers are in both // sets. 
- MarkLclVarAsDefinitelyStackPointing(lclNum); + + if (!useLocalloc) + { + MarkLclVarAsDefinitelyStackPointing(lclNum); + } MarkLclVarAsPossiblyStackPointing(lclNum); // If this was conditionally escaping enumerator, establish a connection between this local @@ -799,18 +812,15 @@ GenTree* ObjectAllocator::MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* alloc // block - a basic block where newArr is // stmt - a statement where newArr is // -// Return Value: -// local num for the new stack allocated local -// // Notes: // This function can insert additional statements before stmt. // -unsigned int ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, - CORINFO_CLASS_HANDLE clsHnd, - unsigned int length, - unsigned int blockSize, - BasicBlock* block, - Statement* stmt) +void ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, + CORINFO_CLASS_HANDLE clsHnd, + unsigned int length, + unsigned int blockSize, + BasicBlock* block, + Statement* stmt) { assert(newArr != nullptr); assert(m_AnalysisDone); @@ -873,8 +883,51 @@ unsigned int ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall* // Note that we have stack allocated arrays in this method // comp->setMethodHasStackAllocatedArray(); +} - return lclNum; +//------------------------------------------------------------------------ +// MorphNewArrNodeIntoLocAlloc: Morph a newarray helper call node into a local frame allocation. +// +// Arguments: +// newArr - GT_CALL that will be replaced by helper call. 
+// clsHnd - class representing the type of the array +// length - operand for length of the array +// block - a basic block where newArr is +// stmt - a statement where newArr is +// +void ObjectAllocator::MorphNewArrNodeIntoLocAlloc( + GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt) +{ + assert(newArr != nullptr); + assert(m_AnalysisDone); + assert(clsHnd != NO_CLASS_HANDLE); + assert(newArr->IsHelperCall()); + assert(newArr->GetHelperNum() != CORINFO_HELP_NEWARR_1_MAYBEFROZEN); + + // Get element size + // + CORINFO_CLASS_HANDLE elemClsHnd = NO_CLASS_HANDLE; + CorInfoType corType = comp->info.compCompHnd->getChildType(clsHnd, &elemClsHnd); + var_types type = JITtype2varType(corType); + ClassLayout* elemLayout = type == TYP_STRUCT ? comp->typGetObjLayout(elemClsHnd) : nullptr; + + const unsigned elemSize = elemLayout != nullptr ? elemLayout->GetSize() : genTypeSize(type); + + // Mark the newarr call as being "on stack", and add the element size + // operand for the stack local as an argument + // + GenTree* const elemSizeNode = comp->gtNewIconNode(elemSize); + newArr->gtArgs.PushBack(comp, NewCallArg::Primitive(elemSizeNode).WellKnown(WellKnownArg::StackArrayElemSize)); + newArr->gtCallMoreFlags |= GTF_CALL_M_STACK_ARRAY; + + // Retype the call result as an unmanaged pointer + // + newArr->ChangeType(TYP_I_IMPL); + newArr->gtReturnType = TYP_I_IMPL; + + // Note that we have stack allocated arrays in this method + // + comp->setMethodHasStackAllocatedArray(); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h index 593e7b7915335c..3175a80f34931a 100644 --- a/src/coreclr/jit/objectalloc.h +++ b/src/coreclr/jit/objectalloc.h @@ -134,6 +134,7 @@ class ObjectAllocator final : public Phase LocalToLocalMap m_HeapLocalToStackLocalMap; BitSetShortLongRep* m_ConnGraphAdjacencyMatrix; unsigned int m_StackAllocMaxSize; + 
bool m_UseLocalloc; // Info for conditionally-escaping locals LocalToLocalMap m_EnumeratorLocalToPseudoLocalMap; @@ -176,12 +177,15 @@ class ObjectAllocator final : public Phase GenTree* MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* allocObj); unsigned int MorphAllocObjNodeIntoStackAlloc( GenTreeAllocObj* allocObj, CORINFO_CLASS_HANDLE clsHnd, bool isValueClass, BasicBlock* block, Statement* stmt); - unsigned int MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, - CORINFO_CLASS_HANDLE clsHnd, - unsigned int length, - unsigned int blockSize, - BasicBlock* block, - Statement* stmt); + void MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, + CORINFO_CLASS_HANDLE clsHnd, + unsigned int length, + unsigned int blockSize, + BasicBlock* block, + Statement* stmt); + void MorphNewArrNodeIntoLocAlloc( + GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt); + struct BuildConnGraphVisitorCallbackData; bool CanLclVarEscapeViaParentStack(ArrayStack* parentStack, unsigned int lclNum, BasicBlock* block); void UpdateAncestorTypes(GenTree* tree, ArrayStack* parentStack, var_types newType); @@ -284,6 +288,7 @@ inline ObjectAllocator::ObjectAllocator(Compiler* comp) m_ConnGraphAdjacencyMatrix = nullptr; m_StackAllocMaxSize = (unsigned)JitConfig.JitObjectStackAllocationSize(); + m_UseLocalloc = JitConfig.JitObjectStackAllocationLocalloc(); } //------------------------------------------------------------------------ @@ -313,7 +318,7 @@ inline void ObjectAllocator::EnableObjectStackAllocation() // lclNum - Local variable number // clsHnd - Class/struct handle of the variable class // allocType - Type of allocation (newobj or newarr) -// length - Length of the array (for newarr) +// length - Length of the array (for newarr), -1 for runtime determined size // blockSize - [out, optional] exact size of the object // reason - [out, required] if result is false, reason why // preliminaryCheck - if true, allow checking before analysis is 
done @@ -353,7 +358,7 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNu return false; } - if ((length < 0) || (length > CORINFO_Array_MaxLength)) + if ((length < -1) || (length > CORINFO_Array_MaxLength)) { *reason = "[invalid array length]"; return false; @@ -370,19 +375,22 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNu return false; } - const unsigned elemSize = elemLayout != nullptr ? elemLayout->GetSize() : genTypeSize(type); + if (length != -1) + { + const unsigned elemSize = elemLayout != nullptr ? elemLayout->GetSize() : genTypeSize(type); - ClrSafeInt totalSize(elemSize); - totalSize *= static_cast(length); - totalSize += static_cast(OFFSETOF__CORINFO_Array__data); + ClrSafeInt totalSize(elemSize); + totalSize *= static_cast(length); + totalSize += static_cast(OFFSETOF__CORINFO_Array__data); - if (totalSize.IsOverflow()) - { - *reason = "[overflow array length]"; - return false; - } + if (totalSize.IsOverflow()) + { + *reason = "[overflow array length]"; + return false; + } - classSize = totalSize.Value(); + classSize = totalSize.Value(); + } } else if (allocType == OAT_NEWOBJ) { From 4d9f1bcd04164f56396cbd6270fcfaf98d503f3e Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 4 Feb 2025 11:51:00 -0800 Subject: [PATCH 02/11] fix flags --- src/coreclr/jit/helperexpansion.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp index c7f4bb3a7f7c54..7af986ec75a1a7 100644 --- a/src/coreclr/jit/helperexpansion.cpp +++ b/src/coreclr/jit/helperexpansion.cpp @@ -2891,6 +2891,11 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); GenTree* const totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize); + + // 
Allocation might fail. Codegen must zero the allocation + // + locallocNode->gtFlags |= (GTF_EXCEPT | GTF_LCLHEAP_MUSTINIT); + GenTree* const locallocStore = gtNewStoreLclVarNode(locallocTemp, locallocNode); Statement* const locallocStmt = fgNewStmtFromTree(locallocStore); @@ -2902,10 +2907,8 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, stackLocalAddress = gtNewLclVarNode(locallocTemp); compLocallocUsed = true; - // Codegen must zero out the new allocation. + // We now require a frame pointer // - locallocNode->gtFlags &= GTF_LCLHEAP_MUSTINIT; - codeGen->setFramePointerRequired(true); } else From e3c40198c763d5b0faf5ba99f99771dc659f704c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 4 Feb 2025 11:51:15 -0800 Subject: [PATCH 03/11] enable by default (for now) --- src/coreclr/jit/jitconfigvalues.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index ba900848a10227..6efa1c70f50884 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -674,7 +674,7 @@ RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStac CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAllocationConditionalEscapeRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528) -RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 0); +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 1); RELEASE_CONFIG_INTEGER(JitEECallTimingInfo, "JitEECallTimingInfo", 0) From 23b42611e80d0d5a7e7947462a4ba58d7a917ae8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 4 Feb 2025 16:16:46 -0800 Subject: [PATCH 04/11] enable for array allocations in loops too --- src/coreclr/jit/helperexpansion.cpp
| 14 +++++++------- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/objectalloc.cpp | 8 ++++++-- src/coreclr/jit/objectalloc.h | 7 ++++++- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp index 7af986ec75a1a7..930b52528a2c66 100644 --- a/src/coreclr/jit/helperexpansion.cpp +++ b/src/coreclr/jit/helperexpansion.cpp @@ -2847,8 +2847,8 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, // // Note we may have figured out the array length after we did the // escape analysis (that is, lengthArg might be a constant), so we - // could change this from a localloc to a fixed alloc, if we - // introduced a new block lcl var. + // could possibly change this from a localloc to a fixed alloc, + // if we could show that was sound. // bool const isLocAlloc = (elemSizeArg != nullptr); @@ -2886,11 +2886,11 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, unsigned const locallocTemp = lvaGrabTemp(true DEBUGARG("localloc stack address")); lvaTable[locallocTemp].lvType = TYP_I_IMPL; - GenTree* const arrayLength = gtCloneExpr(lengthArg); - GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL); - GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); - GenTree* const totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); - GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize); + GenTree* const arrayLength = gtCloneExpr(lengthArg); + GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL); + GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); + GenTree* const totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); + GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize); // Allocation might fail. 
Codegen must zero the allocation // diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 6efa1c70f50884..1fc5d8db2a865e 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -675,6 +675,7 @@ CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAll RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 1); +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationInLoop, "JitObjectStackAllocationInLoop", 1); RELEASE_CONFIG_INTEGER(JitEECallTimingInfo, "JitEECallTimingInfo", 0) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 0e9dd0a5db3f44..a70dbf04d5a4e5 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -543,7 +543,7 @@ bool ObjectAllocator::MorphAllocObjNodes() onHeapReason = "[object stack allocation disabled]"; canStack = false; } - else if (basicBlockHasBackwardJump) + else if (basicBlockHasBackwardJump && !((allocType == OAT_NEWARR) && m_UseLocallocInLoop)) { onHeapReason = "[alloc in loop]"; canStack = false; @@ -602,7 +602,7 @@ bool ObjectAllocator::MorphAllocObjNodes() } else { - useLocalloc = !len->IsCnsIntOrI(); + useLocalloc = !len->IsCnsIntOrI() || basicBlockHasBackwardJump; JITDUMP("Allocating V%02u on the stack%s\n", lclNum, useLocalloc ? 
" [via localloc]" : " [via block local]"); canStack = true; @@ -928,6 +928,10 @@ void ObjectAllocator::MorphNewArrNodeIntoLocAlloc( // Note that we have stack allocated arrays in this method // comp->setMethodHasStackAllocatedArray(); + + // Notify the compiler; this disables fast tail calls (for now) + // + comp->compLocallocUsed = true; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h index 3175a80f34931a..841f29391fd4c6 100644 --- a/src/coreclr/jit/objectalloc.h +++ b/src/coreclr/jit/objectalloc.h @@ -135,6 +135,7 @@ class ObjectAllocator final : public Phase BitSetShortLongRep* m_ConnGraphAdjacencyMatrix; unsigned int m_StackAllocMaxSize; bool m_UseLocalloc; + bool m_UseLocallocInLoop; // Info for conditionally-escaping locals LocalToLocalMap m_EnumeratorLocalToPseudoLocalMap; @@ -288,7 +289,11 @@ inline ObjectAllocator::ObjectAllocator(Compiler* comp) m_ConnGraphAdjacencyMatrix = nullptr; m_StackAllocMaxSize = (unsigned)JitConfig.JitObjectStackAllocationSize(); - m_UseLocalloc = JitConfig.JitObjectStackAllocationLocalloc(); + + // OSR does not support localloc (though seems like late-introduced localloc might be ok) + // + m_UseLocalloc = JitConfig.JitObjectStackAllocationLocalloc() && !comp->opts.IsOSR(); + m_UseLocallocInLoop = m_UseLocalloc && JitConfig.JitObjectStackAllocationInLoop(); } //------------------------------------------------------------------------ From d096d6312d463a0a9d9b69b0383167a87945fe79 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 4 Feb 2025 17:48:15 -0800 Subject: [PATCH 05/11] add missing bit of code --- src/coreclr/jit/morph.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 4c0fc233196482..720394e437c422 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -736,6 +736,8 @@ const char* getWellKnownArgName(WellKnownArg arg) return 
"X86TailCallSpecialArg"; case WellKnownArg::StackArrayLocal: return "StackArrayLocal"; + case WellKnownArg::StackArrayElemSize: + return "StackArrayElemSize"; } return "N/A"; From 032f7242877090b862622cfd78bb86c72bf2283e Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Wed, 5 Feb 2025 13:10:02 -0800 Subject: [PATCH 06/11] add handler check; fix elem size type --- src/coreclr/jit/objectalloc.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index a70dbf04d5a4e5..0cb6398d6b0a3b 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -483,6 +483,7 @@ bool ObjectAllocator::MorphAllocObjNodes() const bool basicBlockHasNewObj = block->HasFlag(BBF_HAS_NEWOBJ); const bool basicBlockHasNewArr = block->HasFlag(BBF_HAS_NEWARR); const bool basicBlockHasBackwardJump = block->HasFlag(BBF_BACKWARD_JUMP); + const bool basicBlockInHandler = block->hasHndIndex(); if (!basicBlockHasNewObj && !basicBlockHasNewArr) { @@ -594,6 +595,11 @@ bool ObjectAllocator::MorphAllocObjNodes() onHeapReason = "[unknown size]"; canStack = false; } + else if (!len->IsCnsIntOrI() && basicBlockInHandler) + { + onHeapReason = "[unknown size, in handler]"; + canStack = false; + } else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, arraySize, &blockSize, &onHeapReason)) { @@ -916,7 +922,7 @@ void ObjectAllocator::MorphNewArrNodeIntoLocAlloc( // Mark the newarr call as being "on stack", and add the element size // operand for the stack local as an argument // - GenTree* const elemSizeNode = comp->gtNewIconNode(elemSize); + GenTree* const elemSizeNode = comp->gtNewIconNode(elemSize, TYP_I_IMPL); newArr->gtArgs.PushBack(comp, NewCallArg::Primitive(elemSizeNode).WellKnown(WellKnownArg::StackArrayElemSize)); newArr->gtCallMoreFlags |= GTF_CALL_M_STACK_ARRAY; From 1e088c4f657c90b8ff7ee0dc840fd9346595299a Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Wed, 5 Feb 2025 19:06:20 -0800 
Subject: [PATCH 07/11] fix zero init logic --- src/coreclr/jit/helperexpansion.cpp | 2 +- src/coreclr/jit/lsraarm.cpp | 2 +- src/coreclr/jit/lsraarm64.cpp | 4 ++-- src/coreclr/jit/lsraloongarch64.cpp | 4 ++-- src/coreclr/jit/lsrariscv64.cpp | 4 ++-- src/coreclr/jit/lsraxarch.cpp | 5 +++-- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp index 930b52528a2c66..57ab0425a603fb 100644 --- a/src/coreclr/jit/helperexpansion.cpp +++ b/src/coreclr/jit/helperexpansion.cpp @@ -2894,7 +2894,7 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, // Allocation might fail. Codegen must zero the allocation // - locallocNode->gtFlags &= (GTF_EXCEPT | GTF_LCLHEAP_MUSTINIT); + locallocNode->gtFlags |= (GTF_EXCEPT | GTF_LCLHEAP_MUSTINIT); GenTree* const locallocStore = gtNewStoreLclVarNode(locallocTemp, locallocNode); Statement* const locallocStmt = fgNewStmtFromTree(locallocStore); diff --git a/src/coreclr/jit/lsraarm.cpp b/src/coreclr/jit/lsraarm.cpp index 815f0149aede11..efa7d1e3a82ed6 100644 --- a/src/coreclr/jit/lsraarm.cpp +++ b/src/coreclr/jit/lsraarm.cpp @@ -68,7 +68,7 @@ int LinearScan::BuildLclHeap(GenTree* tree) { internalIntCount = 0; } - else if (!compiler->info.compInitMem) + else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { // No need to initialize allocated stack space. if (sizeVal < compiler->eeGetPageSize()) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 9af6bef2f17f19..c8f1706e1f4ca0 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1183,7 +1183,7 @@ int LinearScan::BuildNode(GenTree* tree) { // Need no internal registers } - else if (!compiler->info.compInitMem) + else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { // No need to initialize allocated stack space. 
if (sizeVal < compiler->eeGetPageSize()) @@ -1202,7 +1202,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!compiler->info.compInitMem) + if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsraloongarch64.cpp b/src/coreclr/jit/lsraloongarch64.cpp index 529e6d8127b670..d0064dc3a9d88d 100644 --- a/src/coreclr/jit/lsraloongarch64.cpp +++ b/src/coreclr/jit/lsraloongarch64.cpp @@ -441,7 +441,7 @@ int LinearScan::BuildNode(GenTree* tree) { // Need no internal registers } - else if (!compiler->info.compInitMem) + else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { // No need to initialize allocated stack space. if (sizeVal < compiler->eeGetPageSize()) @@ -460,7 +460,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!compiler->info.compInitMem) + if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsrariscv64.cpp b/src/coreclr/jit/lsrariscv64.cpp index 1185eac4cea938..ae6b5c76d6ebaa 100644 --- a/src/coreclr/jit/lsrariscv64.cpp +++ b/src/coreclr/jit/lsrariscv64.cpp @@ -560,7 +560,7 @@ int LinearScan::BuildNode(GenTree* tree) { // Need no internal registers } - else if (!compiler->info.compInitMem) + else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { // No need to initialize allocated stack space. 
if (sizeVal < compiler->eeGetPageSize()) @@ -581,7 +581,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!compiler->info.compInitMem) + if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 98129f9016cc10..eeda2b5fd2b2fd 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1812,14 +1812,15 @@ int LinearScan::BuildLclHeap(GenTree* tree) size_t sizeVal = AlignUp((size_t)size->AsIntCon()->gtIconVal, STACK_ALIGN); // Explicitly zeroed LCLHEAP also needs a regCnt in case of x86 or large page - if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize())) + if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize()) || + (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)) { buildInternalIntRegisterDefForNode(tree); } } else { - if (!compiler->info.compInitMem) + if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))) { // For regCnt buildInternalIntRegisterDefForNode(tree); From 526ba2fb3dc8b8b02d661aac1c1b74a550bfda41 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 20 Feb 2025 17:34:10 -0800 Subject: [PATCH 08/11] update post merge, cleanup a bit --- src/coreclr/jit/objectalloc.cpp | 28 +++++++++------------------- src/coreclr/jit/objectalloc.h | 15 +++++++-------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index ee93645c3e7c67..80ac6874cf8a14 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -619,9 +619,7 @@ bool ObjectAllocator::MorphAllocObjNodes() } else { - MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, - (unsigned int)len->AsIntCon()->IconValue(), blockSize, - block, stmt); + MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, len, block, stmt); } // Note 
we do not want to rewrite uses of lclNum, so we @@ -813,37 +811,29 @@ GenTree* ObjectAllocator::MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* alloc // Arguments: // newArr - GT_CALL that will be replaced by helper call. // clsHnd - class representing the type of the array -// length - length of the array -// blockSize - size of the layout +// len - tree representing length of the array (must be a constant) // block - a basic block where newArr is // stmt - a statement where newArr is // // Notes: // This function can insert additional statements before stmt. // -void ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, - CORINFO_CLASS_HANDLE clsHnd, - unsigned int length, - unsigned int blockSize, - BasicBlock* block, - Statement* stmt) +void ObjectAllocator::MorphNewArrNodeIntoStackAlloc( + GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* len, BasicBlock* block, Statement* stmt) { assert(newArr != nullptr); assert(m_AnalysisDone); assert(clsHnd != NO_CLASS_HANDLE); assert(newArr->IsHelperCall()); assert(newArr->GetHelperNum() != CORINFO_HELP_NEWARR_1_MAYBEFROZEN); + assert(len->IsCnsIntOrI()); + const unsigned length = (unsigned int)len->AsIntCon()->IconValue(); const bool shortLifetime = false; const bool alignTo8 = newArr->GetHelperNum() == CORINFO_HELP_NEWARR_1_ALIGN8; const unsigned int lclNum = comp->lvaGrabTemp(shortLifetime DEBUGARG("stack allocated array temp")); LclVarDsc* const lclDsc = comp->lvaGetDesc(lclNum); - if (alignTo8) - { - blockSize = AlignUp(blockSize, 8); - } - comp->lvaSetStruct(lclNum, comp->typGetArrayLayout(clsHnd, length), /* unsafe */ false); lclDsc->lvStackAllocatedObject = true; @@ -926,10 +916,10 @@ void ObjectAllocator::MorphNewArrNodeIntoLocAlloc( newArr->gtArgs.PushBack(comp, NewCallArg::Primitive(elemSizeNode).WellKnown(WellKnownArg::StackArrayElemSize)); newArr->gtCallMoreFlags |= GTF_CALL_M_STACK_ARRAY; - // Retype the call result as an unmanaged pointer + // Retype the call result as a byref (we 
may decide to heap allocate at runtime). // - newArr->ChangeType(TYP_I_IMPL); - newArr->gtReturnType = TYP_I_IMPL; + newArr->ChangeType(TYP_BYREF); + newArr->gtReturnType = TYP_BYREF; // Note that we have stack allocated arrays in this method // diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h index 2aeb9d4df592dc..6190f86b26e662 100644 --- a/src/coreclr/jit/objectalloc.h +++ b/src/coreclr/jit/objectalloc.h @@ -178,12 +178,8 @@ class ObjectAllocator final : public Phase GenTree* MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* allocObj); unsigned int MorphAllocObjNodeIntoStackAlloc( GenTreeAllocObj* allocObj, CORINFO_CLASS_HANDLE clsHnd, bool isValueClass, BasicBlock* block, Statement* stmt); - void MorphNewArrNodeIntoStackAlloc(GenTreeCall* newArr, - CORINFO_CLASS_HANDLE clsHnd, - unsigned int length, - unsigned int blockSize, - BasicBlock* block, - Statement* stmt); + void MorphNewArrNodeIntoStackAlloc( + GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt); void MorphNewArrNodeIntoLocAlloc( GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt); @@ -369,8 +365,11 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNu return false; } - ClassLayout* const layout = comp->typGetArrayLayout(clsHnd, (unsigned)length); - classSize = layout->GetSize(); + if (length != -1) + { + ClassLayout* const layout = comp->typGetArrayLayout(clsHnd, (unsigned)length); + classSize = layout->GetSize(); + } } else if (allocType == OAT_NEWOBJ) { From 9e7cf8ee2e807a7e8b02dfe19cd4ff043d855773 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 20 Feb 2025 17:34:46 -0800 Subject: [PATCH 09/11] pad localloc array size as if it was on heap; handle align8 --- src/coreclr/jit/helperexpansion.cpp | 52 ++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/helperexpansion.cpp 
b/src/coreclr/jit/helperexpansion.cpp index 5046fc3dc98a42..dbfc7c0e632339 100644 --- a/src/coreclr/jit/helperexpansion.cpp +++ b/src/coreclr/jit/helperexpansion.cpp @@ -2857,6 +2857,7 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, // if we could show that was sound. // bool const isLocAlloc = (elemSizeArg != nullptr); + bool const isAlign8 = isLocAlloc && (helper == CORINFO_HELP_NEWARR_1_ALIGN8); JITDUMP("Expanding new array helper for stack allocated array at [%06d] %sin " FMT_BB ":\n", dspTreeID(call), isLocAlloc ? " into localloc " : "", block->bbNum); @@ -2892,10 +2893,38 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, unsigned const locallocTemp = lvaGrabTemp(true DEBUGARG("localloc stack address")); lvaTable[locallocTemp].lvType = TYP_I_IMPL; - GenTree* const arrayLength = gtCloneExpr(lengthArg); - GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL); - GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); - GenTree* const totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); + GenTree* const arrayLength = gtCloneExpr(lengthArg); + GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL); + GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength); + GenTree* totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize); + + unsigned const elemSizeValue = (unsigned)elemSize->AsIntCon()->IconValue(); + + if ((elemSizeValue % TARGET_POINTER_SIZE) != 0) + { + // Round size up to TARGET_POINTER_SIZE. 
+ // size = (size + TPS) & ~(TPS-1) + // + GenTree* const roundSize = gtNewIconNode(TARGET_POINTER_SIZE, TYP_I_IMPL); + GenTree* const biasedSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, roundSize); + GenTree* const mask = gtNewIconNode(TARGET_POINTER_SIZE - 1, TYP_I_IMPL); + GenTree* const invMask = gtNewOperNode(GT_NOT, TYP_I_IMPL, mask); + GenTree* const paddedSize = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedSize, invMask); + + totalSize = paddedSize; + } + +#ifndef TARGET_64BIT + if (isAlign8) + { + // For Align8, allocate an extra TARGET_POINTER_SIZED (4) bytes so + // we can fix alignment below. + // + GenTree* const alignSize = gtNewIconNode(4, TYP_I_IMPL); + totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, alignSize); + } +#endif + GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize); // Allocation might fail. Codegen must zero the allocation @@ -2913,6 +2942,21 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, stackLocalAddress = gtNewLclVarNode(locallocTemp); compLocallocUsed = true; +#ifndef TARGET_64BIT + if (isAlign8) + { + // For Align8, adjust address to be suitably aligned. 
+ // Addr = (Localloc + 4) & ~7; + // + GenTree* const alignSize = gtNewIconNode(4, TYP_I_IMPL); + GenTree* const biasedAddress = gtNewOperNode(GT_ADD, TYP_I_IMPL, stackLocalAddress, alignSize); + GenTree* const alignMaskInv = gtNewIconNode(-8, TYP_I_IMPL); + GenTree* const alignedAddress = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedAddress, alignMaskInv); + + stackLocalAddress = alignedAddress; + } +#endif + // We now require a frame pointer // codeGen->setFramePointerRequired(true); From e24712353a7444dc741a64d784880e9d51e58c94 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 22 Feb 2025 08:41:28 -0800 Subject: [PATCH 10/11] don't allow localloc for gc type arrays --- src/coreclr/jit/jitmetadatalist.h | 1 + src/coreclr/jit/objectalloc.cpp | 20 ++++++++++---------- src/coreclr/jit/objectalloc.h | 19 ++++++++++++++----- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 215ea82ead5606..6c03726f968529 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -89,6 +89,7 @@ JITMETADATAMETRIC(NewBoxedValueClassHelperCalls, int, 0) JITMETADATAMETRIC(StackAllocatedBoxedValueClasses, int, 0) JITMETADATAMETRIC(NewArrayHelperCalls, int, 0) JITMETADATAMETRIC(StackAllocatedArrays, int, 0) +JITMETADATAMETRIC(LocallocAllocatedArrays, int, 0) JITMETADATAMETRIC(LocalAssertionCount, int, 0) JITMETADATAMETRIC(LocalAssertionOverflow, int, 0) JITMETADATAMETRIC(MorphTrackedLocals, int, 0) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 80ac6874cf8a14..6c973fe1ce37a8 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -581,7 +581,7 @@ bool ObjectAllocator::MorphAllocObjNodes() GenTree* const len = data->AsCall()->gtArgs.GetUserArgByIndex(1)->GetNode(); assert(len != nullptr); - ssize_t arraySize = len->IsCnsIntOrI() ? len->AsIntCon()->IconValue() : -1; + ssize_t arraySize = len->IsCnsIntOrI() ? 
len->AsIntCon()->IconValue() : 1; unsigned int blockSize = 0; comp->Metrics.NewArrayHelperCalls++; @@ -600,8 +600,8 @@ bool ObjectAllocator::MorphAllocObjNodes() onHeapReason = "[unknown size, in handler]"; canStack = false; } - else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, arraySize, &blockSize, - &onHeapReason)) + else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, arraySize, len->IsCnsIntOrI(), + &blockSize, &onHeapReason)) { // reason set by the call canStack = false; @@ -616,16 +616,13 @@ bool ObjectAllocator::MorphAllocObjNodes() if (useLocalloc) { MorphNewArrNodeIntoLocAlloc(data->AsCall(), clsHnd, len, block, stmt); + comp->Metrics.LocallocAllocatedArrays++; } else { MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, len, block, stmt); + comp->Metrics.StackAllocatedArrays++; } - - // Note we do not want to rewrite uses of lclNum, so we - // do not update m_HeapLocalToStackLocalMap. - // - comp->Metrics.StackAllocatedArrays++; } } else if (allocType == OAT_NEWOBJ) @@ -653,7 +650,8 @@ bool ObjectAllocator::MorphAllocObjNodes() comp->Metrics.NewRefClassHelperCalls++; } - if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, 0, nullptr, &onHeapReason)) + if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, /* length */ 0, /* lengthKnown */ true, + nullptr, &onHeapReason)) { // reason set by the call canStack = false; @@ -1333,6 +1331,7 @@ void ObjectAllocator::UpdateAncestorTypes(GenTree* tree, ArrayStack* p case GT_IND: case GT_CALL: + // Watch for helper calls that have retyped operands...? 
break; default: @@ -2026,7 +2025,8 @@ void ObjectAllocator::CheckForGuardedAllocationOrCopy(BasicBlock* block, const char* reason = nullptr; unsigned size = 0; unsigned length = TARGET_POINTER_SIZE; - if (CanAllocateLclVarOnStack(enumeratorLocal, clsHnd, OAT_NEWOBJ, length, &size, &reason, + if (CanAllocateLclVarOnStack(enumeratorLocal, clsHnd, OAT_NEWOBJ, length, /* length known */ true, + &size, &reason, /* preliminaryCheck */ true)) { // We are going to conditionally track accesses to the enumerator local via a pseudo local. diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h index 6190f86b26e662..beef00276d9418 100644 --- a/src/coreclr/jit/objectalloc.h +++ b/src/coreclr/jit/objectalloc.h @@ -154,6 +154,7 @@ class ObjectAllocator final : public Phase CORINFO_CLASS_HANDLE clsHnd, ObjectAllocationType allocType, ssize_t length, + bool lengthKnown, unsigned int* blockSize, const char** reason, bool preliminaryCheck = false); @@ -319,7 +320,8 @@ inline void ObjectAllocator::EnableObjectStackAllocation() // lclNum - Local variable number // clsHnd - Class/struct handle of the variable class // allocType - Type of allocation (newobj or newarr) -// length - Length of the array (for newarr), -1 for runtime determined size +// length - Length of the array (for newarr), 1 for runtime determined size +// lengthKnown - true if length is known // blockSize - [out, optional] exact size of the object // reason - [out, required] if result is false, reason why // preliminaryCheck - if true, allow checking before analysis is done @@ -332,6 +334,7 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNu CORINFO_CLASS_HANDLE clsHnd, ObjectAllocationType allocType, ssize_t length, + bool lengthKnown, unsigned int* blockSize, const char** reason, bool preliminaryCheck) @@ -359,16 +362,22 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNu return false; } - if ((length < -1) || (length > CORINFO_Array_MaxLength)) + if 
((length < 0) || (length > CORINFO_Array_MaxLength)) { *reason = "[invalid array length]"; return false; } - if (length != -1) + ClassLayout* const layout = comp->typGetArrayLayout(clsHnd, (unsigned)length); + classSize = layout->GetSize(); + + if (!lengthKnown && layout->HasGCPtr()) { - ClassLayout* const layout = comp->typGetArrayLayout(clsHnd, (unsigned)length); - classSize = layout->GetSize(); + // We can't represent GC info for these yet + // + assert(length == 1); + *reason = "[unknown length, gc elements]"; + return false; } } else if (allocType == OAT_NEWOBJ) From 5540b26d1ecc2ba6c9e94534981ac7bad48f3548 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 25 Feb 2025 14:44:42 -0800 Subject: [PATCH 11/11] temp fix for linux x64 issue with misaligned frame --- src/coreclr/jit/codegen.h | 1 + src/coreclr/jit/codegenxarch.cpp | 1 + src/coreclr/jit/objectalloc.cpp | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index b26c93534b2f9d..01c2a96608842a 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -125,6 +125,7 @@ class CodeGen final : public CodeGenInterface //------------------------------------------------------------------------- + bool genLocallocUsed; // true if we have used localloc in the method bool genUseBlockInit; // true if we plan to block-initialize the local stack frame unsigned genInitStkLclCnt; // The count of local variables that we need to zero init diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 487052f1d976d9..3f5ad78613a24c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2850,6 +2850,7 @@ void CodeGen::genLclHeap(GenTree* tree) { assert(tree->OperGet() == GT_LCLHEAP); assert(compiler->compLocallocUsed); + genLocallocUsed = true; GenTree* size = tree->AsOp()->gtOp1; noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL)); diff 
--git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 6c973fe1ce37a8..3b00fca94566d7 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -926,6 +926,13 @@ void ObjectAllocator::MorphNewArrNodeIntoLocAlloc( // Notify the compiler; this disables fast tail calls (for now) // comp->compLocallocUsed = true; + +#ifdef UNIX_AMD64_ABI + // Ensure we don't end up with misaligned frames, + // if we manage to dead code this newarr. + // + comp->opts.compNeedToAlignFrame = true; +#endif } //------------------------------------------------------------------------