diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index b26c93534b2f9d..01c2a96608842a 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -125,6 +125,7 @@ class CodeGen final : public CodeGenInterface
     //-------------------------------------------------------------------------
 
+    bool     genLocallocUsed;   // true if we have used localloc in the method
     bool     genUseBlockInit;   // true if we plan to block-initialize the local stack frame
     unsigned genInitStkLclCnt;  // The count of local variables that we need to zero init
 
diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp
index 26975c0130ab9c..9b13287c2573d9 100644
--- a/src/coreclr/jit/codegenarm.cpp
+++ b/src/coreclr/jit/codegenarm.cpp
@@ -385,6 +385,8 @@ void CodeGen::genLclHeap(GenTree* tree)
     GenTree* size = tree->AsOp()->gtOp1;
     noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
 
+    bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT);
+
     // Result of localloc will be returned in regCnt.
     // Also it used as temporary register in code generation
     // for storing allocation size
@@ -470,7 +472,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
             goto ALLOC_DONE;
         }
-        else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
+        else if (!initMem && (amount < compiler->eeGetPageSize())) // must be < not <=
         {
             // Since the size is less than a page, simply adjust the SP value.
             // The SP might already be in the guard page, must touch it BEFORE
@@ -494,7 +496,7 @@ void CodeGen::genLclHeap(GenTree* tree)
     }
 
     // Allocation
-    if (compiler->info.compInitMem)
+    if (initMem)
     {
         // At this point 'regCnt' is set to the total number of bytes to localloc.
         // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index ec01f356e194ed..d1de02600551b0 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3155,6 +3155,8 @@ void CodeGen::genLclHeap(GenTree* tree)
     noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
     noway_assert(genStackLevel == 0);   // Can't have anything on the stack
 
+    bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT);
+
     // compute the amount of memory to allocate to properly STACK_ALIGN.
     size_t amount = 0;
     if (size->IsCnsIntOrI())
@@ -3184,7 +3186,7 @@ void CodeGen::genLclHeap(GenTree* tree)
         // Compute the size of the block to allocate and perform alignment.
         // If compInitMem=true, we can reuse targetReg as regcnt,
         // since we don't need any internal registers.
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             assert(internalRegisters.Count(tree) == 0);
             regCnt = targetReg;
@@ -3232,7 +3234,7 @@ void CodeGen::genLclHeap(GenTree* tree)
         static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
         assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
 
-        if (compiler->info.compInitMem)
+        if (initMem)
        {
             if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
             {
@@ -3303,10 +3305,10 @@ void CodeGen::genLclHeap(GenTree* tree)
         }
         // else, "mov regCnt, amount"
 
-        // If compInitMem=true, we can reuse targetReg as regcnt.
+        // If initMem=true, we can reuse targetReg as regcnt.
         // Since size is a constant, regCnt is not yet initialized.
         assert(regCnt == REG_NA);
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             assert(internalRegisters.Count(tree) == 0);
             regCnt = targetReg;
@@ -3318,7 +3320,7 @@ void CodeGen::genLclHeap(GenTree* tree)
             instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
         }
 
-    if (compiler->info.compInitMem)
+    if (initMem)
     {
         BasicBlock* loop = genCreateTempLabel();
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index ceb64e2fc355cb..ae101ee35bbb0a 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -1600,6 +1600,8 @@ void CodeGen::genLclHeap(GenTree* tree)
     noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
     noway_assert(genStackLevel == 0);   // Can't have anything on the stack
 
+    bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT);
+
     // compute the amount of memory to allocate to properly STACK_ALIGN.
     size_t amount = 0;
     if (size->IsCnsIntOrI())
@@ -1626,9 +1628,9 @@ void CodeGen::genLclHeap(GenTree* tree)
         emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0);
 
         // Compute the size of the block to allocate and perform alignment.
-        // If compInitMem=true, we can reuse targetReg as regcnt,
+        // If initMem=true, we can reuse targetReg as regcnt,
         // since we don't need any internal registers.
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             assert(internalRegisters.Count(tree) == 0);
             regCnt = targetReg;
@@ -1680,7 +1682,7 @@ void CodeGen::genLclHeap(GenTree* tree)
         static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
         assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
         size_t stpCount = amount / (REGSIZE_BYTES * 2);
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             if (stpCount <= 4)
             {
@@ -1727,10 +1729,10 @@ void CodeGen::genLclHeap(GenTree* tree)
         }
         // else, "mov regCnt, amount"
 
-        // If compInitMem=true, we can reuse targetReg as regcnt.
+        // If initMem=true, we can reuse targetReg as regcnt.
         // Since size is a constant, regCnt is not yet initialized.
         assert(regCnt == REG_NA);
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             assert(internalRegisters.Count(tree) == 0);
             regCnt = targetReg;
@@ -1742,7 +1744,7 @@ void CodeGen::genLclHeap(GenTree* tree)
             instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
         }
 
-    if (compiler->info.compInitMem)
+    if (initMem)
     {
         // At this point 'regCnt' is set to the total number of bytes to locAlloc.
         // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid
diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp
index 9a4736eef737ed..3337eca00e9f56 100644
--- a/src/coreclr/jit/codegenriscv64.cpp
+++ b/src/coreclr/jit/codegenriscv64.cpp
@@ -1523,6 +1523,7 @@ void CodeGen::genLclHeap(GenTree* tree)
     noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes
     noway_assert(genStackLevel == 0);   // Can't have anything on the stack
 
+    bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT);
     const target_size_t pageSize = compiler->eeGetPageSize(); // According to RISC-V Privileged ISA page size is 4KiB
 
@@ -1554,9 +1555,9 @@ void CodeGen::genLclHeap(GenTree* tree)
         emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0);
 
         // Compute the size of the block to allocate and perform alignment.
-        // If compInitMem=true, we can reuse targetReg as regcnt,
+        // If initMem=true, we can reuse targetReg as regcnt,
         // since we don't need any internal registers.
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             regCnt = targetReg;
         }
@@ -1607,7 +1608,7 @@ void CodeGen::genLclHeap(GenTree* tree)
         static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
         assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
         size_t stpCount = amount / (REGSIZE_BYTES * 2);
-        if (compiler->info.compInitMem)
+        if (initMem)
        {
             if (stpCount <= 4)
             {
@@ -1656,10 +1657,10 @@ void CodeGen::genLclHeap(GenTree* tree)
         }
         // else, "mov regCnt, amount"
 
-        // If compInitMem=true, we can reuse targetReg as regcnt.
+        // If initMem=true, we can reuse targetReg as regcnt.
         // Since size is a constant, regCnt is not yet initialized.
         assert(regCnt == REG_NA);
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             regCnt = targetReg;
         }
@@ -1670,7 +1671,7 @@ void CodeGen::genLclHeap(GenTree* tree)
             instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
         }
 
-    if (compiler->info.compInitMem)
+    if (initMem)
     {
         // At this point 'regCnt' is set to the total number of bytes to locAlloc.
         // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 1e43cc1e84952e..3f5ad78613a24c 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2850,6 +2850,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 {
     assert(tree->OperGet() == GT_LCLHEAP);
     assert(compiler->compLocallocUsed);
+    genLocallocUsed = true;
 
     GenTree* size = tree->AsOp()->gtOp1;
     noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
@@ -2871,6 +2872,8 @@ void CodeGen::genLclHeap(GenTree* tree)
     target_size_t stackAdjustment     = 0;
     target_size_t locAllocStackOffset = 0;
 
+    bool const initMem = compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT);
+
     // compute the amount of memory to allocate to properly STACK_ALIGN.
     size_t amount = 0;
     if (size->IsCnsIntOrI() && size->isContained())
@@ -2894,7 +2897,7 @@ void CodeGen::genLclHeap(GenTree* tree)
         // Compute the size of the block to allocate and perform alignment.
         // If compInitMem=true, we can reuse targetReg as regcnt,
         // since we don't need any internal registers.
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             assert(internalRegisters.Count(tree) == 0);
             regCnt = targetReg;
@@ -2919,7 +2922,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
         inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type));
 
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             // Convert the count from a count of bytes to a loop count. We will loop once per
             // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes
@@ -2940,7 +2943,7 @@ void CodeGen::genLclHeap(GenTree* tree)
     }
 
     bool initMemOrLargeAlloc; // Declaration must be separate from initialization to avoid clang compiler error.
-    initMemOrLargeAlloc = compiler->info.compInitMem || (amount >= compiler->eeGetPageSize()); // must be >= not >
+    initMemOrLargeAlloc = initMem || (amount >= compiler->eeGetPageSize()); // must be >= not >
 
 #if FEATURE_FIXED_OUT_ARGS
     // If we have an outgoing arg area then we must adjust the SP by popping off the
@@ -3014,7 +3017,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
         // We should not have any temp registers at this point.
         assert(internalRegisters.Count(tree) == 0);
 
-        if (compiler->info.compInitMem)
+        if (initMem)
         {
             // At this point 'regCnt' is set to the number of loop iterations for this loop, if each
             // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes.
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 95087e29be2304..f9b11468d40ded 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -13093,6 +13093,9 @@ const char* Compiler::gtGetWellKnownArgNameForArgMsg(WellKnownArg arg)
             return "tail call";
         case WellKnownArg::StackArrayLocal:
             return "&lcl arr";
+        case WellKnownArg::StackArrayElemSize:
+            return "arr elemsz";
+
         default:
             return nullptr;
     }
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index e01d9705eb0d4a..8af0c8fc2ba5e4 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -550,6 +550,8 @@ enum GenTreeFlags : unsigned int
 
     GTF_ALLOCOBJ_EMPTY_STATIC = 0x80000000, // GT_ALLOCOBJ -- allocation site is part of an empty static pattern
 
+    GTF_LCLHEAP_MUSTINIT = 0x80000000, // GT_LCLHEAP -- allocation must be zeroed
+
 #ifdef FEATURE_HW_INTRINSICS
     GTF_HW_EM_OP     = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask
     GTF_HW_USER_CALL = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call
@@ -4564,6 +4566,7 @@ enum class WellKnownArg : unsigned
     SwiftSelf,
     X86TailCallSpecialArg,
     StackArrayLocal,
+    StackArrayElemSize,
 };
 
 #ifdef DEBUG
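Note: every backend above open-codes the same predicate. As a sketch only — lclHeapMustInit is a hypothetical helper, not part of this patch — the shared test is:

    // Hypothetical helper: a GT_LCLHEAP must be zero-initialized either when the
    // whole method is compInitMem, or when this particular allocation carries the
    // new GTF_LCLHEAP_MUSTINIT flag.
    bool lclHeapMustInit(Compiler* compiler, GenTree* tree)
    {
        assert(tree->OperIs(GT_LCLHEAP));
        return compiler->info.compInitMem || ((tree->gtFlags & GTF_LCLHEAP_MUSTINIT) != 0);
    }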
diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp
index 6ecbc71cedf292..dbfc7c0e632339 100644
--- a/src/coreclr/jit/helperexpansion.cpp
+++ b/src/coreclr/jit/helperexpansion.cpp
@@ -2838,17 +2838,29 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt,
         return false;
     }
 
-    // If this is a local array, the new helper will have an arg for the array's address
+    // If this is a local array, the new helper will have an arg for the array's address or an arg
+    // for the array element size
     //
     CallArg* const stackLocalAddressArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayLocal);
+    CallArg* const elemSizeArg          = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayElemSize);
 
-    if (stackLocalAddressArg == nullptr)
+    if ((stackLocalAddressArg == nullptr) && (elemSizeArg == nullptr))
     {
         return false;
     }
 
-    JITDUMP("Expanding new array helper for stack allocated array at [%06d] in " FMT_BB ":\n", dspTreeID(call),
-            block->bbNum);
+    // If we have an elem size arg, this is intended to be a localloc
+    //
+    // Note we may have figured out the array length after we did the
+    // escape analysis (that is, lengthArg might be a constant), so we
+    // could possibly change this from a localloc to a fixed alloc,
+    // if we could show that was sound.
+    //
+    bool const isLocAlloc = (elemSizeArg != nullptr);
+    bool const isAlign8   = isLocAlloc && (helper == CORINFO_HELP_NEWARR_1_ALIGN8);
+
+    JITDUMP("Expanding new array helper for stack allocated array at [%06d] %sin " FMT_BB ":\n", dspTreeID(call),
+            isLocAlloc ? "into localloc " : "", block->bbNum);
     DISPTREE(call);
     JITDUMP("\n");
@@ -2865,7 +2877,99 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt,
         }
     }
 
-    GenTree* const stackLocalAddress = stackLocalAddressArg->GetNode();
+    GenTree* const lengthArg         = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode();
+    GenTree*       stackLocalAddress = nullptr;
+
+    // TODO -- clone and leave option to make a helper call under some runtime check
+    // for sufficient stack.
+    //
+    if (isLocAlloc)
+    {
+        assert(elemSizeArg != nullptr);
+        assert(stackLocalAddressArg == nullptr);
+        GenTree* const elemSize = elemSizeArg->GetNode();
+        assert(elemSize->IsCnsIntOrI());
+
+        unsigned const locallocTemp   = lvaGrabTemp(true DEBUGARG("localloc stack address"));
+        lvaTable[locallocTemp].lvType = TYP_I_IMPL;
+
+        GenTree* const arrayLength = gtCloneExpr(lengthArg);
+        GenTree* const baseSize    = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL);
+        GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength);
+        GenTree*       totalSize   = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize);
+
+        unsigned const elemSizeValue = (unsigned)elemSize->AsIntCon()->IconValue();
+
+        if ((elemSizeValue % TARGET_POINTER_SIZE) != 0)
+        {
+            // Round size up to TARGET_POINTER_SIZE.
+            // size = (size + TPS) & ~(TPS-1)
+            //
+            GenTree* const roundSize  = gtNewIconNode(TARGET_POINTER_SIZE, TYP_I_IMPL);
+            GenTree* const biasedSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, roundSize);
+            GenTree* const mask       = gtNewIconNode(TARGET_POINTER_SIZE - 1, TYP_I_IMPL);
+            GenTree* const invMask    = gtNewOperNode(GT_NOT, TYP_I_IMPL, mask);
+            GenTree* const paddedSize = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedSize, invMask);
+
+            totalSize = paddedSize;
+        }
+
+#ifndef TARGET_64BIT
+        if (isAlign8)
+        {
+            // For Align8, allocate an extra TARGET_POINTER_SIZE (4) bytes so
+            // we can fix alignment below.
+            //
+            GenTree* const alignSize = gtNewIconNode(4, TYP_I_IMPL);
+            totalSize                = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, alignSize);
+        }
+#endif
+
+        GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSize);
+
+        // Allocation might fail. Codegen must zero the allocation
+        //
+        locallocNode->gtFlags |= (GTF_EXCEPT | GTF_LCLHEAP_MUSTINIT);
+
+        GenTree* const   locallocStore = gtNewStoreLclVarNode(locallocTemp, locallocNode);
+        Statement* const locallocStmt  = fgNewStmtFromTree(locallocStore);
+
+        gtUpdateStmtSideEffects(locallocStmt);
+        fgInsertStmtBefore(block, stmt, locallocStmt);
+
+        // Array address is the result of the localloc
+        //
+        stackLocalAddress = gtNewLclVarNode(locallocTemp);
+        compLocallocUsed  = true;
+
+#ifndef TARGET_64BIT
+        if (isAlign8)
+        {
+            // For Align8, adjust address to be suitably aligned.
+            // Addr = (Localloc + 4) & ~7;
+            //
+            GenTree* const alignSize      = gtNewIconNode(4, TYP_I_IMPL);
+            GenTree* const biasedAddress  = gtNewOperNode(GT_ADD, TYP_I_IMPL, stackLocalAddress, alignSize);
+            GenTree* const alignMaskInv   = gtNewIconNode(-8, TYP_I_IMPL);
+            GenTree* const alignedAddress = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedAddress, alignMaskInv);
+
+            stackLocalAddress = alignedAddress;
+        }
+#endif
+
+        // We now require a frame pointer
+        //
+        codeGen->setFramePointerRequired(true);
+    }
+    else
+    {
+        assert(elemSizeArg == nullptr);
+        assert(stackLocalAddressArg != nullptr);
+
+        // Array address is the block local we created earlier
+        //
+        stackLocalAddress = stackLocalAddressArg->GetNode();
+    }
 
     // Initialize the array method table pointer.
     //
@@ -2877,7 +2981,6 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt,
 
     // Initialize the array length.
     //
-    GenTree* const lengthArg      = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode();
     GenTree* const lengthArgInt   = fgOptimizeCast(gtNewCastNode(TYP_INT, lengthArg, false, TYP_INT));
     GenTree* const lengthAddress  = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(stackLocalAddress),
                                                   gtNewIconNode(OFFSETOF__CORINFO_Array__length, TYP_I_IMPL));
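Note: the GenTree sequence built above amounts to the following arithmetic, shown here as a plain C++ sketch under the patch's assumptions (dataOffset stands in for OFFSETOF__CORINFO_Array__data):

    #include <cstddef>

    // Total localloc size for a stack-allocated array of unknown length.
    size_t locAllocArraySize(size_t dataOffset, size_t elemSize, size_t length)
    {
        size_t total = dataOffset + (elemSize * length);
        if ((elemSize % sizeof(void*)) != 0)
        {
            // Round up to pointer size. The bias is a full pointer size rather
            // than pointer size minus one, so an already-aligned total grows by
            // sizeof(void*): conservative, but it never under-allocates.
            total = (total + sizeof(void*)) & ~(sizeof(void*) - 1);
        }
        return total;
    }

For example, with dataOffset = 8, elemSize = 6, and length = 10, total = 68, padded to 72. On 32-bit targets the Align8 case then reserves 4 extra bytes and realigns the localloc result with (addr + 4) & ~7, which yields an 8-byte-aligned address without stepping outside the reservation.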
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 5fe7439d2bc658..075391fb40da21 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -675,6 +675,8 @@ RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStac
 CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAllocationConditionalEscapeRange")
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1)
 RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528)
+RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 1)
+RELEASE_CONFIG_INTEGER(JitObjectStackAllocationInLoop, "JitObjectStackAllocationInLoop", 1)
 
 RELEASE_CONFIG_INTEGER(JitEECallTimingInfo, "JitEECallTimingInfo", 0)
 
diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h
index 215ea82ead5606..6c03726f968529 100644
--- a/src/coreclr/jit/jitmetadatalist.h
+++ b/src/coreclr/jit/jitmetadatalist.h
@@ -89,6 +89,7 @@ JITMETADATAMETRIC(NewBoxedValueClassHelperCalls, int, 0)
 JITMETADATAMETRIC(StackAllocatedBoxedValueClasses, int, 0)
 JITMETADATAMETRIC(NewArrayHelperCalls, int, 0)
 JITMETADATAMETRIC(StackAllocatedArrays, int, 0)
+JITMETADATAMETRIC(LocallocAllocatedArrays, int, 0)
 JITMETADATAMETRIC(LocalAssertionCount, int, 0)
 JITMETADATAMETRIC(LocalAssertionOverflow, int, 0)
 JITMETADATAMETRIC(MorphTrackedLocals, int, 0)
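Note: both new knobs default to 1 (enabled), and the stray trailing semicolons on the new RELEASE_CONFIG_INTEGER entries have been dropped to match the surrounding list. Assuming the usual JIT config plumbing, they surface as environment variables: DOTNET_JitObjectStackAllocationLocalloc=0 disables the localloc expansion entirely, while DOTNET_JitObjectStackAllocationInLoop=0 keeps it but excludes allocation sites in loops (see m_UseLocallocInLoop in objectalloc.h).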
                     if (sizeVal < compiler->eeGetPageSize())
@@ -1202,7 +1202,7 @@ int LinearScan::BuildNode(GenTree* tree)
             else
             {
                 srcCount = 1;
-                if (!compiler->info.compInitMem)
+                if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
                 {
                     buildInternalIntRegisterDefForNode(tree);
                     buildInternalIntRegisterDefForNode(tree);
diff --git a/src/coreclr/jit/lsraloongarch64.cpp b/src/coreclr/jit/lsraloongarch64.cpp
index 529e6d8127b670..d0064dc3a9d88d 100644
--- a/src/coreclr/jit/lsraloongarch64.cpp
+++ b/src/coreclr/jit/lsraloongarch64.cpp
@@ -441,7 +441,7 @@ int LinearScan::BuildNode(GenTree* tree)
                 {
                     // Need no internal registers
                 }
-                else if (!compiler->info.compInitMem)
+                else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
                 {
                     // No need to initialize allocated stack space.
                     if (sizeVal < compiler->eeGetPageSize())
@@ -460,7 +460,7 @@ int LinearScan::BuildNode(GenTree* tree)
             else
             {
                 srcCount = 1;
-                if (!compiler->info.compInitMem)
+                if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
                 {
                     buildInternalIntRegisterDefForNode(tree);
                     buildInternalIntRegisterDefForNode(tree);
diff --git a/src/coreclr/jit/lsrariscv64.cpp b/src/coreclr/jit/lsrariscv64.cpp
index 7583d7687d54f2..bac154bb72f395 100644
--- a/src/coreclr/jit/lsrariscv64.cpp
+++ b/src/coreclr/jit/lsrariscv64.cpp
@@ -572,7 +572,7 @@ int LinearScan::BuildNode(GenTree* tree)
                 {
                     // Need no internal registers
                 }
-                else if (!compiler->info.compInitMem)
+                else if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
                 {
                     // No need to initialize allocated stack space.
                     if (sizeVal < compiler->eeGetPageSize())
@@ -593,7 +593,7 @@ int LinearScan::BuildNode(GenTree* tree)
             else
             {
                 srcCount = 1;
-                if (!compiler->info.compInitMem)
+                if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
                 {
                     buildInternalIntRegisterDefForNode(tree);
                     buildInternalIntRegisterDefForNode(tree);
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 12c5617da229b5..8465980b294e3e 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1871,14 +1871,15 @@ int LinearScan::BuildLclHeap(GenTree* tree)
         size_t sizeVal = AlignUp((size_t)size->AsIntCon()->gtIconVal, STACK_ALIGN);
 
         // Explicitly zeroed LCLHEAP also needs a regCnt in case of x86 or large page
-        if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize()))
+        if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize()) ||
+            (tree->gtFlags & GTF_LCLHEAP_MUSTINIT))
         {
             buildInternalIntRegisterDefForNode(tree);
         }
     }
     else
     {
-        if (!compiler->info.compInitMem)
+        if (!(compiler->info.compInitMem || (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)))
         {
             // For regCnt
             buildInternalIntRegisterDefForNode(tree);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 9544bac68f3109..a34bf38081c9de 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -739,6 +739,8 @@ const char* getWellKnownArgName(WellKnownArg arg)
             return "X86TailCallSpecialArg";
         case WellKnownArg::StackArrayLocal:
             return "StackArrayLocal";
+        case WellKnownArg::StackArrayElemSize:
+            return "StackArrayElemSize";
     }
 
     return "N/A";
diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp
index fce4152d0a0620..3b00fca94566d7 100644
--- a/src/coreclr/jit/objectalloc.cpp
+++ b/src/coreclr/jit/objectalloc.cpp
@@ -483,6 +483,7 @@ bool ObjectAllocator::MorphAllocObjNodes()
         const bool basicBlockHasNewObj       = block->HasFlag(BBF_HAS_NEWOBJ);
         const bool basicBlockHasNewArr       = block->HasFlag(BBF_HAS_NEWARR);
         const bool basicBlockHasBackwardJump = block->HasFlag(BBF_BACKWARD_JUMP);
+        const bool basicBlockInHandler       = block->hasHndIndex();
 
         if (!basicBlockHasNewObj && !basicBlockHasNewArr)
         {
@@ -513,8 +514,7 @@ bool ObjectAllocator::MorphAllocObjNodes()
                 case CORINFO_HELP_NEWARR_1_DIRECT:
                 case CORINFO_HELP_NEWARR_1_ALIGN8:
                 {
-                    if ((data->AsCall()->gtArgs.CountUserArgs() == 2) &&
-                        data->AsCall()->gtArgs.GetUserArgByIndex(1)->GetNode()->IsCnsIntOrI())
+                    if (data->AsCall()->gtArgs.CountUserArgs() == 2)
                     {
                         allocType = OAT_NEWARR;
                     }
@@ -533,6 +533,7 @@ bool ObjectAllocator::MorphAllocObjNodes()
             {
                 bool         canStack     = false;
                 bool         bashCall     = false;
+                bool         useLocalloc  = false;
                 const char*  onHeapReason = nullptr;
                 unsigned int lclNum       = stmtExpr->AsLclVar()->GetLclNum();
@@ -543,7 +544,7 @@ bool ObjectAllocator::MorphAllocObjNodes()
                     onHeapReason = "[object stack allocation disabled]";
                     canStack     = false;
                 }
-                else if (basicBlockHasBackwardJump)
+                else if (basicBlockHasBackwardJump && !((allocType == OAT_NEWARR) && m_UseLocallocInLoop))
                 {
                     onHeapReason = "[alloc in loop]";
                     canStack     = false;
@@ -578,9 +579,9 @@ bool ObjectAllocator::MorphAllocObjNodes()
                     CORINFO_CLASS_HANDLE clsHnd =
                         comp->gtGetHelperCallClassHandle(data->AsCall(), &isExact, &isNonNull);
                     GenTree* const len = data->AsCall()->gtArgs.GetUserArgByIndex(1)->GetNode();
-                    assert(len != nullptr);
 
+                    ssize_t      arraySize = len->IsCnsIntOrI() ? len->AsIntCon()->IconValue() : 1;
                     unsigned int blockSize = 0;
 
                     comp->Metrics.NewArrayHelperCalls++;
@@ -589,12 +590,17 @@ bool ObjectAllocator::MorphAllocObjNodes()
                         onHeapReason = "[array type is either non-exact or null]";
                         canStack     = false;
                     }
-                    else if (!len->IsCnsIntOrI())
+                    else if (!len->IsCnsIntOrI() && !m_UseLocalloc)
+                    {
+                        onHeapReason = "[unknown size]";
+                        canStack     = false;
+                    }
+                    else if (!len->IsCnsIntOrI() && basicBlockInHandler)
                     {
-                        onHeapReason = "[non-constant size]";
+                        onHeapReason = "[unknown size, in handler]";
                         canStack     = false;
                     }
-                    else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, len->AsIntCon()->IconValue(),
+                    else if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, arraySize, len->IsCnsIntOrI(),
                                                        &blockSize, &onHeapReason))
                     {
                         // reason set by the call
                         canStack = false;
                     }
                     else
                     {
-                        JITDUMP("Allocating V%02u on the stack\n", lclNum);
+                        useLocalloc = !len->IsCnsIntOrI() || basicBlockHasBackwardJump;
+                        JITDUMP("Allocating V%02u on the stack%s\n", lclNum,
+                                useLocalloc ? " [via localloc]" : " [via block local]");
                         canStack = true;
-                        const unsigned int stackLclNum =
-                            MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd,
-                                                          (unsigned int)len->AsIntCon()->IconValue(), blockSize,
-                                                          block, stmt);
-
-                        // Note we do not want to rewrite uses of the array temp, so we
-                        // do not update m_HeapLocalToStackLocalMap.
-                        //
-                        comp->Metrics.StackAllocatedArrays++;
+
+                        if (useLocalloc)
+                        {
+                            MorphNewArrNodeIntoLocAlloc(data->AsCall(), clsHnd, len, block, stmt);
+                            comp->Metrics.LocallocAllocatedArrays++;
+                        }
+                        else
+                        {
+                            MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, len, block, stmt);
+                            comp->Metrics.StackAllocatedArrays++;
+                        }
                     }
                 }
                 else if (allocType == OAT_NEWOBJ)
@@ -640,7 +650,8 @@ bool ObjectAllocator::MorphAllocObjNodes()
                         comp->Metrics.NewRefClassHelperCalls++;
                     }
 
-                    if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, 0, nullptr, &onHeapReason))
+                    if (!CanAllocateLclVarOnStack(lclNum, clsHnd, allocType, /* length */ 0, /* lengthKnown */ true,
+                                                  nullptr, &onHeapReason))
                     {
                         // reason set by the call
                         canStack = false;
@@ -679,7 +690,11 @@ bool ObjectAllocator::MorphAllocObjNodes()
                 // We keep the set of possibly-stack-pointing pointers as a superset of the set of
                 // definitely-stack-pointing pointers. All definitely-stack-pointing pointers are in both
                 // sets.
-                MarkLclVarAsDefinitelyStackPointing(lclNum);
+
+                if (!useLocalloc)
+                {
+                    MarkLclVarAsDefinitelyStackPointing(lclNum);
+                }
                 MarkLclVarAsPossiblyStackPointing(lclNum);
 
                 // If this was conditionally escaping enumerator, establish a connection between this local
@@ -794,40 +809,29 @@ GenTree* ObjectAllocator::MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* alloc
 // Arguments:
 //    newArr    - GT_CALL that will be replaced by helper call.
 //    clsHnd    - class representing the type of the array
-//    length    - length of the array
-//    blockSize - size of the layout
+//    len       - tree representing length of the array (must be a constant)
 //    block     - a basic block where newArr is
 //    stmt      - a statement where newArr is
 //
-// Return Value:
-//    local num for the new stack allocated local
-//
 // Notes:
 //    This function can insert additional statements before stmt.
 //
-unsigned int ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall*         newArr,
-                                                            CORINFO_CLASS_HANDLE clsHnd,
-                                                            unsigned int         length,
-                                                            unsigned int         blockSize,
-                                                            BasicBlock*          block,
-                                                            Statement*           stmt)
+void ObjectAllocator::MorphNewArrNodeIntoStackAlloc(
+    GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* len, BasicBlock* block, Statement* stmt)
 {
     assert(newArr != nullptr);
     assert(m_AnalysisDone);
     assert(clsHnd != NO_CLASS_HANDLE);
     assert(newArr->IsHelperCall());
     assert(newArr->GetHelperNum() != CORINFO_HELP_NEWARR_1_MAYBEFROZEN);
+    assert(len->IsCnsIntOrI());
 
+    const unsigned     length        = (unsigned int)len->AsIntCon()->IconValue();
     const bool         shortLifetime = false;
     const bool         alignTo8      = newArr->GetHelperNum() == CORINFO_HELP_NEWARR_1_ALIGN8;
     const unsigned int lclNum        = comp->lvaGrabTemp(shortLifetime DEBUGARG("stack allocated array temp"));
     LclVarDsc* const   lclDsc        = comp->lvaGetDesc(lclNum);
 
-    if (alignTo8)
-    {
-        blockSize = AlignUp(blockSize, 8);
-    }
-
     comp->lvaSetStruct(lclNum, comp->typGetArrayLayout(clsHnd, length), /* unsafe */ false);
     lclDsc->lvStackAllocatedObject = true;
@@ -873,8 +877,62 @@ unsigned int ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall*
     // Note that we have stack allocated arrays in this method
     //
     comp->setMethodHasStackAllocatedArray();
+}
 
-    return lclNum;
+//------------------------------------------------------------------------
+// MorphNewArrNodeIntoLocAlloc: Morph a newarray helper call node into a local frame allocation.
+//
+// Arguments:
+//    newArr - newarr helper call that will be morphed into a localloc expansion.
+//    clsHnd - class representing the type of the array
+//    length - tree representing the length of the array
+//    block  - a basic block where newArr is
+//    stmt   - a statement where newArr is
+//
+void ObjectAllocator::MorphNewArrNodeIntoLocAlloc(
+    GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt)
+{
+    assert(newArr != nullptr);
+    assert(m_AnalysisDone);
+    assert(clsHnd != NO_CLASS_HANDLE);
+    assert(newArr->IsHelperCall());
+    assert(newArr->GetHelperNum() != CORINFO_HELP_NEWARR_1_MAYBEFROZEN);
+
+    // Get element size
+    //
+    CORINFO_CLASS_HANDLE elemClsHnd = NO_CLASS_HANDLE;
+    CorInfoType          corType    = comp->info.compCompHnd->getChildType(clsHnd, &elemClsHnd);
+    var_types            type       = JITtype2varType(corType);
+    ClassLayout*         elemLayout = type == TYP_STRUCT ? comp->typGetObjLayout(elemClsHnd) : nullptr;
+
+    const unsigned elemSize = elemLayout != nullptr ? elemLayout->GetSize() : genTypeSize(type);
+
+    // Mark the newarr call as being "on stack", and add the element size
+    // as an argument
+    //
+    GenTree* const elemSizeNode = comp->gtNewIconNode(elemSize, TYP_I_IMPL);
+    newArr->gtArgs.PushBack(comp, NewCallArg::Primitive(elemSizeNode).WellKnown(WellKnownArg::StackArrayElemSize));
+    newArr->gtCallMoreFlags |= GTF_CALL_M_STACK_ARRAY;
+
+    // Retype the call result as a byref (we may decide to heap allocate at runtime).
+    //
+    newArr->ChangeType(TYP_BYREF);
+    newArr->gtReturnType = TYP_BYREF;
+
+    // Note that we have stack allocated arrays in this method
+    //
+    comp->setMethodHasStackAllocatedArray();
+
+    // Notify the compiler; this disables fast tail calls (for now)
+    //
+    comp->compLocallocUsed = true;
+
+#ifdef UNIX_AMD64_ABI
+    // Ensure we don't end up with misaligned frames,
+    // if we manage to dead-code this newarr.
+    //
+    comp->opts.compNeedToAlignFrame = true;
+#endif
 }
 
 //------------------------------------------------------------------------
@@ -1280,6 +1338,7 @@ void ObjectAllocator::UpdateAncestorTypes(GenTree* tree, ArrayStack<GenTree*>* p
 
             case GT_IND:
             case GT_CALL:
+                // TODO: watch for helper calls that have retyped operands.
                 break;
 
             default:
@@ -1973,7 +2032,8 @@ void ObjectAllocator::CheckForGuardedAllocationOrCopy(BasicBlock* block,
             const char* reason = nullptr;
             unsigned    size   = 0;
             unsigned    length = TARGET_POINTER_SIZE;
-            if (CanAllocateLclVarOnStack(enumeratorLocal, clsHnd, OAT_NEWOBJ, length, &size, &reason,
+            if (CanAllocateLclVarOnStack(enumeratorLocal, clsHnd, OAT_NEWOBJ, length, /* length known */ true,
+                                         &size, &reason,
                                          /* preliminaryCheck */ true))
             {
                 // We are going to conditionally track accesses to the enumerator local via a pseudo local.
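Note: a sketch of the end-to-end shape of the transformation as it appears in this patch (names abbreviated; the exact IR the JIT builds is in the code above):

    //  Before:   tmp = CALL CORINFO_HELP_NEWARR_1_DIRECT(arrayCls, len)
    //
    //  ObjectAllocator::MorphNewArrNodeIntoLocAlloc (above): append the element
    //  size as a WellKnownArg::StackArrayElemSize arg, set GTF_CALL_M_STACK_ARRAY,
    //  and retype the call result to TYP_BYREF.
    //
    //  fgExpandStackArrayAllocation (helperexpansion.cpp, above) then replaces
    //  the marked call:
    //
    //      loc = GT_LCLHEAP(dataOffset + elemSize * len)            // GTF_LCLHEAP_MUSTINIT
    //      *(loc + 0)                                  = arrayCls   // method table
    //      *(loc + OFFSETOF__CORINFO_Array__length)    = (int)len   // array length
    //      tmp = loc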
diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h
index 99dd8965924669..beef00276d9418 100644
--- a/src/coreclr/jit/objectalloc.h
+++ b/src/coreclr/jit/objectalloc.h
@@ -134,6 +134,8 @@ class ObjectAllocator final : public Phase
     LocalToLocalMap     m_HeapLocalToStackLocalMap;
    BitSetShortLongRep* m_ConnGraphAdjacencyMatrix;
     unsigned int        m_StackAllocMaxSize;
+    bool                m_UseLocalloc;
+    bool                m_UseLocallocInLoop;
 
     // Info for conditionally-escaping locals
     LocalToLocalMap m_EnumeratorLocalToPseudoLocalMap;
@@ -152,6 +154,7 @@ class ObjectAllocator final : public Phase
                                   CORINFO_CLASS_HANDLE clsHnd,
                                   ObjectAllocationType allocType,
                                   ssize_t              length,
+                                  bool                 lengthKnown,
                                   unsigned int*        blockSize,
                                   const char**         reason,
                                   bool                 preliminaryCheck = false);
@@ -176,12 +179,11 @@ class ObjectAllocator final : public Phase
     GenTree*     MorphAllocObjNodeIntoHelperCall(GenTreeAllocObj* allocObj);
     unsigned int MorphAllocObjNodeIntoStackAlloc(
         GenTreeAllocObj* allocObj, CORINFO_CLASS_HANDLE clsHnd, bool isValueClass, BasicBlock* block, Statement* stmt);
-    unsigned int MorphNewArrNodeIntoStackAlloc(GenTreeCall*         newArr,
-                                               CORINFO_CLASS_HANDLE clsHnd,
-                                               unsigned int         length,
-                                               unsigned int         blockSize,
-                                               BasicBlock*          block,
-                                               Statement*           stmt);
+    void MorphNewArrNodeIntoStackAlloc(
+        GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt);
+    void MorphNewArrNodeIntoLocAlloc(
+        GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt);
+
     struct BuildConnGraphVisitorCallbackData;
     bool CanLclVarEscapeViaParentStack(ArrayStack<GenTree*>* parentStack, unsigned int lclNum, BasicBlock* block);
     void UpdateAncestorTypes(GenTree* tree, ArrayStack<GenTree*>* parentStack, var_types newType);
@@ -284,6 +286,11 @@ inline ObjectAllocator::ObjectAllocator(Compiler* comp)
     m_ConnGraphAdjacencyMatrix = nullptr;
 
     m_StackAllocMaxSize = (unsigned)JitConfig.JitObjectStackAllocationSize();
+
+    // OSR does not support localloc (though it seems late-introduced localloc might be OK)
+    //
+    m_UseLocalloc       = JitConfig.JitObjectStackAllocationLocalloc() && !comp->opts.IsOSR();
+    m_UseLocallocInLoop = m_UseLocalloc && JitConfig.JitObjectStackAllocationInLoop();
 }
 
 //------------------------------------------------------------------------
@@ -313,7 +320,8 @@ inline void ObjectAllocator::EnableObjectStackAllocation()
 //    lclNum   - Local variable number
 //    clsHnd   - Class/struct handle of the variable class
 //    allocType - Type of allocation (newobj or newarr)
-//    length   - Length of the array (for newarr)
+//    length   - Length of the array (for newarr); 1 for runtime-determined size
+//    lengthKnown - true if length is known
 //    blockSize - [out, optional] exact size of the object
 //    reason    - [out, required] if result is false, reason why
 //    preliminaryCheck - if true, allow checking before analysis is done
@@ -326,6 +334,7 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int         lclNu
                                                       CORINFO_CLASS_HANDLE clsHnd,
                                                       ObjectAllocationType allocType,
                                                       ssize_t              length,
+                                                      bool                 lengthKnown,
                                                       unsigned int*        blockSize,
                                                       const char**         reason,
                                                       bool                 preliminaryCheck)
@@ -361,6 +370,15 @@ inline bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int         lclNu
 
         ClassLayout* const layout = comp->typGetArrayLayout(clsHnd, (unsigned)length);
         classSize                 = layout->GetSize();
+
+        if (!lengthKnown && layout->HasGCPtr())
+        {
+            // We can't represent GC info for these yet
+            //
+            assert(length == 1);
+            *reason = "[unknown length, gc elements]";
+            return false;
+        }
     }
     else if (allocType == OAT_NEWOBJ)
     {
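Note: the HasGCPtr() early-out above means the localloc path currently covers arrays of non-GC element types only. For illustration (not a case from the patch itself): a non-escaping "new int[n]" with n known only at run time can now live on the frame, while "new string[n]" still goes to the heap, since the JIT cannot yet report GC info for a variable-sized region of object references.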