From 59c64326d11d9c31e5cf265eca57e2715e183bc5 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 3 Jun 2022 10:54:37 -0700 Subject: [PATCH 1/2] Fixing the costing of GT_CNS_DBL and GT_CNS_VEC instructions --- src/coreclr/jit/gentree.cpp | 49 ++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 85217814eb2951..7669bc892d1328 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -4575,17 +4575,21 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) { level = 0; #if defined(TARGET_XARCH) - /* We use fldz and fld1 to load 0.0 and 1.0, but all other */ - /* floating point constants are loaded using an indirection */ if (tree->IsFloatPositiveZero()) { + // We generate `xorp* tgtReg, tgtReg` which is 3-5 bytes + // but which can be elided by the instruction decoder. + costEx = 1; - costSz = 1; + costSz = 2; } else { + // We generate `movs* tgtReg, [mem]` which is 4-6 bytes + // and which has the same cost as an indirection. + costEx = IND_COST_EX; - costSz = 4; + costSz = 2; } #elif defined(TARGET_ARM) var_types targetType = tree->TypeGet(); @@ -4603,13 +4607,18 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) #elif defined(TARGET_ARM64) if (tree->IsFloatPositiveZero() || emitter::emitIns_valid_imm_for_fmov(tree->AsDblCon()->gtDconVal)) { + // Zero and certain other immediates can be specially created with a single instruction + // These can be cheaply reconstituted but still take up 4-bytes of native codegen + costEx = 1; - costSz = 1; + costSz = 2; } else { + // We load the constant from memory and so will take the same cost as GT_IND + costEx = IND_COST_EX; - costSz = 4; + costSz = 2; } #elif defined(TARGET_LOONGARCH64) // TODO-LoongArch64-CQ: tune the costs. @@ -4623,9 +4632,25 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) case GT_CNS_VEC: { - costEx = IND_COST_EX; - costSz = 4; level = 0; + + if (tree->AsVecCon()->IsAllBitsSet() || tree->AsVecCon()->IsZero()) + { + // We generate `cmpeq* tgtReg, tgtReg`, which is 4-5 bytes, for AllBitsSet + // and generate `xorp* tgtReg, tgtReg`, which is 3-5 bytes, for Zero + // both of which can be elided by the instruction decoder. + + costEx = 1; + costSz = 2; + } + else + { + // We generate `movup* tgtReg, [mem]` which is 4-6 bytes + // and which has the same cost as an indirection. + + costEx = IND_COST_EX; + costSz = 2; + } break; } @@ -4972,16 +4997,12 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costSz += 1; } +#ifdef TARGET_ARM if (isflt) { - if (tree->TypeGet() == TYP_DOUBLE) - { - costEx += 1; - } -#ifdef TARGET_ARM costSz += 2; -#endif // TARGET_ARM } +#endif // TARGET_ARM // Can we form an addressing mode with this indirection? // TODO-CQ: Consider changing this to op1->gtEffectiveVal() to take into account From 479c660694746b10c98f6c06e6f1eb946c5d1e67 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 3 Jun 2022 16:24:28 -0700 Subject: [PATCH 2/2] Applying formatting patch --- src/coreclr/jit/gentree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7669bc892d1328..1ee666fd78cf13 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -4632,7 +4632,7 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) case GT_CNS_VEC: { - level = 0; + level = 0; if (tree->AsVecCon()->IsAllBitsSet() || tree->AsVecCon()->IsZero()) {