From 1d9e50cb4735df46d3de0cee5791e97295eaf588 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Wed, 30 Sep 2020 02:35:44 +0300 Subject: [PATCH] [RyuJIT] Add "rorx" instruction (BMI2) and emit it instead of "rol" when possible (#41772) * Use rorx instead of rol when possible --- src/coreclr/src/jit/codegenxarch.cpp | 22 ++++++++++++++++++---- src/coreclr/src/jit/emitxarch.cpp | 4 ++++ src/coreclr/src/jit/instrsxarch.h | 1 + 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index 6bca839d10514b..3bcda89f437d15 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -4080,10 +4080,11 @@ void CodeGen::genCodeForShift(GenTree* tree) if (shiftBy->isContainedIntOrIImmed()) { + emitAttr size = emitTypeSize(tree); + // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg" if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1)) { - emitAttr size = emitTypeSize(tree); if (tree->GetRegNum() == operandReg) { GetEmitter()->emitIns_R_R(INS_add, size, tree->GetRegNum(), operandReg); @@ -4095,6 +4096,21 @@ void CodeGen::genCodeForShift(GenTree* tree) } else { + int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); + +#if defined(TARGET_64BIT) + // Try to emit rorx if BMI2 is available instead of mov+rol + // it makes sense only for 64bit integers + if ((genActualType(targetType) == TYP_LONG) && (tree->GetRegNum() != operandReg) && + compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && tree->OperIs(GT_ROL, GT_ROR) && + (shiftByValue > 0) && (shiftByValue < 64)) + { + const int value = tree->OperIs(GT_ROL) ? (64 - shiftByValue) : shiftByValue; + GetEmitter()->emitIns_R_R_I(INS_rorx, size, tree->GetRegNum(), operandReg, value); + genProduceReg(tree); + return; + } +#endif // First, move the operand to the destination register and // later on perform the shift in-place. // (LSRA will try to avoid this situation through preferencing.) @@ -4102,9 +4118,7 @@ void CodeGen::genCodeForShift(GenTree* tree) { inst_RV_RV(INS_mov, tree->GetRegNum(), operandReg, targetType); } - - int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); - inst_RV_SH(ins, emitTypeSize(tree), tree->GetRegNum(), shiftByValue); + inst_RV_SH(ins, size, tree->GetRegNum(), shiftByValue); } } else diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index 7248dcb106aa6f..58af8ad5306d58 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -524,6 +524,7 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) case INS_mulx: case INS_pdep: case INS_pext: + case INS_rorx: return true; default: return false; @@ -758,6 +759,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c { switch (ins) { + case INS_rorx: case INS_pdep: case INS_mulx: { @@ -1242,6 +1244,7 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) case INS_pextrq: case INS_pextrw: case INS_pextrw_sse41: + case INS_rorx: { // These SSE instructions write to a general purpose integer register. return false; @@ -14944,6 +14947,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_tzcnt: case INS_popcnt: case INS_crc32: + case INS_rorx: case INS_pdep: case INS_pext: case INS_addsubps: diff --git a/src/coreclr/src/jit/instrsxarch.h b/src/coreclr/src/jit/instrsxarch.h index 60616c5177b58f..c6a50690f70290 100644 --- a/src/coreclr/src/jit/instrsxarch.h +++ b/src/coreclr/src/jit/instrsxarch.h @@ -594,6 +594,7 @@ INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract // BMI2 +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_FLAGS_None) INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position