diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8dd8234ca554d7..2e6a5e80d84834 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -26044,21 +26044,15 @@ GenTree* Compiler::gtNewSimdShuffleNode(
 
         if (value < elementCount)
         {
-            if (simdSize == 32)
-            {
-                // Most of the 256-bit shuffle/permute instructions operate as if
-                // the inputs were 2x 128-bit values. If the selected indices cross
-                // the respective 128-bit "lane" we may need to specialize the codegen
+            // Most of the 256-bit shuffle/permute instructions operate as if
+            // the inputs were 2x 128-bit values. If the selected indices cross
+            // the respective 128-bit "lane" we may need to specialize the codegen.
+            // Also, for AVX-512: If we don't cross 128-bit lanes, then we can emit vpshufb
+            // instead of vperm* - which has lower latency & allows zeroing in 1 step.
 
-                if (index < (elementCount / 2))
-                {
-                    crossLane |= (value >= (elementCount / 2));
-                }
-                else
-                {
-                    crossLane |= (value < (elementCount / 2));
-                }
-            }
+            // XOR-ing the destination and source element positions and scaling by elementSize
+            // leaves a bit set above the low four only when they sit in different 16-byte lanes.
+            crossLane |= ((((uint64_t)index ^ value) * elementSize) & ~(uint64_t)15) != 0;
 
             // Setting the control for byte/sbyte and short/ushort is unnecessary
             // and will actually compute an incorrect control word. But it simplifies
@@ -26283,7 +26277,15 @@ GenTree* Compiler::gtNewSimdShuffleNode(
     else if (simdSize == 64)
     {
         assert(IsBaselineVector512IsaSupportedDebugOnly());
-        if (elementSize == 4)
+        if (!crossLane)
+        {
+            op2                        = gtNewVconNode(type);
+            op2->AsVecCon()->gtSimdVal = vecCns;
+
+            simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
+            return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512BW_Shuffle, simdBaseJitType, simdSize);
+        }
+        else if (elementSize == 4)
         {
             for (uint32_t i = 0; i < elementCount; i++)
             {