Skip to content

Commit

Permalink
Additional optimisation for V512 constant index shuffle
Browse files Browse the repository at this point in the history
- When 128-bit lanes are not crossed, emit vpshufb instead of vperm*
  • Loading branch information
hamarb123 committed Jan 22, 2025
1 parent c5ca4e2 commit fc4b8f5
Showing 1 changed file with 15 additions and 15 deletions.
30 changes: 15 additions & 15 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26044,21 +26044,13 @@ GenTree* Compiler::gtNewSimdShuffleNode(

if (value < elementCount)
{
if (simdSize == 32)
{
// Most of the 256-bit shuffle/permute instructions operate as if
// the inputs were 2x 128-bit values. If the selected indices cross
// the respective 128-bit "lane" we may need to specialize the codegen
// Most of the 256-bit shuffle/permute instructions operate as if
// the inputs were 2x 128-bit values. If the selected indices cross
// the respective 128-bit "lane" we may need to specialize the codegen.
// Also, for AVX-512: If we don't cross 128-bit lanes, then we can emit vpshufb
// instead of vperm* - which has lower latency & allows zeroing in 1 step.

if (index < (elementCount / 2))
{
crossLane |= (value >= (elementCount / 2));
}
else
{
crossLane |= (value < (elementCount / 2));
}
}
// XOR the destination and source element indices and scale to a byte offset; any
// difference at or above bit 4 means the two elements sit in different 16-byte
// (128-bit) lanes. (Assumes elementSize is a power of two, so the multiply acts as
// a shift -- TODO confirm.) C++ has no `||=` operator, so accumulate with `|=`.
crossLane |= ((((uint64_t)index ^ value) * elementSize) & ~(uint64_t)15) != 0;

// Setting the control for byte/sbyte and short/ushort is unnecessary
// and will actually compute an incorrect control word. But it simplifies
Expand Down Expand Up @@ -26283,7 +26275,15 @@ GenTree* Compiler::gtNewSimdShuffleNode(
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
if (elementSize == 4)
if (!crossLane)
{
op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512BW_Shuffle, simdBaseJitType, simdSize);
}
else if (elementSize == 4)
{
for (uint32_t i = 0; i < elementCount; i++)
{
Expand Down

0 comments on commit fc4b8f5

Please sign in to comment.