Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ShuffleNative methods and optimise Shuffle for non-constant indices #99596

Open
wants to merge 41 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
95b8eb8
Squash into 1 commit
hamarb123 Aug 6, 2024
57e4884
Remove internal dependency on ShuffleUnsafe's behaviour wrt high bit
hamarb123 Aug 6, 2024
b9be44e
Optimise some codegen
hamarb123 Aug 7, 2024
1423e85
jit format
hamarb123 Aug 7, 2024
ff76287
jit format
hamarb123 Aug 7, 2024
ca1a5fa
Simplify logic for using Shuffle for ShuffleUnsafe
hamarb123 Aug 7, 2024
d64cad2
Merge branch 'main' into main12
hamarb123 Aug 19, 2024
bb974ca
Merge branch 'main' into main12
hamarb123 Sep 10, 2024
c1ff983
Merge branch 'main' into main12
hamarb123 Nov 13, 2024
7062967
Merge branch 'main' into main12
hamarb123 Jan 16, 2025
1c06b5d
Move `ShuffleUnsafeModified` out of `Base64Helper`
hamarb123 Jan 21, 2025
91f3a1c
Remove unnecessary `CompExactlyDependsOn` and `using`s
hamarb123 Jan 21, 2025
05e991d
Update SearchValues.cs
hamarb123 Jan 21, 2025
9d00885
Merge branch 'main' into main12
hamarb123 Jan 22, 2025
c5ca4e2
Support AVX-512/AVX-10.1 acceleration of Shuffle V128<ulong/long/double>
hamarb123 Jan 22, 2025
fc4b8f5
Additional optimisation for V512 constant index shuffle
hamarb123 Jan 22, 2025
9437603
jit format & typo
hamarb123 Jan 22, 2025
f381cf0
Fix operand order
hamarb123 Jan 22, 2025
f32bf01
Changes to `IsValidForShuffle` & jit format
hamarb123 Jan 22, 2025
5c4ba29
jit format
hamarb123 Jan 22, 2025
70e68b2
Update hwintrinsicxarch.cpp
hamarb123 Jan 22, 2025
eacf11a
Update hwintrinsicxarch.cpp
hamarb123 Jan 22, 2025
98a3e61
Update gentree.cpp
hamarb123 Jan 23, 2025
a35768c
Make `op2DupSafe` be consistently ordered
hamarb123 Jan 23, 2025
036263b
jit format
hamarb123 Jan 23, 2025
a4a6365
Use `compIsEvexOpportunisticallySupported` instead of explicit AVX-10…
hamarb123 Jan 23, 2025
39fda9c
jit format
hamarb123 Jan 23, 2025
89fc050
Update gentree.cpp
hamarb123 Jan 24, 2025
8683e8a
Update Vector128.cs
hamarb123 Jan 24, 2025
6950dfa
Use `BlockNonDeterministicIntrinsics` instead of `CompExactlyDependsOn`
hamarb123 Jan 24, 2025
26d68df
Ensure V128<byte> ShuffleUnsafe is not regressed on mono
hamarb123 Jan 24, 2025
1044d8e
Update gentree.cpp
hamarb123 Jan 24, 2025
85d9b46
Rename `ShuffleUnsafe` to `ShuffleNative`
hamarb123 Jan 28, 2025
0dffb6e
jit format
hamarb123 Jan 28, 2025
33d9b4d
jit format again
hamarb123 Jan 28, 2025
81fe049
um
hamarb123 Jan 28, 2025
912596e
Move assertion on arm64 to correct spot
hamarb123 Jan 29, 2025
d58e1be
Normalize indices for `vpshufb` in all cases of constant indices Shuffle
hamarb123 Jan 29, 2025
2e9ef8a
jit format
hamarb123 Jan 29, 2025
7657a97
Revert last pair of commits
hamarb123 Jan 29, 2025
690d8f8
Feedback
hamarb123 Jan 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3484,11 +3484,19 @@ class Compiler
GenTree* gtNewSimdRoundNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);

GenTree* gtNewSimdShuffleNodeVariable(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdShuffleNode(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize);
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
Expand Down Expand Up @@ -4788,7 +4796,7 @@ class Compiler
bool mustExpand);

#ifdef FEATURE_HW_INTRINSICS
bool IsValidForShuffle(GenTreeVecCon* vecCon, unsigned simdSize, var_types simdBaseType) const;
bool IsValidForShuffle(GenTree* indices, unsigned simdSize, var_types simdBaseType, bool* canBecomeValid) const;

GenTree* impHWIntrinsic(NamedIntrinsic intrinsic,
CORINFO_CLASS_HANDLE clsHnd,
Expand Down
858 changes: 777 additions & 81 deletions src/coreclr/jit/gentree.cpp

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2251,38 +2251,53 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector64_Shuffle:
case NI_Vector128_Shuffle:
case NI_Vector64_ShuffleUnsafe:
case NI_Vector128_ShuffleUnsafe:
{
assert((sig->numArgs == 2) || (sig->numArgs == 3));
assert((simdSize == 8) || (simdSize == 16));

GenTree* indices = impStackTop(0).val;

if (!indices->IsCnsVec() || !IsValidForShuffle(indices->AsVecCon(), simdSize, simdBaseType))
// Check whether the intrinsics required to emit the shuffle are available.
if (!IsValidForShuffle(indices, simdSize, simdBaseType, nullptr))
{
break;
}

// If the indices are not constant yet but might become constant later, don't expand the shuffle now; defer it until later.
if (!indices->IsCnsVec())
{
assert(sig->numArgs == 2);

if (!opts.OptimizationEnabled())
if (opts.OptimizationEnabled())
{
// Only enable late-stage rewriting if optimizations are enabled,
// as we won't otherwise encounter a constant at that later point.
return nullptr;
}
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);

retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);

retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint));
break;
retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint));
break;
}
}

if (sig->numArgs == 2)
{
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize);
bool isUnsafe = intrinsic == NI_Vector64_ShuffleUnsafe || intrinsic == NI_Vector128_ShuffleUnsafe;
if (indices->IsCnsVec())
{
retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize, isUnsafe);
}
else
{
retNode = gtNewSimdShuffleNodeVariable(retType, op1, op2, simdBaseJitType, simdSize, isUnsafe);
}
}
break;
}
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ HARDWARE_INTRINSIC(Vector64, Narrow,
HARDWARE_INTRINSIC(Vector64, Round, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, ShiftLeft, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, Shuffle, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector64, ShuffleUnsafe, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector64, Sqrt, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, StoreAligned, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, StoreAlignedNonTemporal, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down Expand Up @@ -213,6 +214,7 @@ HARDWARE_INTRINSIC(Vector128, Narrow,
HARDWARE_INTRINSIC(Vector128, Round, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, ShiftLeft, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, Shuffle, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector128, ShuffleUnsafe, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ HARDWARE_INTRINSIC(Vector128, Narrow,
HARDWARE_INTRINSIC(Vector128, Round, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, ShiftLeft, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, Shuffle, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector128, ShuffleUnsafe, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down Expand Up @@ -228,6 +229,7 @@ HARDWARE_INTRINSIC(Vector256, Narrow,
HARDWARE_INTRINSIC(Vector256, Round, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector256, ShiftLeft, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector256, Shuffle, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector256, ShuffleUnsafe, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector256, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, StoreAligned, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
Expand Down Expand Up @@ -348,6 +350,7 @@ HARDWARE_INTRINSIC(Vector512, Narrow,
HARDWARE_INTRINSIC(Vector512, Round, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector512, ShiftLeft, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector512, Shuffle, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector512, ShuffleUnsafe, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector512, Sqrt, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector512, StoreAligned, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down
Loading
Loading