JIT: Emulate missing x86 shift instructions for xplat intrinsics #111108

Merged
merged 8 commits into from Jan 10, 2025

Changes from 7 commits
10 changes: 9 additions & 1 deletion src/coreclr/jit/emitxarch.cpp
@@ -12358,10 +12358,18 @@ void emitter::emitDispIns(
reg2 = reg3;
reg3 = tmp;
}

emitAttr attr3 = attr;
if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0))
{
// Shift instructions take xmm for the 3rd operand regardless of instruction size.
attr3 = EA_16BYTE;
}
Comment on lines +12362 to +12367

Member:
Not quite sure this is the "best" fix.

This feels like a case where we could use the existing memory-operand-size tags for the register as well, rather than hardcoding a one-off special case.

Not going to block on it, especially since this is only used for disassembly; but it's something that would be nice to improve for this and other cases where the last operand is a different register size than the others.

Member Author:

Can you elaborate on your preferred fix here? This is already using the tuple type rather than a hard-coded instruction list -- I just call out shift in the comment because it's the only case I'm aware of that uses a full xmm/m128 operand for what is ultimately an 8-bit scalar, and this should be the only encoding form where the xmm fix is required.
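
For context on what the attr3 override changes: the shift-by-vector forms (vpsllw/vpsrlw/vpsraw and their dword/qword variants) always take their count operand as xmm/m128, even when the instruction itself operates on 256- or 512-bit registers. An illustrative before/after of the disassembly (example operands are mine, not taken from the PR):

```asm
; VPSRAQ zmm1 {k1}{z}, zmm2, xmm3/m128 -- the count operand is 128-bit by definition
vpsraq zmm0, zmm1, zmm2    ; displayed before this change (count register shown at full instruction size)
vpsraq zmm0, zmm1, xmm2    ; displayed after this change (EA_16BYTE override)
```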


printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, ", emitRegName(reg2, attr));
printf("%s", emitRegName(reg3, attr));
printf("%s", emitRegName(reg3, attr3));
emitDispEmbRounding(id);
break;
}
66 changes: 50 additions & 16 deletions src/coreclr/jit/gentree.cpp
@@ -20925,8 +20925,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(

unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

GenTree* nonConstantByteShiftCountOp = NULL;

if (op2->IsCnsIntOrI())
{
op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21090,39 +21088,75 @@
}

#if defined(TARGET_XARCH)
case GT_RSZ:
case GT_LSH:
case GT_RSH:
case GT_RSZ:
{
// We don't have actual instructions for shifting bytes, so we'll emulate them
// by shifting 32-bit values and masking off the bits that should be zeroed.
// This emulates byte shift instructions, which don't exist in x86 SIMD,
// plus arithmetic shift of qwords, which did not exist before AVX-512.

assert(varTypeIsByte(simdBaseType));
assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));

intrinsic =
GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
// We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op;
intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup,
genActualType(simdBaseType), simdSize, false);
assert(intrinsic != NI_Illegal);

GenTree* maskAmountOp;

if (op2->IsCnsIntOrI())
{
ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);

maskAmountOp = gtNewIconNode(mask, type);
if (varTypeIsByte(simdBaseType))
{
ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
maskAmountOp = gtNewIconNode(mask, type);
}
else
{
int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
maskAmountOp = gtNewLconNode(mask);
}
}
else
{
assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));

GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
if (op == GT_RSH)
{
// For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
// the mask will be evaluated before the shift. We swap the copied operand with the shift amount
// operand here in order to preserve correct evaluation order for the masked shift count.
std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
}

maskAmountOp =
varTypeIsByte(simdBaseType)
? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup)
: gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup);
}

GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
if (op == GT_RSH)
{
GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* signOp =
gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize);

CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType;
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize);
}
else
{
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
}
}
#endif // TARGET_XARCH
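
As a minimal illustration of the byte-shift emulation in the hunk above (my own scalar sketch, not code from the PR): shift the containing 32-bit lane with an instruction that does exist, then AND with a broadcast per-byte mask so that bits which crossed a byte boundary are cleared.

```cpp
#include <cassert>
#include <cstdint>

// Reference behavior: shift each of the four packed bytes in a 32-bit lane independently.
static uint32_t ByteShiftLeftReference(uint32_t lane, unsigned shiftCount)
{
    uint32_t result = 0;
    for (int i = 0; i < 4; i++)
    {
        uint8_t b = static_cast<uint8_t>(lane >> (8 * i));
        result |= static_cast<uint32_t>(static_cast<uint8_t>(b << shiftCount)) << (8 * i);
    }
    return result;
}

// Emulated behavior, modeled on one lane: a full 32-bit shift (what vpslld does to the
// whole vector) followed by the broadcast mask ((0xFF << n) & 0xFF) computed above.
static uint32_t ByteShiftLeftEmulated(uint32_t lane, unsigned shiftCount)
{
    uint32_t shifted = lane << shiftCount;
    uint32_t mask    = (0xFFu << shiftCount) & 0xFFu; // per-byte mask the JIT broadcasts
    return shifted & (mask * 0x01010101u);            // clear bits that leaked in from the lower byte
}

int main()
{
    for (unsigned n = 0; n < 8; n++)
    {
        assert(ByteShiftLeftEmulated(0x80402010u, n) == ByteShiftLeftReference(0x80402010u, n));
        assert(ByteShiftLeftEmulated(0xFFFFFFFFu, n) == ByteShiftLeftReference(0xFFFFFFFFu, n));
    }
    return 0;
}
```

Logical right shift of bytes works the same way with the complementary mask (0xFF >> n).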

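The GT_RSH path above is the less obvious half: an arithmetic right shift of qwords is rebuilt from a logical shift, the broadcast mask ~0 >> n, and a per-element sign vector, combined through ConditionalSelect, i.e. (shift & mask) | (sign & ~mask). A scalar sketch of that identity (my own naming, not the PR's code):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the ConditionalSelect(maskOp, shiftOp, signOp) tree built above:
// signBits is all-ones for negative elements (the GT_GT compare against zero),
// and mask = ~0ull >> n is the value broadcast by gtNewSimdCreateBroadcastNode.
static int64_t ArithmeticShiftRightEmulated(int64_t value, unsigned shiftCount)
{
    uint64_t logicalShift = static_cast<uint64_t>(value) >> shiftCount; // vpsrlq, available pre-AVX-512
    uint64_t mask         = ~0ULL >> shiftCount;
    uint64_t signBits     = (value < 0) ? ~0ULL : 0ULL;
    return static_cast<int64_t>((logicalShift & mask) | (signBits & ~mask));
}

int main()
{
    const int64_t samples[] = { 0, 1, -1, 123456789, -123456789, INT64_MIN, INT64_MAX };
    for (int64_t v : samples)
    {
        for (unsigned n = 0; n < 64; n++)
        {
            // Signed >> is the arithmetic shift being reproduced.
            assert(ArithmeticShiftRightEmulated(v, n) == (v >> n));
        }
    }
    return 0;
}
```

The operand swap in the non-constant case does not change this math; as the comment in the hunk says, it only keeps GenTree evaluation order correct, because the mask (and therefore the copied shift count) is now evaluated before the shift itself.
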
14 changes: 5 additions & 9 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -3443,20 +3443,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if (varTypeIsByte(simdBaseType))
{
// byte and sbyte would require more work to support
break;
}

if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
#if defined(TARGET_X86)
if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE))
{
if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI())
{
// long, ulong, and double would require more work to support
// we need vpsraq to handle signed long with non-const shift amount, use software fallback
break;
}
}
#endif // TARGET_X86

if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
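
As I read this hunk, the remaining bail-out is now only on 32-bit x86: a long or double base type with a non-constant shift count is sent to the managed software fallback unless EVEX (and therefore vpsraq) is available. A condensed, self-contained sketch of that decision (the enum and function names are illustrative, not the JIT's):

```cpp
#include <cstdio>

enum class BaseType { Int, Long, Double };

// Sketch of the import-time gate added above (assumed simplification, not real JIT code).
static bool WillAccelerateShift(BaseType baseType, bool shiftCountIsConstant, bool hasEvex, bool is32BitX86)
{
    if (is32BitX86 && (baseType == BaseType::Long || baseType == BaseType::Double))
    {
        if (!hasEvex && !shiftCountIsConstant)
        {
            return false; // signed 64-bit shift with a variable count: use the software fallback
        }
    }
    return true; // everything else is expanded as an intrinsic
}

int main()
{
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ false, /*evex*/ false, /*x86*/ true)); // 0
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ true,  /*evex*/ false, /*x86*/ true)); // 1
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ false, /*evex*/ true,  /*x86*/ true)); // 1
    return 0;
}
```
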
2 changes: 1 addition & 1 deletion src/coreclr/jit/importercalls.cpp
@@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,

bool betterToExpand = false;

// Allow some lighweight intrinsics in Tier0 which can improve throughput
// Allow some lightweight intrinsics in Tier0 which can improve throughput
// we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
if (!mustExpand && opts.Tier0OptimizationEnabled())
{