
Commit

JIT: Emulate missing x86 shift instructions for xplat intrinsics (#111108)

* emulate missing x86 shift instructions

* disable vpsraq emulation on 32-bit

* use logical shift for mask

* fix disasm for shift instructions

* allow vpsraq emulation on 32-bit for const shift amount

* review feedback

---------

Co-authored-by: Tanner Gooding <[email protected]>
saucecontrol and tannergooding authored Jan 10, 2025
1 parent 289aa17 commit 574b967
Showing 4 changed files with 78 additions and 27 deletions.
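
The core idea behind the byte-shift emulation described in the commit message, as a minimal standalone sketch (the function name and the choice of SSE2 intrinsics are illustrative assumptions, not the JIT's generated code): x86 has no per-byte SIMD shift instructions, so the values are shifted at 32-bit granularity and the bits that crossed a byte boundary are masked off.

#include <emmintrin.h>   // SSE2
#include <stdint.h>

// Shift each byte of 'value' left by 'count' (0..7) without a native per-byte shift.
static __m128i EmulatedShiftLeftLogicalInt8(__m128i value, int count)
{
    __m128i shifted  = _mm_slli_epi32(value, count);                        // shift whole 32-bit lanes
    uint8_t laneMask = (uint8_t)(0xFF << count);                            // bits that legitimately belong to each byte
    return _mm_and_si128(shifted, _mm_set1_epi8((char)laneMask));           // drop bits shifted in from the byte below
}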
10 changes: 9 additions & 1 deletion src/coreclr/jit/emitxarch.cpp
@@ -12358,10 +12358,18 @@ void emitter::emitDispIns(
reg2 = reg3;
reg3 = tmp;
}

emitAttr attr3 = attr;
if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0))
{
// Shift instructions take xmm for the 3rd operand regardless of instruction size.
attr3 = EA_16BYTE;
}

printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, ", emitRegName(reg2, attr));
printf("%s", emitRegName(reg3, attr));
printf("%s", emitRegName(reg3, attr3));
emitDispEmbRounding(id);
break;
}
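
For context on the display fix above (an illustrative aside, not part of the diff): the vector-count forms of the SIMD shift instructions always take the shift count as a 128-bit operand, regardless of the width of the value being shifted, which is also visible in the C intrinsic signatures.

#include <immintrin.h>   // AVX2

// Even the 256-bit shift takes its count in an xmm register (vpsllq ymm, ymm, xmm/m128),
// which is why the disassembly prints the third operand at 16-byte width.
__m256i ShiftQwordsLeft(__m256i value, __m128i count)
{
    return _mm256_sll_epi64(value, count);
}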
78 changes: 62 additions & 16 deletions src/coreclr/jit/gentree.cpp
@@ -7936,12 +7936,26 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type)

switch (type)
{
case TYP_BYTE:
case TYP_UBYTE:
{
return gtNewIconNode(0xFF);
}

case TYP_SHORT:
case TYP_USHORT:
{
return gtNewIconNode(0xFFFF);
}

case TYP_INT:
case TYP_UINT:
{
return gtNewIconNode(-1);
}

case TYP_LONG:
case TYP_ULONG:
{
return gtNewLconNode(-1);
}
@@ -20925,8 +20939,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(

unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

GenTree* nonConstantByteShiftCountOp = NULL;

if (op2->IsCnsIntOrI())
{
op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21090,39 +21102,73 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}

#if defined(TARGET_XARCH)
case GT_RSZ:
case GT_LSH:
case GT_RSH:
case GT_RSZ:
{
// We don't have actual instructions for shifting bytes, so we'll emulate them
// by shifting 32-bit values and masking off the bits that should be zeroed.
// This emulates byte shift instructions, which don't exist in x86 SIMD,
// plus arithmetic shift of qwords, which did not exist before AVX-512.

assert(varTypeIsByte(simdBaseType));
assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));

intrinsic =
GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
// We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op;
intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup,
genActualType(simdBaseType), simdSize, false);
assert(intrinsic != NI_Illegal);

GenTree* maskAmountOp;

if (op2->IsCnsIntOrI())
{
ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);

maskAmountOp = gtNewIconNode(mask, type);
if (varTypeIsByte(simdBaseType))
{
ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
maskAmountOp = gtNewIconNode(mask, type);
}
else
{
int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
maskAmountOp = gtNewLconNode(mask);
}
}
else
{
assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));

GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
if (op == GT_RSH)
{
// For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
// the mask will be evaluated before the shift. We swap the copied operand with the shift amount
// operand here in order to preserve correct evaluation order for the masked shift count.
std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
}

maskAmountOp = gtNewOperNode(instrOp, genActualType(simdBaseType), gtNewAllBitsSetConNode(simdBaseType),
shiftCountDup);
}

GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
if (op == GT_RSH)
{
GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* signOp =
gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize);

CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType;
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize);
}
else
{
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
}
}
#endif // TARGET_XARCH
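
To make the GT_RSH path above concrete, here is a minimal standalone sketch of the same idea in SSE terms, mirroring the ConditionalSelect the JIT builds (the function name and intrinsic choices are assumptions, not the JIT's actual output): shift logically, build a mask of the surviving low bits, and take the sign bits wherever the mask is clear.

#include <immintrin.h>   // _mm_cmpgt_epi64 requires SSE4.2
#include <stdint.h>

// Arithmetic right shift of each 64-bit lane by a constant 'count' (0..63) without vpsraq.
static __m128i EmulatedShiftRightArithmeticInt64(__m128i value, int count)
{
    __m128i shifted = _mm_srli_epi64(value, count);                      // logical shift: high bits zero-filled
    __m128i sign    = _mm_cmpgt_epi64(_mm_setzero_si128(), value);       // all-ones per lane where value < 0
    __m128i mask    = _mm_set1_epi64x((int64_t)(~0ULL >> count));        // low (64 - count) bits survive the shift
    // conditional select: shifted bits where the mask is set, sign bits elsewhere
    return _mm_or_si128(_mm_and_si128(mask, shifted), _mm_andnot_si128(mask, sign));
}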

15 changes: 6 additions & 9 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -3443,20 +3443,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if (varTypeIsByte(simdBaseType))
{
// byte and sbyte would require more work to support
break;
}

if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
#if defined(TARGET_X86)
if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE))
{
if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI())
{
// long, ulong, and double would require more work to support
// If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const
// amount. Otherwise, more work is required for long types, so we fall back to managed for now.
break;
}
}
#endif // TARGET_X86

if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
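
As a reference point for the check above (illustrative, not part of the change): when AVX-512VL is available, the 64-bit arithmetic shift exists natively as vpsraq, so no emulation is needed; the corresponding C intrinsic is shown below. With a constant count, the 64-bit mask in gentree.cpp folds to a literal; with a variable count on 32-bit targets, the mask would presumably need a decomposed long shift, which is why that case still falls back to managed code for now.

#include <immintrin.h>   // requires AVX-512F + AVX-512VL

// With EVEX support the shift is a single instruction; no masking or select is required.
__m128i NativeShiftRightArithmeticInt64(__m128i value)
{
    return _mm_srai_epi64(value, 3);   // arithmetic shift of each 64-bit lane by an immediate
}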
2 changes: 1 addition & 1 deletion src/coreclr/jit/importercalls.cpp
@@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,

bool betterToExpand = false;

// Allow some lighweight intrinsics in Tier0 which can improve throughput
// Allow some lightweight intrinsics in Tier0 which can improve throughput
// we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
if (!mustExpand && opts.Tier0OptimizationEnabled())
{
