JIT: Emulate missing x86 shift instructions for xplat intrinsics #111108

Merged
merged 8 commits into from Jan 10, 2025

Changes from 7 commits
10 changes: 9 additions & 1 deletion src/coreclr/jit/emitxarch.cpp
@@ -12358,10 +12358,18 @@ void emitter::emitDispIns(
reg2 = reg3;
reg3 = tmp;
}

emitAttr attr3 = attr;
if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0))
{
// Shift instructions take xmm for the 3rd operand regardless of instruction size.
attr3 = EA_16BYTE;
}
Comment on lines +12362 to +12367

Member:
Not quite sure this is the "best" fix.

This feels like a case where we could use the existing memory-operand-size tags for the register as well, rather than hardcoding a one-off special case.

Not going to block on it, especially since this is only used for disassembly; but it's something that would be nice to improve for this and other cases where the last operand is a different register size than the others.

Member Author:

Can you elaborate on your preferred fix here? This is already using the tuple type rather than a hard-coded instruction list -- I just call out shift in the comment because it's the only case I'm aware of that uses a full xmm/m128 operand for what is ultimately an 8-bit scalar, and this should be the only encoding form where the xmm fix is required.
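
For context on what the attr3 override changes: the shift-by-vector forms (vpsllw/vpsrlw/vpsraw and their dword/qword variants) always take their count operand as xmm/m128, even when the instruction itself operates on 256- or 512-bit registers. An illustrative before/after of the disassembly (example operands are mine, not taken from the PR):

```asm
; VPSRAQ zmm1 {k1}{z}, zmm2, xmm3/m128 -- the count operand is 128-bit by definition
vpsraq zmm0, zmm1, zmm2    ; displayed before this change (count register shown at full instruction size)
vpsraq zmm0, zmm1, xmm2    ; displayed after this change (EA_16BYTE override)
```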


printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, ", emitRegName(reg2, attr));
printf("%s", emitRegName(reg3, attr));
printf("%s", emitRegName(reg3, attr3));
emitDispEmbRounding(id);
break;
}
66 changes: 50 additions & 16 deletions src/coreclr/jit/gentree.cpp
@@ -20925,8 +20925,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(

unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

GenTree* nonConstantByteShiftCountOp = NULL;

if (op2->IsCnsIntOrI())
{
op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21090,39 +21088,75 @@
}

#if defined(TARGET_XARCH)
case GT_RSZ:
case GT_LSH:
case GT_RSH:
case GT_RSZ:
{
// We don't have actual instructions for shifting bytes, so we'll emulate them
// by shifting 32-bit values and masking off the bits that should be zeroed.
// This emulates byte shift instructions, which don't exist in x86 SIMD,
// plus arithmetic shift of qwords, which did not exist before AVX-512.

assert(varTypeIsByte(simdBaseType));
assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));

intrinsic =
GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
// We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op;
intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup,
genActualType(simdBaseType), simdSize, false);
assert(intrinsic != NI_Illegal);

GenTree* maskAmountOp;

if (op2->IsCnsIntOrI())
{
ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
ssize_t mask = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);

maskAmountOp = gtNewIconNode(mask, type);
if (varTypeIsByte(simdBaseType))
{
ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
maskAmountOp = gtNewIconNode(mask, type);
}
else
{
int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
maskAmountOp = gtNewLconNode(mask);
}
}
else
{
assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));

GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
if (op == GT_RSH)
{
// For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
// the mask will be evaluated before the shift. We swap the copied operand with the shift amount
// operand here in order to preserve correct evaluation order for the masked shift count.
std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
}

maskAmountOp =
varTypeIsByte(simdBaseType)
? gtNewOperNode(instrOp, TYP_INT, gtNewIconNode(0xFF), shiftCountDup)
: gtNewOperNode(instrOp, simdBaseType, gtNewLconNode(0xFFFFFFFFFFFFFFFFULL), shiftCountDup);
}

GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
if (op == GT_RSH)
{
GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* signOp =
gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize);

CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType;
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize);
}
else
{
GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
}
}
#endif // TARGET_XARCH
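
As a minimal illustration of the byte-shift emulation in the hunk above (my own scalar sketch, not code from the PR): shift the containing 32-bit lane with an instruction that does exist, then AND with a broadcast per-byte mask so that bits which crossed a byte boundary are cleared.

```cpp
#include <cassert>
#include <cstdint>

// Reference behavior: shift each of the four packed bytes in a 32-bit lane independently.
static uint32_t ByteShiftLeftReference(uint32_t lane, unsigned shiftCount)
{
    uint32_t result = 0;
    for (int i = 0; i < 4; i++)
    {
        uint8_t b = static_cast<uint8_t>(lane >> (8 * i));
        result |= static_cast<uint32_t>(static_cast<uint8_t>(b << shiftCount)) << (8 * i);
    }
    return result;
}

// Emulated behavior, modeled on one lane: a full 32-bit shift (what vpslld does to the
// whole vector) followed by the broadcast mask ((0xFF << n) & 0xFF) computed above.
static uint32_t ByteShiftLeftEmulated(uint32_t lane, unsigned shiftCount)
{
    uint32_t shifted = lane << shiftCount;
    uint32_t mask    = (0xFFu << shiftCount) & 0xFFu; // per-byte mask the JIT broadcasts
    return shifted & (mask * 0x01010101u);            // clear bits that leaked in from the lower byte
}

int main()
{
    for (unsigned n = 0; n < 8; n++)
    {
        assert(ByteShiftLeftEmulated(0x80402010u, n) == ByteShiftLeftReference(0x80402010u, n));
        assert(ByteShiftLeftEmulated(0xFFFFFFFFu, n) == ByteShiftLeftReference(0xFFFFFFFFu, n));
    }
    return 0;
}
```

Logical right shift of bytes works the same way with the complementary mask (0xFF >> n).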

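The GT_RSH path above is the less obvious half: an arithmetic right shift of qwords is rebuilt from a logical shift, the broadcast mask ~0 >> n, and a per-element sign vector, combined through ConditionalSelect, i.e. (shift & mask) | (sign & ~mask). A scalar sketch of that identity (my own naming, not the PR's code):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the ConditionalSelect(maskOp, shiftOp, signOp) tree built above:
// signBits is all-ones for negative elements (the GT_GT compare against zero),
// and mask = ~0ull >> n is the value broadcast by gtNewSimdCreateBroadcastNode.
static int64_t ArithmeticShiftRightEmulated(int64_t value, unsigned shiftCount)
{
    uint64_t logicalShift = static_cast<uint64_t>(value) >> shiftCount; // vpsrlq, available pre-AVX-512
    uint64_t mask         = ~0ULL >> shiftCount;
    uint64_t signBits     = (value < 0) ? ~0ULL : 0ULL;
    return static_cast<int64_t>((logicalShift & mask) | (signBits & ~mask));
}

int main()
{
    const int64_t samples[] = { 0, 1, -1, 123456789, -123456789, INT64_MIN, INT64_MAX };
    for (int64_t v : samples)
    {
        for (unsigned n = 0; n < 64; n++)
        {
            // Signed >> is the arithmetic shift being reproduced.
            assert(ArithmeticShiftRightEmulated(v, n) == (v >> n));
        }
    }
    return 0;
}
```

The operand swap in the non-constant case does not change this math; as the comment in the hunk says, it only keeps GenTree evaluation order correct, because the mask (and therefore the copied shift count) is now evaluated before the shift itself.
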
14 changes: 5 additions & 9 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -3443,20 +3443,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if (varTypeIsByte(simdBaseType))
{
// byte and sbyte would require more work to support
break;
}

if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
#if defined(TARGET_X86)
if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE))
{
if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI())
{
// long, ulong, and double would require more work to support
// we need vpsraq to handle signed long with non-const shift amount, use software fallback
break;
}
}
#endif // TARGET_X86

if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
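
As I read this hunk, the remaining bail-out is now only on 32-bit x86: a long or double base type with a non-constant shift count is sent to the managed software fallback unless EVEX (and therefore vpsraq) is available. A condensed, self-contained sketch of that decision (the enum and function names are illustrative, not the JIT's):

```cpp
#include <cstdio>

enum class BaseType { Int, Long, Double };

// Sketch of the import-time gate added above (assumed simplification, not real JIT code).
static bool WillAccelerateShift(BaseType baseType, bool shiftCountIsConstant, bool hasEvex, bool is32BitX86)
{
    if (is32BitX86 && (baseType == BaseType::Long || baseType == BaseType::Double))
    {
        if (!hasEvex && !shiftCountIsConstant)
        {
            return false; // signed 64-bit shift with a variable count: use the software fallback
        }
    }
    return true; // everything else is expanded as an intrinsic
}

int main()
{
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ false, /*evex*/ false, /*x86*/ true)); // 0
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ true,  /*evex*/ false, /*x86*/ true)); // 1
    std::printf("%d\n", WillAccelerateShift(BaseType::Long, /*constCount*/ false, /*evex*/ true,  /*x86*/ true)); // 1
    return 0;
}
```
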
2 changes: 1 addition & 1 deletion src/coreclr/jit/importercalls.cpp
@@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,

bool betterToExpand = false;

// Allow some lighweight intrinsics in Tier0 which can improve throughput
// Allow some lightweight intrinsics in Tier0 which can improve throughput
// we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
if (!mustExpand && opts.Tier0OptimizationEnabled())
{