From 8df64ed77727ab9b7540819f2fe64379e88a50be Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 12 Jan 2025 22:03:21 +0000 Subject: [PATCH] [LV] Don't consider IV increments uniform if exit value is used outside. In some cases, there might be a chain of uniform instructions producing the exit value. To generate correct code in all cases, consider the IV increment not uniform if it has users outside the loop. Instead, let VPlan narrow the IV, if possible, using the logic from 3ff1d01985752. Test case from #122602 verified with Alive2: https://alive2.llvm.org/ce/z/bA4EGj Fixes https://github.com/llvm/llvm-project/issues/122496. Fixes https://github.com/llvm/llvm-project/issues/122602. --- .../Transforms/Vectorize/LoopVectorize.cpp | 2 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 22 ----- .../LoopVectorize/X86/uniform-phi.ll | 2 - .../LoopVectorize/iv_outside_user.ll | 17 ++-- .../LoopVectorize/scalable-iv-outside-user.ll | 90 +++++++++++++++++++ 5 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b017b61a45a0c32..d32a463a996c4f8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3806,7 +3806,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // uniform after vectorization. 
bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + return I == Ind || Worklist.count(I) || IsVectorizedMemAccessUse(I, IndUpdate); }); if (!UniformIndUpdate) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f440bf2eb022376..545d277d7aa0187 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -621,28 +621,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->replaceAllUsesWith(Clone); } - // Check if any uniform VPReplicateRecipes using the phi recipe are used by - // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to - // ensure the final value is available. - // TODO: Remove once uniformity analysis is done on VPlan. - for (VPUser *U : Users) { - auto *ExitIRI = dyn_cast(U); - VPValue *Op; - if (!ExitIRI || !match(ExitIRI->getOperand(0), - m_VPInstruction( - m_VPValue(Op), m_VPValue()))) - continue; - auto *RepR = dyn_cast(Op); - if (!RepR || !RepR->isUniform()) - continue; - assert(!RepR->isPredicated() && "RepR must not be predicated"); - Instruction *I = RepR->getUnderlyingInstr(); - auto *Clone = - new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false); - Clone->insertAfter(RepR); - RepR->replaceAllUsesWith(Clone); - } - // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). 
if (auto *PtrIV = dyn_cast(&Phi)) { diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll index f0641154f85c7b0..5e97627e7688b3e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll @@ -51,8 +51,6 @@ for.end: ; CHECK-LABEL: goo ; Check %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of loop. -; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599 define i64 @goo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 6b0c677b56d2c6c..3e61546da2cebc1 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -1176,7 +1176,6 @@ e.exit: } ; Test case for https://github.com/llvm/llvm-project/issues/122496. -; FIXME: Currently an incorrect live-out is used. 
define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-LABEL: define i32 @iv_ext_used_outside( ; VEC-SAME: ptr [[DST:%.*]]) { @@ -1186,15 +1185,19 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 ; VEC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0 ; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4 -; VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1 +; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1) +; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 ; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32 -; VEC-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32 +; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 +; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: @@ -1213,7 +1216,7 @@ define i32 @iv_ext_used_outside( ptr %dst) { ; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128 ; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}} ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], 
%[[MIDDLE_BLOCK]] ] ; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]] ; ; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside( @@ -1274,7 +1277,6 @@ exit: } ; Test case for https://github.com/llvm/llvm-project/issues/122602. -; FIXME: Currently an incorrect live-out is used. define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-LABEL: define i64 @test_iv_increment_incremented( ; VEC-SAME: ptr [[DST:%.*]]) { @@ -1288,8 +1290,9 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 ; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1 +; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 ; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1 -; VEC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 1 +; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -1307,7 +1310,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1 ; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] ; ; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented( diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll new file mode 100644 index 000000000000000..d88b0583ffc042e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -scalable-vectorization=on 
-force-target-supports-scalable-vectors=true -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck %s + +define i32 @iv_live_out_wide(ptr %dst) { +; CHECK-LABEL: define i32 @iv_live_out_wide( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 +; CHECK-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[STEP_2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]] +; CHECK-NEXT: store zeroinitializer, ptr [[TMP11]], align 2 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP14]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add [[BROADCAST_SPLAT2]], [[STEP_ADD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[E_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i16 0, ptr [[GEP_DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 2000 +; CHECK-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: 
[[E_EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %step.1 = sext i8 0 to i32 + %step.2 = add nsw i32 %step.1, 1 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv + store i16 0, ptr %gep.dst, align 2 + %iv.next = add i32 %step.2, %iv + %cmp.i = icmp slt i32 %iv.next, 2000 + br i1 %cmp.i, label %loop, label %e.exit + +e.exit: + %res = phi i32 [ %iv.next, %loop ] + ret i32 %res +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;.