[LV] Don't consider IV increments uniform if exit value is used outside.
In some cases, there may be a chain of uniform instructions producing
the exit value. To generate correct code in all cases, do not consider
the IV increment uniform if it has users outside the loop.

Instead, let VPlan narrow the IV where possible, using the logic from
3ff1d01.

Test case from #122602 verified with Alive2:
    https://alive2.llvm.org/ce/z/bA4EGj

Fixes #122496.
Fixes #122602.
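
For illustration only, a minimal sketch (not taken from the patch or its tests,
function name hypothetical) of the loop shape this change targets: the
otherwise-uniform IV increment %iv.next feeds a chain (here a zext) whose
result is used by the LCSSA phi in the exit block. With this change the
increment is no longer classified as uniform in collectLoopUniforms, and VPlan
may still narrow it later if all in-loop users only need the scalar value.

; Sketch, not part of this commit.
define i32 @iv_increment_used_outside(ptr %dst) {
entry:
  br label %loop

loop:
  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
  %gep.dst = getelementptr inbounds i32, ptr %dst, i16 %iv
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i16 %iv, 1
  ; chain of uniform instructions producing the exit value
  %iv.next.ext = zext i16 %iv.next to i32
  %ec = icmp ult i16 %iv.next, 128
  br i1 %ec, label %loop, label %exit

exit:
  ; user of the increment (via the zext) outside the loop
  %res = phi i32 [ %iv.next.ext, %loop ]
  ret i32 %res
}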
fhahn committed Jan 12, 2025
1 parent b4ce29a commit 8df64ed
Showing 5 changed files with 101 additions and 32 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3806,7 +3806,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// uniform after vectorization.
bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
return I == Ind || Worklist.count(I) ||
IsVectorizedMemAccessUse(I, IndUpdate);
});
if (!UniformIndUpdate)
22 changes: 0 additions & 22 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -621,28 +621,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
Def->replaceAllUsesWith(Clone);
}

// Check if any uniform VPReplicateRecipes using the phi recipe are used by
// ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
// ensure the final value is available.
// TODO: Remove once uniformity analysis is done on VPlan.
for (VPUser *U : Users) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
VPValue *Op;
if (!ExitIRI || !match(ExitIRI->getOperand(0),
m_VPInstruction<VPInstruction::ExtractFromEnd>(
m_VPValue(Op), m_VPValue())))
continue;
auto *RepR = dyn_cast<VPReplicateRecipe>(Op);
if (!RepR || !RepR->isUniform())
continue;
assert(!RepR->isPredicated() && "RepR must not be predicated");
Instruction *I = RepR->getUnderlyingInstr();
auto *Clone =
new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false);
Clone->insertAfter(RepR);
RepR->replaceAllUsesWith(Clone);
}

// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
2 changes: 0 additions & 2 deletions llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
@@ -51,8 +51,6 @@ for.end:

; CHECK-LABEL: goo
; Check %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of loop.
; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599

define i64 @goo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) #0 {
17 changes: 10 additions & 7 deletions llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1176,7 +1176,6 @@ e.exit:
}

; Test case for https://github.com/llvm/llvm-project/issues/122496.
; FIXME: Currently an incorrect live-out is used.
define i32 @iv_ext_used_outside( ptr %dst) {
; VEC-LABEL: define i32 @iv_ext_used_outside(
; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1186,15 +1185,19 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; VEC-NEXT: br label %[[VECTOR_BODY:.*]]
; VEC: [[VECTOR_BODY]]:
; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; VEC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]]
; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4
; VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1
; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1)
; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32
; VEC-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32
; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
@@ -1213,7 +1216,7 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128
; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}}
; VEC: [[EXIT]]:
; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]]
;
; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside(
@@ -1274,7 +1277,6 @@ exit:
}

; Test case for https://github.com/llvm/llvm-project/issues/122602.
; FIXME: Currently an incorrect live-out is used.
define i64 @test_iv_increment_incremented(ptr %dst) {
; VEC-LABEL: define i64 @test_iv_increment_incremented(
; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1288,8 +1290,9 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1
; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2
; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1
; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1
; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1
; VEC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 1
; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1
; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VEC: [[MIDDLE_BLOCK]]:
; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -1307,7 +1310,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1
; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
; VEC: [[EXIT]]:
; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]]
;
; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented(
90 changes: 90 additions & 0 deletions llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -0,0 +1,90 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -scalable-vectorization=on -force-target-supports-scalable-vectors=true -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck %s

define i32 @iv_live_out_wide(ptr %dst) {
; CHECK-LABEL: define i32 @iv_live_out_wide(
; CHECK-SAME: ptr [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32
; CHECK-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2
; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i32> [[TMP7]], splat (i32 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP8]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]]
; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP11]], align 2
; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP14]], align 2
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i32> [[BROADCAST_SPLAT2]], [[STEP_ADD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 2
; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i32> [[TMP15]], i32 [[TMP19]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
; CHECK-NEXT: store i16 0, ptr [[GEP_DST]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 2000
; CHECK-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[E_EXIT]]:
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RES]]
;
entry:
%step.1 = sext i8 0 to i32
%step.2 = add nsw i32 %step.1, 1
br label %loop

loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
store i16 0, ptr %gep.dst, align 2
%iv.next = add i32 %step.2, %iv
%cmp.i = icmp slt i32 %iv.next, 2000
br i1 %cmp.i, label %loop, label %e.exit

e.exit:
%res = phi i32 [ %iv.next, %loop ]
ret i32 %res
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.
