From 66efb986322b206834e7c9e1eb777fa053912c39 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Fri, 30 Dec 2022 19:06:59 +0300 Subject: [PATCH] [CVP] Expand bound `urem`s This kind of thing happens really frequently in LLVM's very own shuffle combining methods, and it is even considered bad practice to use `%` there, instead of using this expansion directly. Though, many of the cases there have variable divisors, so this won't help everything. Simple case: https://alive2.llvm.org/ce/z/PjvYf- There's alternative expansion via `umin`: https://alive2.llvm.org/ce/z/hWCVPb BUT while we can transform the first expansion into the `umin` one (e.g. for SCEV): https://alive2.llvm.org/ce/z/iNxKmJ ... we can't go in the opposite direction. Also, the non-`umin` expansion seems somewhat more codegen-friendly: https://godbolt.org/z/qzjx5bqWK https://godbolt.org/z/a7bj1axbx There's second variant of precondition: https://alive2.llvm.org/ce/z/zE6cbM but there the numerator must be non-undef / must be frozen. 
--- .../Scalar/CorrelatedValuePropagation.cpp | 60 ++++++++++++++++++++++ .../CorrelatedValuePropagation/urem-expansion.ll | 53 ++++++++++++++----- .../Transforms/CorrelatedValuePropagation/urem.ll | 6 +-- llvm/test/Transforms/PhaseOrdering/cmp-logic.ll | 11 ++-- 4 files changed, 108 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 5c3fdc1..1093f4e 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -769,6 +769,63 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { return true; } +static bool expandURem(BinaryOperator *Instr, LazyValueInfo *LVI) { + assert(Instr->getOpcode() == Instruction::URem); + assert(!Instr->getType()->isVectorTy()); + + Value *X = Instr->getOperand(0); + Value *Y = Instr->getOperand(1); + + ConstantRange XCR = LVI->getConstantRange(X, Instr); + ConstantRange YCR = LVI->getConstantRange(Y, Instr); + + // Given + // R = X u% Y + // We can represent the modulo operation as a loop/self-recursion: + // urem_rec(X, Y): + // Z = X - Y + // if X u< Y + // ret X + // else + // ret urem_rec(Z, Y) + // which isn't better, but if we only need a single iteration + // to compute the answer, this becomes quite good: + // R = X < Y ? X : X - Y iff X u< 2*Y (w/ unsigned saturation) + // Now, we do not care about all full multiples of Y in X, they do not change + // the answer, thus we could rewrite the expression as: + // X* = X - (Y * |_ X / Y _|) + // R = X* % Y + // so we don't need the *first* iteration to return, we just need to + // know *which* iteration will always return, so we could also rewrite it as: + // X* = X - (Y * |_ X / Y _|) + // R = X* % Y iff X* u< 2*Y (w/ unsigned saturation) + // but that does not seem profitable here. 
+ + bool XIsBound = + XCR.icmp(ICmpInst::ICMP_ULT, YCR.umul_sat(APInt(YCR.getBitWidth(), 2))); + // Even if we don't know X's range, the divisor may be so large, X can't ever + // be 2x larger than that. I.e. if divisor is always negative. + if (!XIsBound && !YCR.isAllNegative()) + return false; + + IRBuilder<> B{Instr}; + if (!XIsBound) { + // NOTE: this transformation increases use count on X, but that is fine + // unless the transformation is valid because the divisor is negative, + // and is non-variable, and thus we didn't have any extra uses. + if (auto *Ycst = dyn_cast<ConstantInt>(Y); Ycst && Ycst->isNegative()) + X = B.CreateFreeze(X, X->getName() + ".frozen"); + } + auto *AdjX = B.CreateNUWSub(X, Y, Instr->getName() + ".urem"); + auto *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, X, Y, Instr->getName() + ".cmp"); + auto *ExpandedURem = B.CreateSelect(Cmp, X, AdjX); + ExpandedURem->takeName(Instr); + Instr->replaceAllUsesWith(ExpandedURem); + Instr->eraseFromParent(); + ++NumURemExpanded; + return true; +} + static bool processURem(BinaryOperator *Instr, LazyValueInfo *LVI) { assert(Instr->getOpcode() == Instruction::URem); assert(!Instr->getType()->isVectorTy()); @@ -787,6 +844,9 @@ static bool processURem(BinaryOperator *Instr, LazyValueInfo *LVI) { return true; } + if (expandURem(Instr, LVI)) + return true; + return false; } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll b/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll index 41a6519..26e8bc3 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll @@ -20,7 +20,9 @@ define i8 @constant.divisor.v4(i8 %x) { ; CHECK-LABEL: @constant.divisor.v4( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 3 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 3 +; CHECK-NEXT: 
[[REM_CMP:%.*]] = icmp ult i8 [[X]], 3 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x.upper = icmp ult i8 %x, 4 @@ -31,7 +33,9 @@ define i8 @constant.divisor.v4(i8 %x) { define i8 @constant.divisor.x.range.v4(ptr %x.ptr) { ; CHECK-LABEL: @constant.divisor.x.range.v4( ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[X_PTR:%.*]], align 1, !range [[RNG0:![0-9]+]] -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 3 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 3 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], 3 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %x = load i8, ptr %x.ptr, !range !{ i8 0, i8 4 } @@ -42,7 +46,9 @@ define i8 @constant.divisor.v5(i8 %x) { ; CHECK-LABEL: @constant.divisor.v5( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], 5 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 3 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 3 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], 3 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x.upper = icmp ult i8 %x, 5 @@ -54,7 +60,9 @@ define i8 @constant.divisor.v6(i8 %x) { ; CHECK-LABEL: @constant.divisor.v6( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 3 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 3 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], 3 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x.upper = icmp ult i8 %x, 6 @@ -104,7 +112,9 @@ define i8 @variable.v4(i8 %x, i8 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) -; CHECK-NEXT: 
[[REM:%.*]] = urem i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x = icmp ult i8 %x, 4 @@ -120,7 +130,9 @@ define i8 @variable.v4.range(ptr %x.ptr, ptr %y.ptr) { ; CHECK-LABEL: @variable.v4.range( ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[X_PTR:%.*]], align 1, !range [[RNG0]] ; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[Y_PTR:%.*]], align 1, !range [[RNG1:![0-9]+]] -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %x = load i8, ptr %x.ptr, !range !{ i8 0, i8 4 } @@ -136,7 +148,9 @@ define i8 @variable.v5(i8 %x, i8 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x = icmp ult i8 %x, 5 @@ -156,7 +170,9 @@ define i8 @variable.v6(i8 %x, i8 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], [[Y]] +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x = icmp ult i8 %x, 6 @@ -206,7 +222,9 @@ define i8 
@large.divisor.v1(i8 %x) { ; CHECK-LABEL: @large.divisor.v1( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], -128 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 127 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 127 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], 127 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x.upper = icmp ult i8 %x, 128 @@ -217,7 +235,9 @@ define i8 @large.divisor.v1(i8 %x) { define i8 @large.divisor.v1.range(ptr %x.ptr) { ; CHECK-LABEL: @large.divisor.v1.range( ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[X_PTR:%.*]], align 1, !range [[RNG2:![0-9]+]] -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], 127 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], 127 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], 127 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %x = load i8, ptr %x.ptr, !range !{ i8 0, i8 128 } @@ -248,7 +268,9 @@ define i8 @large.divisor.with.overflow.v1(i8 %x) { ; CHECK-LABEL: @large.divisor.with.overflow.v1( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], -127 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], -128 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], -128 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X]], -128 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %cmp.x.upper = icmp ult i8 %x, 129 @@ -259,7 +281,9 @@ define i8 @large.divisor.with.overflow.v1(i8 %x) { define i8 @large.divisor.with.overflow.v1.range(ptr %x.ptr) { ; CHECK-LABEL: @large.divisor.with.overflow.v1.range( ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[X_PTR:%.*]], align 1, !range [[RNG3:![0-9]+]] -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X]], -128 +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X]], -128 +; CHECK-NEXT: [[REM_CMP:%.*]] = 
icmp ult i8 [[X]], -128 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %x = load i8, ptr %x.ptr, !range !{ i8 0, i8 129 } @@ -268,7 +292,10 @@ define i8 @large.divisor.with.overflow.v1.range(ptr %x.ptr) { } define i8 @large.divisor.with.overflow.v2.unbound.x(i8 %x) { ; CHECK-LABEL: @large.divisor.with.overflow.v2.unbound.x( -; CHECK-NEXT: [[REM:%.*]] = urem i8 [[X:%.*]], -128 +; CHECK-NEXT: [[X_FROZEN:%.*]] = freeze i8 [[X:%.*]] +; CHECK-NEXT: [[REM_UREM:%.*]] = sub nuw i8 [[X_FROZEN]], -128 +; CHECK-NEXT: [[REM_CMP:%.*]] = icmp ult i8 [[X_FROZEN]], -128 +; CHECK-NEXT: [[REM:%.*]] = select i1 [[REM_CMP]], i8 [[X_FROZEN]], i8 [[REM_UREM]] ; CHECK-NEXT: ret i8 [[REM]] ; %rem = urem i8 %x, 128 diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll b/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll index 88395ef..19dfc68 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll @@ -117,9 +117,9 @@ exit: define void @test5(i32 %n) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[TRUNC:%.*]] = and i32 [[N:%.*]], 63 -; CHECK-NEXT: [[DIV_LHS_TRUNC:%.*]] = trunc i32 [[TRUNC]] to i8 -; CHECK-NEXT: [[DIV1:%.*]] = urem i8 [[DIV_LHS_TRUNC]], 42 -; CHECK-NEXT: [[DIV_ZEXT:%.*]] = zext i8 [[DIV1]] to i32 +; CHECK-NEXT: [[DIV_UREM:%.*]] = sub nuw i32 [[TRUNC]], 42 +; CHECK-NEXT: [[DIV_CMP:%.*]] = icmp ult i32 [[TRUNC]], 42 +; CHECK-NEXT: [[DIV:%.*]] = select i1 [[DIV_CMP]], i32 [[TRUNC]], i32 [[DIV_UREM]] ; CHECK-NEXT: ret void ; %trunc = and i32 %n, 63 diff --git a/llvm/test/Transforms/PhaseOrdering/cmp-logic.ll b/llvm/test/Transforms/PhaseOrdering/cmp-logic.ll index 1d32e5d..008e88e 100644 --- a/llvm/test/Transforms/PhaseOrdering/cmp-logic.ll +++ b/llvm/test/Transforms/PhaseOrdering/cmp-logic.ll @@ -123,16 +123,15 @@ define i32 @PR56119(i32 %e.coerce) { ; ; OZ-LABEL: @PR56119( ; OZ-NEXT: entry: -; OZ-NEXT: [[E_COERCE_FR:%.*]] = freeze 
i32 [[E_COERCE:%.*]] -; OZ-NEXT: [[TMP0:%.*]] = and i32 [[E_COERCE_FR]], 255 -; OZ-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 7 -; OZ-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; OZ-NEXT: [[CONV2:%.*]] = and i32 [[E_COERCE:%.*]], 255 +; OZ-NEXT: [[CMP1:%.*]] = icmp eq i32 [[CONV2]], 7 +; OZ-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; OZ: if.then: ; OZ-NEXT: tail call void (...) @foo() ; OZ-NEXT: br label [[IF_END]] ; OZ: if.end: -; OZ-NEXT: [[TMP1:%.*]] = load i32, ptr @c, align 4 -; OZ-NEXT: ret i32 [[TMP1]] +; OZ-NEXT: [[TMP0:%.*]] = load i32, ptr @c, align 4 +; OZ-NEXT: ret i32 [[TMP0]] ; entry: %e = alloca %struct.a, align 4 -- 2.7.4