From: Matt Arsenault Date: Thu, 16 Jan 2020 14:59:56 +0000 (-0500) Subject: AMDGPU: Do permlane16 vdst_in discard optimization in InstCombine X-Git-Tag: llvmorg-12-init~17679 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3ef8cdf6660fc20baeb09eae5008b741f178b624;p=platform%2Fupstream%2Fllvm.git AMDGPU: Do permlane16 vdst_in discard optimization in InstCombine There's more potential value to discarding the source value earlier, since we always know the value of the fi/bc bits. --- diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index f463c5f..f3019fe 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3959,6 +3959,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { II->setOperand(0, UndefValue::get(Old->getType())); return II; } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + // Discard vdst_in if it's not going to be read. + Value *VDstIn = II->getArgOperand(0); + if (isa(VDstIn)) + break; + + ConstantInt *FetchInvalid = cast(II->getArgOperand(4)); + ConstantInt *BoundCtrl = cast(II->getArgOperand(5)); + if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) + break; + + II->setArgOperand(0, UndefValue::get(VDstIn->getType())); + return II; + } case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { // A constant value is trivially uniform. diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 7c11a74..1c622a3 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2655,5 +2655,83 @@ define amdgpu_kernel void @update_dpp_undef_old(i32 addrspace(1)* %out, i32 %in1 ret void } -; CHECK: attributes [[CONVERGENT]] = { convergent } +; -------------------------------------------------------------------- +; llvm.amdgcn.permlane16 +; -------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg) + +define amdgpu_kernel void @permlane16(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @permlane16_bound_ctrl(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16_bound_ctrl( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.permlanex16 +; -------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1 immarg, i1 immarg) + +define amdgpu_kernel void @permlanex16(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @permlanex16_bound_ctrl(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16_bound_ctrl( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) { +; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl( +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; CHECK: attributes [[CONVERGENT]] = { convergent }