From: Matt Arsenault Date: Thu, 22 May 2014 18:09:07 +0000 (+0000) Subject: R600: Add dag combine for BFE X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5565f65e1398ff7633ba7e52d80f6b02066b8f33;p=platform%2Fupstream%2Fllvm.git R600: Add dag combine for BFE llvm-svn: 209461 --- diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 78e79bc..a4cb4f5 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -1292,6 +1292,17 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { DCI.CommitTargetLoweringOpt(TLO); } +template <typename IntTy> +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width) { + if (Width + Offset < 32) { + IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + return DAG.getConstant(Result, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, MVT::i32); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -1338,6 +1349,64 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { return CombineMinMax(N, DAG); } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, MVT::i32); + + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + } + + if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + Val->getSExtValue(), + OffsetVal, + WidthVal); + } + + return constantFoldBFE<uint32_t>(DAG, + Val->getZExtValue(), + OffsetVal, + WidthVal); + } + + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } } return SDValue(); } @@ -1560,6 +1629,11 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return std::max(SignBits, Op0SignBits); } + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ? 
32 - (Width->getZExtValue() & 0x1f) : 1; + } + default: return 1; } diff --git a/llvm/lib/Target/R600/R600ISelLowering.cpp b/llvm/lib/Target/R600/R600ISelLowering.cpp index 489565e6..d6c6830 100644 --- a/llvm/lib/Target/R600/R600ISelLowering.cpp +++ b/llvm/lib/Target/R600/R600ISelLowering.cpp @@ -1762,7 +1762,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, NewArgs); } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } static bool diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp index b51c46c..c9e247c 100644 --- a/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/llvm/lib/Target/R600/SIISelLowering.cpp @@ -1075,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll index b3fec06..71d2b6e 100644 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll @@ -48,3 +48,251 @@ define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) no store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: @bfe_i32_arg_0_width_reg_offset +; SI-NOT: BFE +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_arg_0_width_imm_offset +; SI-NOT: BFE +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_0 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_1 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_2 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_3 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind 
readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_4 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_5 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_6 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0xffffff80 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_7 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_8 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_9 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_10 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_11 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -6 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_12 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) 
nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_13 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_14 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_15 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_16 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_17 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_i32_constant_fold_test_18 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll index 0d47863..6ed1ad5 100644 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll @@ -38,3 +38,422 @@ define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: @bfe_u32_arg_0_width_reg_offset +; SI-NOT: BFE +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_arg_0_width_imm_offset +; SI-NOT: BFE +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { 
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zextload_i8 +; SI: BUFFER_LOAD_UBYTE +; SI-NOT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i8 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI-NEXT: V_AND_B32_e32 +; SI-NOT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i16 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI-NEXT: V_AND_B32_e32 +; SI-NOT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_1 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_3 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0xf8 +; SI-NEXT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_7 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0x80 +; SI-NEXT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_zext_in_reg_i16_offset_8 +; SI: BUFFER_LOAD_DWORD +; SI: V_ADD_I32 +; SI-NEXT: BFE +; SI: S_ENDPGM +define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_1 +; SI: BUFFER_LOAD_DWORD +; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; SI: S_ENDPGM +; EG: BFE_UINT +define void @bfe_u32_test_1(i32 addrspace(1)* %out, 
i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_4 +; SI-NOT: LSHL +; SI-NOT: SHR +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_5 +; SI: BUFFER_LOAD_DWORD +; SI-NOT: LSHL +; SI-NOT: SHR +; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; SI: S_ENDPGM +define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_6 +; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 1, 31 +; SI: S_ENDPGM +define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_7 +; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: BFE +; SI: S_ENDPGM +define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_test_8 +; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 31, 1 +; SI: S_ENDPGM +define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_0 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_1 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define 
void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_2 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_3 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_4 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_5 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_6 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x80 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_7 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_8 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_9 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_10 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 
+; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_11 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_12 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_13 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_14 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_15 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10 +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_u32_constant_fold_test_16 +; SI-NOT: BFE +; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: BUFFER_STORE_DWORD [[VREG]], +; SI: S_ENDPGM +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/R600/sext-in-reg.ll b/llvm/test/CodeGen/R600/sext-in-reg.ll index d2ab0b9..404c9b8 100644 --- a/llvm/test/CodeGen/R600/sext-in-reg.ll +++ b/llvm/test/CodeGen/R600/sext-in-reg.ll @@ -382,10 +382,57 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone +; FUNC-LABEL: @bfe_0_width +; SI-NOT: BFE +; SI: S_ENDPGM +define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32 addrspace(1)* %ptr, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_8_bfe_8 +; SI: V_BFE_I32 +; SI-NOT: BFE +; SI: S_ENDPGM +define void 
@bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @bfe_8_bfe_16 +; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; SI: S_ENDPGM +define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; This really should be folded into 1 +; FUNC-LABEL: @bfe_16_bfe_8 +; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; SI: S_ENDPGM +define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + ; Make sure there isn't a redundant BFE ; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe ; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000 ; SI-NOT: BFE +; SI: S_ENDPGM define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone @@ -394,3 +441,40 @@ define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) n store i32 %ashr, i32 addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe_wrong +define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @sextload_i8_to_i32_bfe +; SI: BUFFER_LOAD_SBYTE +; SI-NOT: BFE +; SI: S_ENDPGM +define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @sextload_i8_to_i32_bfe_0: +; SI-NOT: BFE +; SI: S_ENDPGM +define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +}
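
Illustrative note (not part of the commit): the values checked by the bfe_*_constant_fold_* tests above come from the shift pair in the new constantFoldBFE helper. The standalone C++ sketch below mirrors that arithmetic so the expected constants can be reproduced outside of llc; the foldBFE and main names are made up for this example, and it assumes the same 32-bit signed/unsigned typing as the helper.

#include <cassert>
#include <cstdint>

// Mirror of the patch's constantFoldBFE arithmetic: extract Width bits of Src0
// starting at Offset, sign extending for int32_t and zero extending for uint32_t.
// Offset and Width are assumed to be pre-masked to 5 bits and Width != 0; the DAG
// combine returns 0 before reaching the helper when the width is zero.
// Like the in-tree helper, the signed path relies on ordinary two's-complement
// wrapping of the left shift and an arithmetic right shift.
template <typename IntTy>
static IntTy foldBFE(IntTy Src0, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    // Shift the field up against bit 31, then shift it back down.
    return (Src0 << (32 - Offset - Width)) >> (32 - Width);
  }
  // The field reaches bit 31, so a single right shift extracts it.
  return Src0 >> Offset;
}

int main() {
  // @bfe_i32_constant_fold_test_11: bfe.i32(160, 4, 4) folds to -6.
  assert(foldBFE<int32_t>(160, 4, 4) == -6);
  // @bfe_i32_constant_fold_test_6: bfe.i32(128, 0, 8) folds to 0xffffff80.
  assert(foldBFE<int32_t>(128, 0, 8) == int32_t(0xffffff80));
  // @bfe_u32_constant_fold_test_11: bfe.u32(160, 4, 4) folds to 10.
  assert(foldBFE<uint32_t>(160u, 4, 4) == 10u);
  // @bfe_u32_constant_fold_test_16: bfe.u32(0xffffffff, 1, 7) folds to 0x7f.
  assert(foldBFE<uint32_t>(0xffffffffu, 1, 7) == 0x7fu);
  return 0;
}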