From f2290336b7466ba51f55a51f2d2357c81ccdbe17 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 6 Jan 2015 23:00:39 +0000
Subject: [PATCH] R600/SI: Add basic DAG combines for fp_class

llvm-svn: 225306
---
 llvm/lib/Target/R600/SIISelLowering.cpp     |  49 ++++++++-
 llvm/lib/Target/R600/SIISelLowering.h       |   2 +
 llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll | 162 ++++++++++++++++++++++++++++
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp
index a2ea2fa..9f0d793 100644
--- a/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -218,7 +218,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::FMAXNUM);
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
-
+  setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::UINT_TO_FP);
 
   // All memory operations. Some folding on the pointer operand is done to help
@@ -1302,6 +1302,49 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
 }
 
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+    SDValue Src = LHS.getOperand(0);
+    if (Src != RHS.getOperand(0))
+      return SDValue();
+
+    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    if (!CLHS || !CRHS)
+      return SDValue();
+
+    // Only 10 bits are used.
+    static const uint32_t MaxMask = 0x3ff;
+
+    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+    return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+                       Src, DAG.getConstant(NewMask, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Mask = N->getOperand(1);
+
+  // fp_class x, 0 -> false
+  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+    if (CMask->isNullValue())
+      return DAG.getConstant(0, MVT::i1);
+  }
+
+  return SDValue();
+}
+
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
@@ -1531,6 +1574,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
   }
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
diff --git a/llvm/lib/Target/R600/SIISelLowering.h b/llvm/lib/Target/R600/SIISelLowering.h
index 7bf406e..4da8628 100644
--- a/llvm/lib/Target/R600/SIISelLowering.h
+++ b/llvm/lib/Target/R600/SIISelLowering.h
@@ -58,6 +58,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SDValue performSHLPtrCombine(SDNode *N,
                                unsigned AS,
                                DAGCombinerInfo &DCI) const;
+  SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll
index f27cec1..974e3c7 100644
--- a/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll
+++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll
@@ -331,5 +331,167 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
   ret void
 }
 
+; SI-LABEL: {{^}}test_fold_or_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2
+
+  %sext = sext i1 %or.1 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+  %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
+  %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
+  %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+  %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
+  %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
+  %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+  %or.0 = or i1 %class0, %class1
+  %or.1 = or i1 %or.0, %class2
+  %or.2 = or i1 %or.1, %class3
+  %or.3 = or i1 %or.2, %class4
+  %or.4 = or i1 %or.3, %class5
+  %or.5 = or i1 %or.4, %class6
+  %or.6 = or i1 %or.5, %class7
+  %or.7 = or i1 %or.6, %class8
+  %or.8 = or i1 %or.7, %class9
+  %sext = sext i1 %or.8 to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_1:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_2:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
+; SI: s_or_b64
+; SI: s_endpgm
+define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load float addrspace(1)* %gep.in
+
+  %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+  %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+  %or = or i1 %class0, %class1
+
+  %sext = sext i1 %or to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f32:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f64:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
-- 
2.7.4