From ca3976f7aef49af78926ff542809e28c2eac22f1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 15 Jul 2014 02:06:31 +0000 Subject: [PATCH] R600: Add dag combine for copy of an illegal type. This helps avoid redundant instructions to unpack, and repack the vectors. Ideally we could recognize that pattern and eliminate it. Currently v4i8 and other small element type vectors are scalarized, so this has the added bonus of avoiding that. llvm-svn: 213031 --- llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 56 ++++++++- llvm/lib/Target/R600/AMDGPUISelLowering.h | 1 + llvm/test/CodeGen/R600/copy-illegal-type.ll | 166 ++++++++++++++++++++++++++ llvm/test/CodeGen/R600/indirect-private-64.ll | 24 ++-- llvm/test/CodeGen/R600/load.ll | 6 +- 5 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/R600/copy-illegal-type.ll diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 9777142..aae275a 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -360,6 +360,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); @@ -1896,6 +1897,56 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, return DAG.getConstant(Src0 >> Offset, MVT::i32); } +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. Ideally we could +// recognize the pack / unpack pattern to eliminate it. 
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast<StoreSDNode>(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) + return SDValue(); + + LoadSDNode *LoadVal = cast<LoadSDNode>(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); @@ -1928,7 +1979,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, } SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -2026,6 +2077,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + + case ISD::STORE: + return performStoreCombine(N, DCI); } return SDValue(); } diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index 98a92ad..4445f81 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -64,6 +64,7 @@ private: SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo 
&DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: diff --git a/llvm/test/CodeGen/R600/copy-illegal-type.ll b/llvm/test/CodeGen/R600/copy-illegal-type.ll new file mode 100644 index 0000000..f7c2321 --- /dev/null +++ b/llvm/test/CodeGen/R600/copy-illegal-type.ll @@ -0,0 +1,166 @@ +; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: @test_copy_v4i8 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x2 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x3 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x4 +; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: 
BUFFER_STORE_DWORD [[REG]] +; SI: BUFFER_STORE_DWORD [[REG]] +; SI: S_ENDPGM +define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_extra_use +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI_DAG: BUFFER_STORE_BYTE + +; After scalarizing v4i8 loads is fixed. 
+; XSI: BUFFER_LOAD_DWORD +; XSI: V_BFE +; XSI: V_ADD +; XSI: V_ADD +; XSI: V_ADD +; XSI: BUFFER_STORE_DWORD +; XSI: BUFFER_STORE_DWORD + +; SI: S_ENDPGM +define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_x2_extra_use +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: V_ADD +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI_DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI-DAG: BUFFER_STORE_BYTE +; SI_DAG: BUFFER_STORE_BYTE + +; XSI: BUFFER_LOAD_DWORD +; XSI: BFE +; XSI: BUFFER_STORE_DWORD +; XSI: V_ADD +; XSI: BUFFER_STORE_DWORD +; XSI-NEXT: BUFFER_STORE_DWORD + +; SI: S_ENDPGM +define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v3i8 +; SI-NOT: BFE +; SI-NOT: BFI +; SI: S_ENDPGM +define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8> addrspace(1)* %in, align 4 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
@test_copy_v4i8_volatile_load +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: S_ENDPGM +define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load volatile <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_copy_v4i8_volatile_store +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_LOAD_UBYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: BUFFER_STORE_BYTE +; SI: S_ENDPGM +define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8> addrspace(1)* %in, align 4 + store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/R600/indirect-private-64.ll b/llvm/test/CodeGen/R600/indirect-private-64.ll index 1e23fd7..2f62845 100644 --- a/llvm/test/CodeGen/R600/indirect-private-64.ll +++ b/llvm/test/CodeGen/R600/indirect-private-64.ll @@ -31,10 +31,14 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-ALLOCA: V_MOVRELS_B32_e32 ; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-PROMOTE: DS_WRITE_B64 -; SI-PROMOTE: DS_WRITE_B64 -; SI-PROMOTE: DS_READ_B64 -; SI-PROMOTE: DS_READ_B64 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 @@ -77,10 +81,14 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-ALLOCA: 
V_MOVRELS_B32_e32 ; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-PROMOTE: DS_WRITE_B64 -; SI-PROMOTE: DS_WRITE_B64 -; SI-PROMOTE: DS_READ_B64 -; SI-PROMOTE: DS_READ_B64 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_WRITE_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 +; SI-PROMOTE: DS_READ_B32 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 diff --git a/llvm/test/CodeGen/R600/load.ll b/llvm/test/CodeGen/R600/load.ll index a57df5c..8905fbd 100644 --- a/llvm/test/CodeGen/R600/load.ll +++ b/llvm/test/CodeGen/R600/load.ll @@ -254,8 +254,8 @@ entry: ; load a v2f32 value from the global address space ; FUNC-LABEL: @load_v2f32 +; R600-CHECK: MEM_RAT ; R600-CHECK: VTX_READ_64 - ; SI-CHECK: BUFFER_LOAD_DWORDX2 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { entry: @@ -265,9 +265,7 @@ entry: } ; FUNC-LABEL: @load_i64 -; R600-CHECK: MEM_RAT -; R600-CHECK: MEM_RAT - +; R600-CHECK: VTX_READ_64 ; SI-CHECK: BUFFER_LOAD_DWORDX2 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: -- 2.7.4