From: Tom Stellard
Date: Fri, 16 Aug 2013 01:12:11 +0000 (+0000)
Subject: R600: Add support for global vector stores with elements less than 32-bits
X-Git-Tag: llvmorg-3.4.0-rc1~6305
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fbab827e2a0781ec554750ffcd4cb0f67d6bdfee;p=platform%2Fupstream%2Fllvm.git

R600: Add support for global vector stores with elements less than 32-bits

Tested-by: Aaron Watry

llvm-svn: 188520
---
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
index 7ceab2df0ef5..78495ca8daa6 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -67,6 +67,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  // XXX: This can be changed to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
@@ -187,6 +194,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE: return LowerVectorStore(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   }
   return Op;
@@ -487,6 +495,59 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+
+  // Byte stores are really expensive, so if possible, try to pack a
+  // 32-bit vector truncating store into an i32 store.
+  // XXX: We could also optimize other vector bitwidths.
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  const SDValue &Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  const SDValue &Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  SDValue Mask;
+  switch(MemEltBits) {
+  case 8:
+    Mask = DAG.getConstant(0xFF, PackedVT);
+    break;
+  case 16:
+    Mask = DAG.getConstant(0xFFFF, PackedVT);
+    break;
+  default:
+    llvm_unreachable("Cannot lower this vector store");
+  }
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+    }
+  }
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->isVolatile(), Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
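The loop above builds, node by node, the scalar computation sketched below: each
element is extracted, zero-extended or truncated to the packed width, masked to
its memory width, shifted into position, and OR'd into a single integer of the
store's memory width; for a <4 x i8> store that is one i32 written with a single
dword store (the Evergreen store_v4i8 test below checks exactly that). A minimal
standalone C++ model of the packing; the packV4I8 name and the input values are
illustrative only, not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Scalar model of LowerVectorStore for a <4 x i8> truncating store:
    // element i is masked to 8 bits and OR'd in at bit offset 8 * i.
    static uint32_t packV4I8(const uint32_t Elems[4]) {
      uint32_t Packed = 0;
      for (unsigned i = 0; i < 4; ++i)
        Packed |= (Elems[i] & 0xFF) << (8 * i); // AND, SHL, OR
      return Packed; // written with a single 32-bit store
    }

    int main() {
      const uint32_t In[4] = {0x12, 0x34, 0x56, 0x78};
      // Element 0 lands in the low byte, element 3 in the high byte.
      assert(packV4I8(In) == 0x78563412);
      return 0;
    }

A v4i16 store would need a 64-bit packed integer, which the MemBits > 32
early-out rejects; that is why its truncating store is marked Expand rather
than Custom in the constructor above.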
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h
index 8788c20903dd..e3a0dcca810a 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -51,6 +51,10 @@ protected:
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 
+  /// \brief Lower vector stores by merging the vector elements into an integer
+  /// of the same bitwidth.
+  SDValue LowerVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+
 public:
   AMDGPUTargetLowering(TargetMachine &TM);
 
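LowerVectorStore deliberately returns a null SDValue for anything it does not
handle (non-vector stores, or memory types wider than 32 bits), so a subclass
can offer a store to the base class first and fall back to its own lowering
otherwise. A toy model of that call pattern, with every name hypothetical:

    #include <cstdio>

    // Stand-in for SDValue's null state: Valid == false means "not
    // handled here, the caller should lower the store itself".
    struct ToyResult { bool Valid; unsigned Packed; };

    static ToyResult tryPackStore(bool IsVector, unsigned MemBits,
                                  unsigned PackedValue) {
      if (!IsVector || MemBits > 32)
        return {false, 0};        // decline, like returning SDValue()
      return {true, PackedValue}; // handled: one packed integer store
    }

    int main() {
      ToyResult R = tryPackStore(true, 64, 0); // v4i16: declined
      if (!R.Valid)
        std::puts("fall back to the target's own store lowering");
      R = tryPackStore(true, 32, 0x78563412);  // v4i8: packed
      if (R.Valid)
        std::printf("emit one dword store of 0x%08X\n", R.Packed);
      return 0;
    }

R600's LowerSTORE below uses exactly this shape: it calls the base-class
routine first and only continues to its MSKOR/truncating-store path when the
returned node is null.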
diff --git a/llvm/lib/Target/R600/R600ISelLowering.cpp b/llvm/lib/Target/R600/R600ISelLowering.cpp
index b6b65609c1e3..e10af2b16baf 100644
--- a/llvm/lib/Target/R600/R600ISelLowering.cpp
+++ b/llvm/lib/Target/R600/R600ISelLowering.cpp
@@ -1011,10 +1011,15 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue Value = Op.getOperand(1);
   SDValue Ptr = Op.getOperand(2);
 
+  SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
     if (StoreNode->isTruncatingStore()) {
       EVT VT = Value.getValueType();
-      assert(VT == MVT::i32);
+      assert(VT.bitsLE(MVT::i32));
       EVT MemVT = StoreNode->getMemoryVT();
       SDValue MaskConstant;
       if (MemVT == MVT::i8) {
@@ -1571,6 +1576,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
+
   case AMDGPUISD::EXPORT: {
     SDValue Arg = N->getOperand(1);
     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
diff --git a/llvm/test/CodeGen/R600/store.ll b/llvm/test/CodeGen/R600/store.ll
index cba01a31c528..f24de04b8b5c 100644
--- a/llvm/test/CodeGen/R600/store.ll
+++ b/llvm/test/CodeGen/R600/store.ll
@@ -63,6 +63,49 @@ entry:
   ret void
 }
 
+; EG-CHECK: @store_v2i8
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK: @store_v2i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i8>
+  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+
+; EG-CHECK: @store_v2i16
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK: @store_v2i16
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK: @store_v2i16
+; SI-CHECK: BUFFER_STORE_DWORD
+define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i16>
+  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @store_v4i8
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK: @store_v4i8
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK: @store_v4i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
 ; floating-point store
 ; EG-CHECK: @store_f32
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
@@ -76,6 +119,25 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
   ret void
 }
 
+; EG-CHECK: @store_v4i16
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK: @store_v4i16
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-NOT: BUFFER_STORE_BYTE
+define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i16>
+  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
 ; vec2 floating-point stores
 ; EG-CHECK: @store_v2f32
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
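The new tests follow directly from the packing rule. For store_v2i16, element 0
occupies the low 16 bits of the packed word, so inputs of, say, 0x1234 and
0x5678 (hypothetical values, not checked by the test) are written as the single
dword 0x56781234; hence one MEM_RAT_CACHELESS store on EG/CM and one
BUFFER_STORE_DWORD on SI. A one-line check of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      // <2 x i16> packing: (Elems[1] << 16) | Elems[0]
      uint32_t Packed = (0x5678u << 16) | (0x1234u & 0xFFFFu);
      assert(Packed == 0x56781234u); // stored as one 32-bit word
      return 0;
    }

store_v4i16, by contrast, stays on the expanded path (four MSKOR groups on EG,
four BUFFER_STORE_SHORTs on SI) because its 64-bit memory footprint exceeds
what the packing handles.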