From 2f348ea1c75646d98b9079fae527c6e600a2beb1 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 9 May 2018 23:46:19 +0000 Subject: [PATCH] [NVPTX] Added a feature to use short pointers for const/local/shared AS. Const/local/shared address spaces are all < 4GB and we can always use 32-bit pointers to access them. This has a substantial performance impact on kernels that use shared memory for intermediate results. The feature is disabled by default. Differential Revision: https://reviews.llvm.org/D46147 llvm-svn: 331941 --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 127 ++++++++++++--------- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 + llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 ++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1 - llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 18 ++- llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 3 + llvm/test/CodeGen/NVPTX/addrspacecast.ll | 99 ++++++++-------- llvm/test/CodeGen/NVPTX/ld-addrspace.ll | 159 ++++++++++++-------------- llvm/test/CodeGen/NVPTX/st-addrspace.ll | 165 ++++++++++++--------------- 11 files changed, 305 insertions(+), 287 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 9538d79..a6b7807 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -66,6 +66,10 @@ bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { return TL->allowUnsafeFPMath(*MF); } +bool NVPTXDAGToDAGISel::useShortPointers() const { return TM.useShortPointers(); } + /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. void NVPTXDAGToDAGISel::Select(SDNode *N) { @@ -732,7 +736,6 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); unsigned DstAddrSpace = CastN->getDestAddressSpace(); - assert(SrcAddrSpace != DstAddrSpace && "addrspacecast must be between different address spaces"); @@ -745,13 +748,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes; + Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432 + : NVPTX::cvta_shared_yes_64) + : NVPTX::cvta_shared_yes; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes; + Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432 + : NVPTX::cvta_const_yes_64) + : NVPTX::cvta_const_yes; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes; + Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432 + : NVPTX::cvta_local_yes_64) + : NVPTX::cvta_local_yes; break; } ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), @@ -769,16 +778,19 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { : NVPTX::cvta_to_global_yes; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64 + Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264 + : NVPTX::cvta_to_shared_yes_64) : NVPTX::cvta_to_shared_yes; break; case ADDRESS_SPACE_CONST: - Opc = - TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes; + Opc = TM.is64Bit() ? (useShortPointers() ?
NVPTX::cvta_to_const_yes_3264 + : NVPTX::cvta_to_const_yes_64) + : NVPTX::cvta_to_const_yes; break; case ADDRESS_SPACE_LOCAL: - Opc = - TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes; + Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264 + : NVPTX::cvta_to_local_yes_64) + : NVPTX::cvta_to_local_yes; break; case ADDRESS_SPACE_PARAM: Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 @@ -834,18 +846,20 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { return false; // Address Space Setting - unsigned int codeAddrSpace = getCodeAddrSpace(LD); - - if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { + unsigned int CodeAddrSpace = getCodeAddrSpace(LD); + if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } + unsigned int PointerSize = + CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); + // Volatile Setting // - .volatile is only availalble for .global and .shared bool isVolatile = LD->isVolatile(); - if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) isVolatile = false; // Type Setting: fromType + fromTypeWidth @@ -892,27 +906,27 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), + SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Addr, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); - } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) - : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { + } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) + : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, NVPTX::LD_i32_asi, NVPTX::LD_i64_asi, NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi, NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), + SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); - } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset) - : SelectADDRri(N1.getNode(), N1, Base, Offset)) { - if (TM.is64Bit()) + } else if (PointerSize == 64 ? 
SelectADDRri64(N1.getNode(), N1, Base, Offset) + : SelectADDRri(N1.getNode(), N1, Base, Offset)) { + if (PointerSize == 64) Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64, NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64, @@ -924,13 +938,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), + SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, MVT::Other, Ops); } else { - if (TM.is64Bit()) + if (PointerSize == 64) Opcode = pickOpcodeForVT( TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64, NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64, @@ -943,7 +957,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl), + SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), getI32Imm(fromTypeWidth, dl), N1, Chain }; NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT, @@ -977,11 +991,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); - if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } + unsigned int PointerSize = + CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); + // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); @@ -1064,8 +1080,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Addr, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); - } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) - : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { + } else if (PointerSize == 64 + ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) + : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { switch (N->getOpcode()) { default: return false; @@ -1090,9 +1107,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); - } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) - : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { - if (TM.is64Bit()) { + } else if (PointerSize == 64 + ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) + : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { + if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; @@ -1140,7 +1158,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops); } else { - if (TM.is64Bit()) { + if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; @@ -1685,14 +1703,16 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - unsigned int codeAddrSpace = getCodeAddrSpace(ST); + unsigned int CodeAddrSpace = getCodeAddrSpace(ST); + unsigned int PointerSize = + CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); // Volatile Setting // - .volatile is only availalble for .global and .shared bool isVolatile = ST->isVolatile(); - if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) isVolatile = false; // Vector Setting @@ -1735,12 +1755,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), + getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr, Chain }; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) - : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) + : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi, NVPTX::ST_i32_asi, NVPTX::ST_i64_asi, NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi, @@ -1748,13 +1768,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), + getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, Offset, Chain }; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); - } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset) - : SelectADDRri(N2.getNode(), N2, Base, Offset)) { - if (TM.is64Bit()) + } else if (PointerSize == 64 ? 
SelectADDRri64(N2.getNode(), N2, Base, Offset) + : SelectADDRri(N2.getNode(), N2, Base, Offset)) { + if (PointerSize == 64) Opcode = pickOpcodeForVT( SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64, NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64, @@ -1768,12 +1788,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), + getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base, Offset, Chain }; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); } else { - if (TM.is64Bit()) + if (PointerSize == 64) Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64, NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64, @@ -1787,7 +1807,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = { N1, getI32Imm(isVolatile, dl), - getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl), + getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2, Chain }; NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops); @@ -1816,11 +1836,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // Address Space Setting unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); - if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); } + unsigned int PointerSize = + CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Volatile Setting // - .volatile is only availalble for .global and .shared @@ -1901,8 +1922,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } StOps.push_back(Addr); - } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) - : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) + : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { default: return false; @@ -1923,9 +1944,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { } StOps.push_back(Base); StOps.push_back(Offset); - } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset) - : SelectADDRri(N2.getNode(), N2, Base, Offset)) { - if (TM.is64Bit()) { + } else if (PointerSize == 64 ? 
SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; @@ -1968,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { StOps.push_back(Base); StOps.push_back(Offset); } else { - if (TM.is64Bit()) { + if (PointerSize == 64) { switch (N->getOpcode()) { default: return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b49e157..9b16bd9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool useF32FTZ() const; bool allowFMA() const; bool allowUnsafeFPMath() const; + bool useShortPointers() const; public: explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 1590231..6ccd9eb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1233,9 +1233,9 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, SDValue NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); + const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); + Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 7b2bf38..443b077 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -147,6 +147,7 @@ def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; +def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 66419f0..31bed35 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1937,6 +1937,12 @@ multiclass NG_TO_G<string Str, Intrinsic Intrin> { def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), !strconcat("cvta.", Str, ".u64 \t$result, $src;"), [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; + def _yes_6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src), "{{ .reg .b64 %tmp;\n\t" #" cvt.u64.u32 \t%tmp, $src;\n\t" #" cvta." # Str # ".u64 \t$result, %tmp; }}", [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>, Requires<[useShortPtr]>; } multiclass G_TO_NG<string Str, Intrinsic Intrin> { @@ -1946,6 +1952,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> { def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"), [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; + def _yes_3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src), "{{ .reg .b64 %tmp;\n\t" #" cvta.to."
# Str # ".u64 \t%tmp, $src;\n\t" #" cvt.u32.u64 \t$result, %tmp; }}", [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>, Requires<[useShortPtr]>; } defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index fa35673..7030fe5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -79,7 +79,6 @@ public: bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool allowFP16Math() const; - unsigned int getSmVersion() const { return SmVersion; } std::string getTargetName() const { return TargetName; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index a3cd99e..a1b1604 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -52,6 +52,12 @@ static cl::opt<bool> DisableRequireStructuredCFG( "unexpected regressions happen."), cl::init(false), cl::Hidden); +static cl::opt<bool> UseShortPointersOpt( + "nvptx-short-ptr", + cl::desc( + "Use 32-bit pointers for accessing const/local/shared address spaces."), + cl::init(false), cl::Hidden); + namespace llvm { void initializeNVVMIntrRangePass(PassRegistry&); @@ -83,11 +89,13 @@ extern "C" void LLVMInitializeNVPTXTarget() { initializeNVPTXLowerAggrCopiesPass(PR); } -static std::string computeDataLayout(bool is64Bit) { +static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { std::string Ret = "e"; if (!is64Bit) Ret += "-p:32:32"; + else if (UseShortPointers) + Ret += "-p3:32:32-p4:32:32-p5:32:32"; Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; @@ -108,9 +116,11 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool is64bit) // The pic relocation model is used regardless of what the client has // specified, as it is the only relocation model currently supported. - : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, - Reloc::PIC_, getEffectiveCodeModel(CM), OL), - is64bit(is64bit), TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), + : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT, + CPU, FS, Options, Reloc::PIC_, + getEffectiveCodeModel(CM), OL), + is64bit(is64bit), UseShortPointers(UseShortPointersOpt), + TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index eeebf64..ca540b8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -26,6 +26,8 @@ namespace llvm { /// class NVPTXTargetMachine : public LLVMTargetMachine { bool is64bit; + // Use 32-bit pointers for accessing const/local/shared AS.
+ bool UseShortPointers; std::unique_ptr<TargetLoweringObjectFile> TLOF; NVPTX::DrvInterface drvInterface; NVPTXSubtarget Subtarget; @@ -45,6 +47,7 @@ public: } const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; } bool is64Bit() const { return is64bit; } + bool useShortPointers() const { return UseShortPointers; } NVPTX::DrvInterface getDrvInterface() const { return drvInterface; } ManagedStringPool *getManagedStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll index 4451edf..dbcd2be 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,97 +1,96 @@ -; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32 -; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64 +; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32 +; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64 +; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64 +; ALL-LABEL: conv1 define i32 @conv1(i32 addrspace(1)* %ptr) { -; PTX32: conv1 -; PTX32: cvta.global.u32 -; PTX32: ld.u32 -; PTX64: conv1 -; PTX64: cvta.global.u64 -; PTX64: ld.u32 +; G32: cvta.global.u32 +; ALL-NOT: cvt.u64.u32 +; G64: cvta.global.u64 +; ALL: ld.u32 %genptr = addrspacecast i32 addrspace(1)* %ptr to i32* %val = load i32, i32* %genptr ret i32 %val } +; ALL-LABEL: conv2 define i32 @conv2(i32 addrspace(3)* %ptr) { -; PTX32: conv2 -; PTX32: cvta.shared.u32 -; PTX32: ld.u32 -; PTX64: conv2 -; PTX64: cvta.shared.u64 -; PTX64: ld.u32 +; CLS32: cvta.shared.u32 +; PTRCONV: cvt.u64.u32 +; NOPTRCONV-NOT: cvt.u64.u32 +; CLS64: cvta.shared.u64 +; ALL: ld.u32 %genptr = addrspacecast i32 addrspace(3)* %ptr to i32* %val = load i32, i32* %genptr ret i32 %val } +; ALL-LABEL: conv3 define i32 @conv3(i32 addrspace(4)* %ptr) { -; PTX32: conv3 -; PTX32: cvta.const.u32 -; PTX32: ld.u32 -; PTX64: conv3 -; PTX64: cvta.const.u64 -; PTX64: ld.u32 +; CLS32: cvta.const.u32 +; PTRCONV: cvt.u64.u32 +; NOPTRCONV-NOT: cvt.u64.u32 +; CLS64: cvta.const.u64 +; ALL: ld.u32 %genptr = addrspacecast i32 addrspace(4)* %ptr to i32* %val = load i32, i32* %genptr ret i32 %val } +; ALL-LABEL: conv4 define i32 @conv4(i32 addrspace(5)* %ptr) { -; PTX32: conv4 -; PTX32: cvta.local.u32 -; PTX32: ld.u32 -; PTX64: conv4 -; PTX64: cvta.local.u64 -; PTX64: ld.u32 +; CLS32: cvta.local.u32 +; PTRCONV: cvt.u64.u32 +; NOPTRCONV-NOT: cvt.u64.u32 +; CLS64: cvta.local.u64 +; ALL: ld.u32 %genptr = addrspacecast i32 addrspace(5)* %ptr to i32* %val = load i32, i32* %genptr ret i32 %val } +; ALL-LABEL: conv5 define i32 @conv5(i32* %ptr) { -; PTX32: conv5 -; PTX32: cvta.to.global.u32 -; PTX32: ld.global.u32 -; PTX64: conv5 -; PTX64: cvta.to.global.u64 -; PTX64: ld.global.u32 +; CLS32: cvta.to.global.u32 +; ALL-NOT: cvt.u64.u32 +; CLS64: cvta.to.global.u64 +; ALL: ld.global.u32 %specptr = addrspacecast i32* %ptr to i32 addrspace(1)* %val = load i32, i32 addrspace(1)* %specptr ret i32 %val } +; ALL-LABEL: conv6 define i32 @conv6(i32* %ptr) { -; PTX32: conv6 -; PTX32: cvta.to.shared.u32 -; PTX32: ld.shared.u32 -; PTX64: conv6 -; PTX64: cvta.to.shared.u64 -; PTX64: ld.shared.u32 +; CLS32: cvta.to.shared.u32 +; CLS64: cvta.to.shared.u64 +; PTRCONV: cvt.u32.u64 +; NOPTRCONV-NOT: cvt.u32.u64 +; ALL: ld.shared.u32 %specptr = addrspacecast i32* %ptr to i32 addrspace(3)* %val = load i32, i32
addrspace(3)* %specptr ret i32 %val } +; ALL-LABEL: conv7 define i32 @conv7(i32* %ptr) { -; PTX32: conv7 -; PTX32: cvta.to.const.u32 -; PTX32: ld.const.u32 -; PTX64: conv7 -; PTX64: cvta.to.const.u64 -; PTX64: ld.const.u32 +; CLS32: cvta.to.const.u32 +; CLS64: cvta.to.const.u64 +; PTRCONV: cvt.u32.u64 +; NOPTRCONV-NOT: cvt.u32.u64 +; ALL: ld.const.u32 %specptr = addrspacecast i32* %ptr to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %specptr ret i32 %val } +; ALL-LABEL: conv8 define i32 @conv8(i32* %ptr) { -; PTX32: conv8 -; PTX32: cvta.to.local.u32 -; PTX32: ld.local.u32 -; PTX64: conv8 -; PTX64: cvta.to.local.u64 -; PTX64: ld.local.u32 +; CLS32: cvta.to.local.u32 +; CLS64: cvta.to.local.u64 +; PTRCONV: cvt.u32.u64 +; NOPTRCONV-NOT: cvt.u32.u64 +; ALL: ld.local.u32 %specptr = addrspacecast i32* %ptr to i32 addrspace(5)* %val = load i32, i32 addrspace(5)* %specptr ret i32 %val diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll index 0018e61..4a2eb39 100644 --- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll @@ -1,171 +1,160 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 ;; i8 define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { -; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_i8 +; G32: ld.global.u8 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.u8 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i8, i8 addrspace(1)* %ptr ret i8 %a } - define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { -; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_shared_i8 +; LS32: ld.shared.u8 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.u8 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i8, i8 addrspace(3)* %ptr ret i8 %a } - define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { -; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_i8 +; LS32: ld.local.u8 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.u8 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i8, i8 addrspace(5)* %ptr ret i8 %a } ;; i16 define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { -; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_i16 +; G32: ld.global.u16 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.u16 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i16, i16 addrspace(1)* %ptr ret i16 %a } - define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { -; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_shared_i16 +; LS32: ld.shared.u16 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.u16 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i16, i16 addrspace(3)* %ptr ret i16 %a } - define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { -; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.u16 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_i16 +; LS32: ld.local.u16 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.u16 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i16, i16 addrspace(5)* %ptr ret i16 %a } ;; i32 define i32 @ld_global_i32(i32 addrspace(1)* %ptr) { -; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_i32 +; G32: ld.global.u32 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.u32 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i32, i32 addrspace(1)* %ptr ret i32 %a } - define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) { -; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] +; ALL-LABEL: ld_shared_i32 +; LS32: ld.shared.u32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.u32 %{{.*}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32, i32 addrspace(3)* %ptr ret i32 %a } - define i32 @ld_local_i32(i32 addrspace(5)* %ptr) { -; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_i32 +; LS32: ld.local.u32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.u32 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i32, i32 addrspace(5)* %ptr ret i32 %a } ;; i64 define i64 @ld_global_i64(i64 addrspace(1)* %ptr) { -; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_i64 +; G32: ld.global.u64 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.u64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i64, i64 addrspace(1)* %ptr ret i64 %a } - define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) { -; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_shared_i64 +; LS32: ld.shared.u64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.u64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i64, i64 addrspace(3)* %ptr ret i64 %a } - define i64 @ld_local_i64(i64 addrspace(5)* %ptr) { -; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_i64 +; LS32: ld.local.u64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.u64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load i64, i64 addrspace(5)* %ptr ret i64 %a } ;; f32 define float @ld_global_f32(float addrspace(1)* %ptr) { -; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_f32 +; G32: ld.global.f32 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.f32 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load float, float addrspace(1)* %ptr ret float %a } - define float @ld_shared_f32(float addrspace(3)* %ptr) { -; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_shared_f32 +; LS32: ld.shared.f32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.f32 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load float, float addrspace(3)* %ptr ret float %a } - define float @ld_local_f32(float addrspace(5)* %ptr) { -; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_f32 +; LS32: ld.local.f32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.f32 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: 
ret %a = load float, float addrspace(5)* %ptr ret float %a } ;; f64 define double @ld_global_f64(double addrspace(1)* %ptr) { -; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_global_f64 +; G32: ld.global.f64 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.f64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load double, double addrspace(1)* %ptr ret double %a } - define double @ld_shared_f64(double addrspace(3)* %ptr) { -; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_shared_f64 +; LS32: ld.shared.f64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.f64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load double, double addrspace(3)* %ptr ret double %a } - define double @ld_local_f64(double addrspace(5)* %ptr) { -; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: ret -; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] -; PTX64: ret +; ALL-LABEL: ld_local_f64 +; LS32: ld.local.f64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.f64 %{{.*}}, [%rd{{[0-9]+}}] +; ALL: ret %a = load double, double addrspace(5)* %ptr ret double %a } diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll index 34a83f3..6c5df1c 100644 --- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll @@ -1,177 +1,160 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 ;; i8 - +; ALL-LABEL: st_global_i8 define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { -; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; G32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; G64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i8 %a, i8 addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_i8 define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { -; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; LS32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i8 %a, i8 addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_i8 define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { -; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; LS32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i8 %a, i8 addrspace(5)* %ptr ret void } ;; i16 - +; ALL-LABEL: st_global_i16 define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) { -; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; G32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; G64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i16 %a, i16 addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_i16 define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) { -; 
PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; LS32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i16 %a, i16 addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_i16 define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) { -; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} -; PTX64: ret +; LS32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; ALL: ret store i16 %a, i16 addrspace(5)* %ptr ret void } ;; i32 - +; ALL-LABEL: st_global_i32 define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) { -; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} -; PTX64: ret +; G32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; G64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; ALL: ret store i32 %a, i32 addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_i32 define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) { -; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; LS32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; LS64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_i32 define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) { -; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} -; PTX64: ret +; LS32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; LS64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; ALL: ret store i32 %a, i32 addrspace(5)* %ptr ret void } ;; i64 - +; ALL-LABEL: st_global_i64 define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) { -; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} -; PTX64: ret +; G32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; G64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; ALL: ret store i64 %a, i64 addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_i64 define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) { -; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} -; PTX64: ret +; LS32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; LS64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; ALL: ret store i64 %a, i64 addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_i64 define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) { -; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} -; PTX64: ret +; LS32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; LS64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; ALL: ret store i64 %a, i64 addrspace(5)* %ptr ret void } ;; f32 - +; ALL-LABEL: st_global_f32 define void @st_global_f32(float addrspace(1)* %ptr, float %a) { -; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} -; PTX64: ret +; G32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; G64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; ALL: ret store float %a, float addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_f32 define void @st_shared_f32(float addrspace(3)* %ptr, float %a) { -; PTX32: 
st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} -; PTX64: ret +; LS32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; LS64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; ALL: ret store float %a, float addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_f32 define void @st_local_f32(float addrspace(5)* %ptr, float %a) { -; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} -; PTX64: ret +; LS32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; LS64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; ALL: ret store float %a, float addrspace(5)* %ptr ret void } ;; f64 - +; ALL-LABEL: st_global_f64 define void @st_global_f64(double addrspace(1)* %ptr, double %a) { -; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; PTX32: ret -; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} -; PTX64: ret +; G32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; G64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; ALL: ret store double %a, double addrspace(1)* %ptr ret void } - +; ALL-LABEL: st_shared_f64 define void @st_shared_f64(double addrspace(3)* %ptr, double %a) { -; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; PTX32: ret -; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} -; PTX64: ret +; LS32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; LS64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; ALL: ret store double %a, double addrspace(3)* %ptr ret void } - +; ALL-LABEL: st_local_f64 define void @st_local_f64(double addrspace(5)* %ptr, double %a) { -; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; PTX32: ret -; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} -; PTX64: ret +; LS32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; LS64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; ALL: ret store double %a, double addrspace(5)* %ptr ret void } -- 2.7.4
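For reference, the effect of the new flag is easiest to see on an addrspacecast from shared to generic, the case exercised by the updated addrspacecast.ll above. Below is a minimal reproducer; the PTX is a sketch of what the new cvta_shared_yes_6432 pattern expands to (per the PTRCONV/CLS64 checks), so the file name and exact register numbers are illustrative only:

    ; shared-ptr.ll
    define i32 @load_shared(i32 addrspace(3)* %p) {
      %gen = addrspacecast i32 addrspace(3)* %p to i32*
      %v = load i32, i32* %gen
      ret i32 %v
    }

    $ llc < shared-ptr.ll -march=nvptx64 -mcpu=sm_20
            // default: the shared-space parameter arrives as a 64-bit pointer
            cvta.shared.u64 %rd2, %rd1;
            ld.u32          %r1, [%rd2];

    $ llc < shared-ptr.ll -march=nvptx64 -mcpu=sm_20 --nvptx-short-ptr
            // short pointers: the parameter is 32-bit and is widened with
            // cvt.u64.u32 before cvta, per the _yes_6432 pattern above
            { .reg .b64 %tmp;
              cvt.u64.u32     %tmp, %r1;
              cvta.shared.u64 %rd1, %tmp; }
            ld.u32          %r2, [%rd1];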
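Likewise for reference, reading computeDataLayout directly, the nvptx64 data layout strings are (p3/p4/p5 being the shared/const/local address spaces used throughout the tests):

    default:            e-i64:64-i128:128-v16:16-v32:32-n16:32:64
    --nvptx-short-ptr:  e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64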