namespace {
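+// Escape hatch for the widening performed below. As an ordinary boolean
+// cl::opt it can presumably be toggled from the llc/opt command line, e.g.
+// (illustrative usage) -amdgpu-codegenprepare-widen-constant-loads=0 to
+// disable the transform.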
+static cl::opt<bool> WidenLoads(
+ "amdgpu-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const SISubtarget *ST = nullptr;
}
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+ if (!WidenLoads)
+ return false;
+
if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+ // Try to avoid using an extload by loading earlier than the argument address,
+ // and extracting the relevant bits. The load should hopefully be merged with
+ // the previous argument.
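+  // For example (illustrative numbers): an i16 argument at byte Offset 38
+  // with Align 2 gives AlignDownOffset = alignDown(38, 4) = 36 and
+  // OffsetDiff = 2, so a dword is loaded at offset 36 and the argument is
+  // recovered as (trunc (srl load, 16)).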
+ if (Align < 4) {
+ assert(MemVT.getStoreSize() < 4);
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+ ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+ return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
+ }
+
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
MachineMemOperand::MODereferenceable |
}
}
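+// Extend or truncate Op to VT according to the extension kind of the original
+// load; used by widenLoad below when re-expanding a widened sub-dword load to
+// the requested result type.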
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+ ISD::LoadExtType ExtType, SDValue Op,
+ const SDLoc &SL, EVT VT) {
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ switch (ExtType) {
+ case ISD::SEXTLOAD:
+ return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+ case ISD::ZEXTLOAD:
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+ case ISD::EXTLOAD:
+ return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+ case ISD::NON_EXTLOAD:
+ return Op;
+ }
+
+ llvm_unreachable("invalid ext type");
+}
+
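+// Widen sub-dword uniform loads from constant (or invariant global) memory
+// into a full dword scalar load and re-extract the original value. A rough
+// pseudo-DAG sketch of the combine (illustrative, for a uniform i8 zextload
+// with dword alignment):
+//   (i32 (zextload i8 [ptr], align 4))
+//     -> (i32 (and (i32 (load [ptr], align 4)), 0xff))
+// with the original chain preserved via getMergeValues.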
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ return SDValue();
+
+ // FIXME: Constant loads should all be marked invariant.
+ unsigned AS = Ld->getAddressSpace();
+ if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+ AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+ return SDValue();
+
+ // Don't do this early, since it may interfere with adjacent load merging for
+ // illegal types. We can avoid losing alignment information for exotic types
+ // pre-legalize.
+ EVT MemVT = Ld->getMemoryVT();
+ if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+ MemVT.getSizeInBits() >= 32)
+ return SDValue();
+
+ SDLoc SL(Ld);
+
+ assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+ "unexpected vector extload");
+
+ // TODO: Drop only high part of range.
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+ MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(),
+ Ld->getPointerInfo(), MVT::i32,
+ Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo(),
+ nullptr); // Drop ranges
+
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (MemVT.isFloatingPoint()) {
+ assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+ "unexpected fp extload");
+ TruncVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue Cvt = NewLoad;
+ if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+ Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+ DAG.getValueType(TruncVT));
+ } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+ } else {
+ assert(Ld->getExtensionType() == ISD::EXTLOAD);
+ }
+
+ EVT VT = Ld->getValueType(0);
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+ // the appropriate extension from the 32-bit load.
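+  // E.g. (sketch): for an i16->i64 zextload the i32 value computed above has
+  // bits 16-31 already cleared and is ZERO_EXTENDed to i64 here; for a
+  // sub-dword non-extending load the same call truncates back to the
+  // original width.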
+ Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // Handle conversion back to floating point if necessary.
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+ return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
+}
+
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
return performMinMaxCombine(N, DCI);
break;
}
- case ISD::LOAD:
+ case ISD::LOAD: {
+    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+      return Widened;
+ LLVM_FALLTHROUGH;
+ }
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE:
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
ret void
}
-; FIXME: Should use SGPRs
; FUNC-LABEL: {{^}}s_and_i1:
-; SI: v_and_b32
+; SI: s_load_dword [[LOAD:s[0-9]+]]
+; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8
+; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]]
+; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}}
+; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]]
+; SI: buffer_store_byte [[V_AND_TRUNC]]
define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
%and = and i1 %a, %b
store i1 %and, i1 addrspace(1)* %out
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; FIXME: Should be the same on CI/VI
; GCN-LABEL: {{^}}s_ashr_v2i16:
; GFX9: s_load_dword [[LHS:s[0-9]+]]
; GFX9: s_load_dword [[RHS:s[0-9]+]]
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; VI: s_load_dword [[LHS:s[0-9]+]]
-; VI: s_load_dword [[RHS:s[0-9]+]]
+; CIVI: s_load_dword [[LHS:s[0-9]+]]
+; CIVI: s_load_dword [[RHS:s[0-9]+]]
+
; VI: s_ashr_i32
; VI: s_ashr_i32
; VI: s_sext_i32_i16
; VI: s_and_b32
; VI: s_or_b32
-; CI-DAG: v_ashrrev_i32_e32
-; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_or_b32_e32
+; CI: s_ashr_i32
+; CI: s_and_b32
+; CI: s_lshr_b32
+; CI: s_sext_i32_i16
+; CI: s_ashr_i32
+; CI: s_ashr_i32
+; CI: s_lshl_b32
+; CI: s_and_b32
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = ashr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
-; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_branch:
; GCNNOOPT: v_writelane_b32
}
; GCN-LABEL: {{^}}test_brcc_i1:
-; GCN: buffer_load_ubyte
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 1,
-; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
+; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
+; GCN: s_cmp_eq_u32
+; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
; GCN: buffer_store_dword
; GCN: s_cbranch_vccnz [[LOOPBB]]
; GCN-NEXT: ; %bb.2
; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
+ %cond = load volatile i1, i1 addrspace(3)* null
br label %for.body
for.exit:
}
; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: s_load_dword s
+; GCN: s_load_dword s
define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2
ret void
}
+; FIXME: Why sometimes vector shift?
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
; GFX9-DAG: global_load_short_d16_hi v
; GFX9-DAG: global_load_short_d16 v
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
-; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: {{buffer|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,GFX89 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
}
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dword s
; GCN: buffer_store_short
; GCN: buffer_store_short
define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
}
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; SICI: buffer_load_ushort
-; SICI: buffer_load_ushort
-; SICI: buffer_store_short
-; SICI: buffer_store_short
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: buffer_store_short
+; SI: buffer_store_short
; VI: s_load_dword s
; VI: s_load_dword s
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
-; SICI: buffer_load_ushort
-; SICI: buffer_load_ushort
-; SICI: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}
-; SICI: buffer_store_short
-; SICI: buffer_store_short
-; SICI: buffer_store_short
-
-; SICI: buffer_load_ushort
-; SICI: buffer_store_short
-
-; GFX9-DAG: global_load_short_d16_hi v
-; GFX9-DAG: global_load_short_d16 v
+; FIXME: Unnecessary repacking
+; GFX9: s_pack_ll_b32_b16
+; GFX9: s_pack_lh_b32_b16
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
-; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+
+; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
; GCN: {{buffer|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
+; GCN: s_load_dword [[LOAD:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
+; GCN: buffer_store_byte [[V_LOAD]]
define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
%p0 = extractelement <1 x i8> %foo, i32 0
store i8 %p0, i8 addrspace(1)* %out
}
; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: s_load_dword s
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
}
; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: s_load_dword s
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
}
; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: s_load_dword s
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
}
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
+; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
}
; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN: s_load_dword [[LOAD0:s[0-9]+]]
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; GCN: buffer_store_byte [[V_ELT2]]
+; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
%p0 = extractelement <16 x i8> %foo, i32 0
%p1 = extractelement <16 x i8> %foo, i32 2
}
; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN: s_load_dword [[LOAD0:s[0-9]+]]
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; GCN: buffer_store_byte [[V_ELT2]]
+; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
%p0 = extractelement <32 x i8> %foo, i32 0
%p1 = extractelement <32 x i8> %foo, i32 2
}
; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN: s_load_dword [[LOAD0:s[0-9]+]]
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; GCN: buffer_store_byte [[V_ELT2]]
+; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
%p0 = extractelement <64 x i8> %foo, i32 0
%p1 = extractelement <64 x i8> %foo, i32 2
; FIXME: SI generates much worse code for this, which is a pain to match
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
-; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; FIXME: 16-bit and 32-bit shifts are not combined after legalization due to
+; isTypeDesirableForOp in SimplifyDemandedBits
-; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]]
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI-NOT: {{flat|buffer|global}}
+; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
+; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
+; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
%elt = extractelement <2 x i8> %foo, i32 %idx
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
-; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]],
-; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]],
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
-
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI-NOT: {{flat|buffer|global}}
+; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
+; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-
-; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]]
-; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]]
; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
-; VI-DAG: s_load_dword [[VEC3:s[0-9]+]], s[0:1], 0x2c
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC3]], [[SCALED_IDX]]
+; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]
+
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
- %p0 = extractelement <4 x i8> %foo, i32 %idx
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+ %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
+ %p0 = extractelement <4 x i8> %vec, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
-; VI-DAG: s_load_dwordx2 [[VEC3:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC3]], [[SCALED_IDX]]
+; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 {
- %p0 = extractelement <8 x i8> %foo, i32 %idx
+define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+ %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
+ %p0 = extractelement <8 x i8> %vec, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
ret void
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; unless isFabsFree returns true
; GCN-LABEL: {{^}}s_fabs_free_f16:
-; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]],
-; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
+; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
+; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
}
; GCN-LABEL: {{^}}s_fabs_f16:
-; CI: flat_load_ushort [[VAL:v[0-9]+]],
-; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
+; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
+; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+
+; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
+; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, half addrspace(1)* %out
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
}
; GCN-LABEL: {{^}}fabs_fold_f16:
-; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]]
-; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]]
+; GCN: s_load_dword [[IN0:s[0-9]+]]
+; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
-; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
-; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
-; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]]
-; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
+; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
+; CI-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
+; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
-; VI-NOT: and
-; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
+; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e32
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |s{{[0-9]+}}|
; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
; GFX89-NOT: _and
-; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
+; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
; CI-DAG: v_cvt_f32_f16_e32
-; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{s[0-9]+}}|
; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
; CI: v_cvt_f16_f32_e32
; GFX89-NOT: _and
-; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}|
; GFX89-NOT: [[MUL]]
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
; unless isFabsFree returns true
; GCN-LABEL: {{^}}fneg_fabs_free_f16:
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000
define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc = bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
}
; GCN-LABEL: {{^}}fneg_fabs_f16:
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000
define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
%fsub = fsub half -0.0, %fabs
ret void
}
-; GCN-LABEL: {{^}}fneg_free_f16:
-; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]],
+; GCN-LABEL: {{^}}s_fneg_free_f16:
+; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]],
-; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
-; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
+; CI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
+; CI: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
+
+; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000
+; GFX89: v_xor_b32_e32 [[XOR:v[0-9]+]], [[NEG_VALUE]], [[MASK]]
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
store half %fsub, half addrspace(1)* %out
; half args should be promoted to float for SI and lower.
; GCN-LABEL: {{^}}load_f16_arg:
-; GCN: flat_load_ushort [[ARG:v[0-9]+]]
-; GCN-NOT: [[ARG]]
-; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]]
define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
store half %arg, half addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}load_v3f16_arg:
-; GCN: flat_load_ushort
; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
; GCN-NOT: _load
; GCN-DAG: _store_dword
}
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
-; GCN: flat_load_ushort
-; GCN: flat_load_ushort
-; GCN: flat_load_ushort
-; GCN-NOT: {{buffer|flat|global}}_load
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: _load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
}
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-
-
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
}
; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
-; GCN: flat_load_ushort [[ARG:v[0-9]+]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[ARG_F32:[0-9]+]], [[ARG]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[ARG_F32]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
}
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
-; SI-DAG: flat_load_ushort v
-; SI-DAG: flat_load_ushort v
-
-; VI-DAG: s_load_dword s
-; VI: s_lshr_b32
+; GCN: s_load_dword
+; GCN: s_lshr_b32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
}
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
-; GCN-DAG: flat_load_ushort v
-; GCN-DAG: flat_load_ushort v
-; GCN-DAG: flat_load_ushort v
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: s_lshr_b32
+
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
}
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-; VI: s_load_dword s
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
}
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-
-
+; GCN: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NOT: _load_
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
}
; GCN-LABEL: {{^}}add_inline_imm_0.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0.0
}
; GCN-LABEL: {{^}}add_inline_imm_0.5_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0.5
}
; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -0.5
}
; GCN-LABEL: {{^}}add_inline_imm_1.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 1.0
}
; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -1.0
}
; GCN-LABEL: {{^}}add_inline_imm_2.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 2.0
}
; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -2.0
}
; GCN-LABEL: {{^}}add_inline_imm_4.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 4.0
}
; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -4.0
}
; GCN-LABEL: {{^}}add_inline_imm_1_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0001
}
; GCN-LABEL: {{^}}add_inline_imm_2_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0002
}
; GCN-LABEL: {{^}}add_inline_imm_16_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 16{{$}}
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0010
; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16:
; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, -1
; VI: buffer_store_short [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
- %xbc = bitcast half %x to i16
- %y = add i16 %xbc, -1
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %x = load i16, i16 addrspace(1)* %in
+ %y = add i16 %x, -1
%ybc = bitcast i16 %y to half
store half %ybc, half addrspace(1)* %out
ret void
; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16:
; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfffe
; VI: buffer_store_short [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
- %xbc = bitcast half %x to i16
- %y = add i16 %xbc, -2
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %x = load i16, i16 addrspace(1)* %in
+ %y = add i16 %x, -2
%ybc = bitcast i16 %y to half
store half %ybc, half addrspace(1)* %out
ret void
; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16:
; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfff0
; VI: buffer_store_short [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
- %xbc = bitcast half %x to i16
- %y = add i16 %xbc, -16
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %x = load i16, i16 addrspace(1)* %in
+ %y = add i16 %x, -16
%ybc = bitcast i16 %y to half
store half %ybc, half addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_63_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 63
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH003F
}
; GCN-LABEL: {{^}}add_inline_imm_64_f16:
-; VI: buffer_load_ushort [[VAL:v[0-9]+]]
-; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]]
+; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 64
; VI: buffer_store_short [[REG]]
define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0040
}
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
-; VI: buffer_load_ushort [[LOAD:v[0-9]]]
-; VI: s_load_dword [[IDX:s[0-9]]]
+; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI-NOT: _load
+; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1
-; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]]
-; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]]
-; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]]
-; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]]
+; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
+
+; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
+; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
+
+; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
+; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
+; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
; VI: buffer_store_short [[OR]]
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
ret void
}
+; FIXME: post legalize i16 and i32 shifts aren't merged because of
+; isTypeDesirableForOp in SimplifyDemandedBits
+
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
-; VI: buffer_load_ubyte
-; VI: buffer_load_ushort
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3
-; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff,
-; VI: s_not_b32
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; VI: v_or_b32_e32
-; VI: v_and_b32
-; VI: v_bfi_b32
-; VI: v_lshrrev_b32
+; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI-NOT: _load
+
+; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
+; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
+; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
+
+; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
+
+; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
+; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
+; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
+; VI: buffer_store_short [[BFI]]
+; VI: buffer_store_byte [[HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
}
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
-; VI: s_load_dword [[VEC:s[0-9]+]]
-; VI: s_load_dword [[IDX:s[0-9]]]
+; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI-NOT: _load
+
+; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
+; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
+; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
+
+
+; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
+; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
+; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
+; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]]
+; VI: v_or_b32_sdwa
+; VI: s_lshl_b32
+; VI: v_bfi_b32
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
ret void
}
-; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
-; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
+; VI-NOT: {{buffer|flat|global}}
; VI: s_load_dword [[IDX:s[0-9]]]
+; VI-NOT: {{buffer|flat|global}}
+; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; VI-NOT: {{buffer|flat|global}}
+
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
-define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
+ %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
%vecins = insertelement <8 x i8> %a, i8 5, i32 %b
store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: s_load_dwordx2
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: _load_
+
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
+
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
%0 = zext i8 %in to i32
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
+; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
%0 = sext i8 %in to i32
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
%0 = zext i16 %in to i32
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
%0 = zext i16 %in to i32
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
+; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
%0 = sext i16 %in to i32
; EG: VTX_READ_8
; EG: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; HSA: flat_load_ushort
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; MESA-VI: buffer_load_ushort
-; MESA-VI: buffer_load_ubyte
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ubyte
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; GCN-DAG: s_load_dword s
-; GCN-DAG: {{buffer|flat}}_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dword s
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
; EG: VTX_READ_8
; EG: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
; EG: VTX_READ_16
; EG: VTX_READ_16
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
ret void
}
+; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2 s
+; SI-NOT: {{buffer|flat|global}}_load
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+
+; VI: v_lshlrev_b16
+; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: s_lshr_b32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
; EG: VTX_READ_16
; EG: VTX_READ_16
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI-NOT: {{buffer|flat|global}}_load
+
; VI: s_load_dwordx2
; VI: s_load_dword s
ret void
}
+; FIXME: Pack/repack on VI
+
; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI-NOT: {{buffer|flat|global}}_load
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
+
+; VI: s_lshr_b32
+; VI: v_lshlrev_b16
+; VI: s_lshr_b32
+; VI: s_lshr_b32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: v_lshlrev_b16
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: v_lshlrev_b16
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
+
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; SI-NOT: {{buffer|flat|global}}_load
+
; VI: s_load_dword s
; VI: s_load_dword s
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32
-; SI: buffer_store_byte
-; SI: s_endpgm
+; GCN: s_load_dword s
+; GCN: s_and_b32
+; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
store i1 %x, i1 addrspace(1)* %out, align 1
ret void
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
+; GCN: s_load_dword
+; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
%ext = zext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
-; SI: buffer_load_ubyte
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
+; GCN: s_load_dword s
+; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
%ext = zext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
+; GCN: s_load_dword
+; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
%ext = sext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
+; GCN: s_load_dword
+; GCN: s_bfe_i64
+; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
%ext = sext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s
; GCN-LABEL: {{^}}buffer_store_format_d16_x:
-; GCN: {{buffer|flat|global}}_load_ushort v[[LO:[0-9]+]]
-; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
+; GCN: s_load_dword s[[LO:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
+; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) {
main_body:
call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
}
; GCN-LABEL: {{^}}class_f16_fabs:
-; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]]
-; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[SA_F16]]|, s[[SB_I32]]
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
+; GCN: s_load_dword s[[SB_I32:[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
ret void
}
-; GCN-LABEL: {{^}}class_f16_fneg
-; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
+; GCN-LABEL: {{^}}class_f16_fneg:
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
; GCN: s_load_dword s[[SB_I32:[0-9]+]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[SA_F16]], s[[SB_I32]]
+; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
ret void
}
-; GCN-LABEL: {{^}}class_f16_fabs_fneg
-; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]]
-; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[SA_F16]]|, s[[SB_I32]]
+; GCN-LABEL: {{^}}class_f16_fabs_fneg:
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
+; GCN: s_load_dword s[[SB_I32:[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
}
; GCN-LABEL: {{^}}class_f16_1:
-; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 1{{$}}
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 1{{$}}
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
}
; GCN-LABEL: {{^}}class_f16_64
-; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 64{{$}}
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 64{{$}}
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
}
; GCN-LABEL: {{^}}class_f16_full_mask:
-; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}}
-; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]]
+; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
ret void
}
-; GCN-LABEL: {{^}}class_f16_nine_bit_mask
-; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
+; GCN-LABEL: {{^}}class_f16_nine_bit_mask:
+; GCN: s_load_dword s[[SA_F16:[0-9]+]]
; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}}
-; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]]
+; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
}
; GCN-LABEL: {{^}}image_store_f16
-; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]],
-; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
+; GCN: s_load_dword s[[LO:[0-9]+]],
+; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
+; GCN: image_store v[[V_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
main_body:
call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
; GCN-LABEL: {{^}}tbuffer_store_d16_x:
-; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]],
-; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; GCN: s_load_dword s[[S_LO:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
+; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
; VI-DAG: s_lshl_b32
; VI: v_or_b32_e32
-; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
-; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: s_load_dword s
+; CI-NEXT: s_load_dword s
+; CI-NOT: {{buffer|flat}}
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CI: s_and_b32
+; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; CI: s_and_b32
+; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
+; CI: s_lshl_b32
+; CI: v_or_b32_e32
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = lshr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
; extloads with mubuf instructions.
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
-; GCN: buffer_load_sbyte
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: _load_
-; SI: v_min_i32
-; SI: v_min_i32
-; SI: v_min_i32
-; SI: v_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
-; VI: v_min_i32
-; VI: v_min_i32
-; VI: v_min_i32
-; VI: v_min_i32
+; VI: s_min_i32
+; VI: s_min_i32
+; VI: s_min_i32
+; VI: s_min_i32
; GFX9: v_min_i16
; GFX9: v_min_i16
; GFX9: v_min_i16
; GFX9: v_min_i16
-; GCN: s_endpgm
-
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
}
; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
-; SI: v_min_i32
-; SI: v_min_i32
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+
+; SI: s_ashr_i32
+; SI: s_ashr_i32
+; SI: s_sext_i32_i16
+; SI: s_sext_i32_i16
+; SI: s_min_i32
+; SI: s_min_i32
; VI: s_sext_i32_i16
; VI: s_sext_i32_i16
}
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
-; SI: v_min_i32
-; SI: v_min_i32
-; SI: v_min_i32
-; SI: v_min_i32
+; SI-NOT: buffer_load
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
+; SI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32
}
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
+; GCN-NOT: {{buffer|flat|global}}_load
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
+; SI: s_min_u32
; VI: s_min_u32
; VI: s_min_u32
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
; GCN: s_load_dword s
-; GCN: s_load_dwordx2 s
+; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}
+
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
%x.bc = bitcast <4 x i16> %x to <2 x i32>
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
-; FUNC-LABEL: {{^}}select_i1:
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
+; GCN-LABEL: {{^}}select_i1:
+; GCN: v_cndmask_b32
+; GCN-NOT: v_cndmask_b32
define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
%cmp = icmp ugt i32 %cond, 5
%sel = select i1 %cmp, i1 %a, i1 %b
ret void
}
-; FUNC-LABEL: {{^}}s_minmax_i1:
-; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45
-; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]]
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+; GCN-LABEL: {{^}}s_minmax_i1:
+; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
+; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
+; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
+; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
+; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
+; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
%cmp = icmp slt i1 %cond, false
%sel = select i1 %cmp, i1 %a, i1 %b
; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
; GFX9: v_pk_add_u16
-; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
-; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_add_u16
; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
; GFX9: v_pk_add_u16
-; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
-
; GFX9: v_pk_add_u16
; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
; VI: s_lshr_b32
; VI: s_and_b32
; VI: s_and_b32
-; SI: s_and_B32
-; SI: s_or_b32
+; VI: s_and_b32
+; VI: s_or_b32
+
-; CI-DAG: v_lshlrev_b32_e32
-; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_or_b32_e32
+; CI: s_load_dword s
+; CI: s_load_dword s
+; CI: s_lshr_b32
+; CI: s_and_b32
+; CI: s_lshr_b32
+; CI: s_lshl_b32
+; CI: s_lshl_b32
+; CI: s_lshl_b32
+; CI: s_and_b32
+; CI: s_or_b32
+; CI: _store_dword
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = shl <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
; SI: s_and_b32
; SI: s_or_b32
-; CI: v_sub_i32_e32
-; CI-DAG: v_sub_i32_e32
-; CI: v_bfe_i32
-; CI-DAG: v_bfe_i32
-; CI-DAG: v_add_i32_e32
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
-; CI: v_add_i32_e32
-; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
-; CI: v_or_b32_e32
+; CI-NOT: {{buffer|flat}}_load
+; CI: s_load_dword s
+; CI-NOT: {{buffer|flat}}_load
+; CI: s_lshr_b32
+; CI: s_ashr_i32
+; CI: s_sext_i32_i16
+; CI: s_sub_i32
+; CI: s_sub_i32
+; CI: s_sext_i32_i16
+; CI: s_sext_i32_i16
+; CI: s_max_i32
+; CI: s_max_i32
+; CI: s_lshl_b32
+; CI: s_add_i32
+; CI: s_add_i32
+; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff
+; CI: s_or_b32
+
define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
%neg = sub <2 x i16> zeroinitializer, %val
%cond = icmp sgt <2 x i16> %val, %neg
--- /dev/null
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
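+; The RUN lines pass -amdgpu-codegenprepare-widen-constant-loads=0, so the
+; patterns checked below come from the later DAG-level widening rather than
+; the IR pass: each uniform sub-dword constant load should turn into a single
+; s_load_dword, with the needed bits extracted by scalar shifts and masks.
+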
+; GCN-LABEL: {{^}}widen_i16_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_addk_i32 [[VAL]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4
+define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
+ %load = load i16, i16 addrspace(4)* %arg, align 4
+ %add = add i16 %load, 999
+ %or = or i16 %add, 4
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i16_constant_load_zext_i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}}
+; GCN: s_addk_i32 [[TRUNC]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4
+define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
+ %load = load i16, i16 addrspace(4)* %arg, align 4
+ %ext = zext i16 %load to i32
+ %add = add i32 %ext, 999
+ %or = or i32 %add, 4
+ store i32 %or, i32 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i16_constant_load_sext_i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_sext_i32_i16 [[EXT:s[0-9]+]], [[VAL]]
+; GCN: s_addk_i32 [[EXT]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[EXT]], 4
+define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
+ %load = load i16, i16 addrspace(4)* %arg, align 4
+ %ext = sext i16 %load to i32
+ %add = add i32 %ext, 999
+ %or = or i32 %add, 4
+ store i32 %or, i32 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i17_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 34
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[ADD]], 4
+; GCN: s_bfe_u32 s{{[0-9]+}}, [[OR]], 0x10010
+define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
+ %load = load i17, i17 addrspace(4)* %arg, align 4
+ %add = add i17 %load, 34
+ %or = or i17 %add, 4
+ store i17 %or, i17 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_f16_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[VAL]]
+; SI: v_add_f32_e32 [[ADD:v[0-9]+]], 4.0, [[CVT]]
+
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[VAL]], 4.0
+define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
+ %load = load half, half addrspace(4)* %arg, align 4
+ %add = fadd half %load, 4.0
+ store half %add, half addrspace(1)* null
+ ret void
+}
+
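+; A <2 x i8> load also fits in one dword. The SI checks stay on the scalar
+; unit; VI currently picks SDWA VALU ops for part of the arithmetic, hence
+; the FIXME below.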
+; FIXME: valu usage on VI
+; GCN-LABEL: {{^}}widen_v2i8_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_add_i32
+; SI: s_or_b32
+; SI: s_addk_i32
+; SI: s_and_b32
+; SI: s_or_b32
+; SI: s_or_b32
+
+; VI: s_add_i32
+; VI: v_add_u32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_e32
+define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
+ %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
+ %add = add <2 x i8> %load, <i8 12, i8 44>
+ %or = or <2 x i8> %add, <i8 4, i8 3>
+ store <2 x i8> %or, <2 x i8> addrspace(1)* null
+ ret void
+}
+
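+; The address here depends on the workitem id, so the load is divergent and
+; must stay a VMEM ushort load instead of being widened to a scalar load.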
+; GCN-LABEL: {{^}}no_widen_i16_constant_divergent_load:
+; GCN: {{buffer|flat}}_load_ushort
+define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = zext i32 %tid to i64
+ %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext
+ %load = load i16, i16 addrspace(4)* %gep.arg, align 4
+ %add = add i16 %load, 999
+ %or = or i16 %add, 4
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
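+; An i1 load is widened like any other sub-dword load; the boolean is
+; recovered by masking the low bit of the loaded dword.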
+; GCN-LABEL: {{^}}widen_i1_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 {{s[0-9]+}}, [[VAL]], 1{{$}}
+define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
+ %load = load i1, i1 addrspace(4)* %arg, align 4
+ %and = and i1 %load, true
+ store i1 %and, i1 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i16_zextload_i64_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}}
+; GCN: s_addk_i32 [[TRUNC]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4
+define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
+ %load = load i16, i16 addrspace(4)* %arg, align 4
+ %zext = zext i16 %load to i32
+ %add = add i32 %zext, 999
+ %or = or i32 %add, 4
+ store i32 %or, i32 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i1_zext_to_i64_constant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 [[AND:s[0-9]+]], [[VAL]], 1
+; GCN: s_add_u32 [[ADD:s[0-9]+]], [[AND]], 0x3e7
+; GCN: s_addc_u32 s{{[0-9]+}}, 0, 0
+define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
+ %load = load i1, i1 addrspace(4)* %arg, align 4
+ %zext = zext i1 %load to i64
+ %add = add i64 %zext, 999
+ store i64 %add, i64 addrspace(1)* null
+ ret void
+}
+
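+; Loads from the 32-bit constant address space (addrspace(6)) should be
+; widened the same way as ordinary constant address space loads.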
+; GCN-LABEL: {{^}}widen_i16_constant32_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_addk_i32 [[VAL]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4
+define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
+ %load = load i16, i16 addrspace(6)* %arg, align 4
+ %add = add i16 %load, 999
+ %or = or i16 %add, 4
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
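+; A global address space load marked !invariant.load is also eligible for
+; widening to a scalar load.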
+; GCN-LABEL: {{^}}widen_i16_global_invariant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_addk_i32 [[VAL]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 1
+define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
+ %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
+ %add = add i16 %load, 999
+ %or = or i16 %add, 1
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{}