}
}
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0) {
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+  }
+
+  if (SuperRegSize % 8 == 0) {
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+  }
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
+}
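For context, the helper always picks the widest scalar buffer opcode whose width evenly divides the spilled register's size. A minimal standalone sketch of that mapping (a hypothetical helper returning just the element size, since the AMDGPU::* opcode enums are not in scope here):

#include <cassert>

// Sketch of getSpillEltSize's width selection; returns only the element
// size in bytes, standing in for the (size, opcode) pair above.
static unsigned spillEltSizeForWidth(unsigned SuperRegSize) {
  if (SuperRegSize % 16 == 0)
    return 16; // S_BUFFER_{STORE,LOAD}_DWORDX4_SGPR
  if (SuperRegSize % 8 == 0)
    return 8;  // S_BUFFER_{STORE,LOAD}_DWORDX2_SGPR
  return 4;    // S_BUFFER_{STORE,LOAD}_DWORD_SGPR
}

int main() {
  assert(spillEltSizeForWidth(8) == 8);   // SReg_64  -> one dwordx2
  assert(spillEltSizeForWidth(16) == 16); // SReg_128 -> one dwordx4
  assert(spillEltSizeForWidth(32) == 16); // SReg_256 -> two dwordx4
  assert(spillEltSizeForWidth(64) == 16); // SReg_512 -> four dwordx4
}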
+
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS) const {
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();
  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
-  const unsigned EltSize = 4;
  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;
}
}
+  unsigned ScalarStoreOp;
+  unsigned EltSize = 4;
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
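The element size then determines how many sub-spills the loop below emits: a sketch of that count, assuming getRegSplitParts covers the register evenly in EltSize-byte parts (which the modulo checks in getSpillEltSize guarantee):

// Hypothetical sub-spill count implied by the split above, assuming an
// even EltSize split of the super-register.
static unsigned numSubSpills(unsigned SuperRegSize, unsigned EltSize) {
  return SuperRegSize / EltSize; // e.g. 32-byte SReg_256 / 16 -> 2 stores
}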
+
  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
-      // Add i * 4 wave offset.
-      //
      // SMEM instructions only support a single offset, so increment the wave
      // offset.
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }
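This offset rewrite is the core semantic change: the old expression scaled the per-element byte offset by the wavefront size, so consecutive elements landed 0x100 (64 lanes * 4 bytes) apart, whereas now only the frame offset is wave-scaled and elements sit EltSize bytes apart within the wave's slot. A worked sketch of both formulas, assuming a 64-lane wavefront and a frame object at offset 0 (matching the m0 strides in the test updates below):

// Hypothetical comparison of the old and new m0 offsets (WavefrontSize = 64).
static long oldOffset(long FrOffset, unsigned i) {
  return 64 * (FrOffset + 4 * i);             // i = 1 -> 0x100
}
static long newOffset(long FrOffset, unsigned EltSize, unsigned i) {
  return 64 * FrOffset + (long)EltSize * i;   // i = 1, EltSize = 16 -> 16
}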
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
}
}
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-  const unsigned EltSize = 4;
-
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
    if (SpillToSMEM) {
+      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));
      // Add i * EltSize offset to the wave scratch offset.
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }
      auto MIB =
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addMemOperand(MMO);
      if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
      continue;
    }
          .addImm(Spill.Lane);
        if (NumSubRegs > 1)
-          MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+          MIB.addReg(SuperReg, RegState::ImplicitDefine);
      } else {
        // Restore SGPR from a stack slot.
        // FIXME: We should use S_LOAD_DWORD here for VI.
; SGPR-NEXT: s_nop 4
; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
-
; Make sure scratch wave offset register is correctly incremented and
; then restored.
; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Spill
; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_dcache_wb
; ALL: s_endpgm
define void @test(i32 addrspace(1)* %out, i32 %in) {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
; TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM: s_mov_b64 exec,
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
; TOSMEM: s_mov_b32 vcc_hi, m0
; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_mov_b64 exec,
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
; TOSMEM-NEXT: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
-
-; FIXME: Could delay this wait
-; TOSMEM-NEXT: s_waitcnt lgkmcnt(0)
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
; GCN-NOT: s_buffer_store_dword m0
}
; GCN-LABEL: {{^}}restore_m0_lds:
+; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
; TOSMEM: s_cmp_eq_u32
; TOSMEM-NOT: m0
; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[84:87], m0 ; 8-byte Folded Spill
+; TOSMEM-NOT: m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[84:87], m0 ; 4-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM: s_cbranch_scc1
; TOSMEM: s_mov_b32 vcc_hi, m0
; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_waitcnt lgkmcnt(0)
-; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[84:87], m0 ; 8-byte Folded Reload
; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_waitcnt lgkmcnt(0)
--- /dev/null
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s
+
+; ALL-LABEL: {{^}}spill_sgpr_x2:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 12
+
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
+ %wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x4:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 20
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
+ %wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<4 x i32> %wide.sgpr) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x8:
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; SMEM: ScratchSize: 36
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
+ %wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+; FIXME: x16 inlineasm seems broken
+; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+; %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+; %cmp = icmp eq i32 %in, 0
+; br i1 %cmp, label %bb0, label %ret
+
+; bb0:
+; call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
+; br label %ret
+
+; ret:
+; ret void
+; }
+
+attributes #0 = { nounwind }