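The full complement of physical VGPRs for GFX11 is 50% more than GFX10,
with a correspondingly 50% larger VGPR allocation granule. Only some GFX11
subtargets have the full complement (gfx1100 and gfx1101 in this patch);
the others stay the same as GFX10. This affects occupancy calculations.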
Differential Revision: https://reviews.llvm.org/D134522
"Does not need SW waitstates"
>;
+def FeatureGFX11FullVGPRs : SubtargetFeature<"gfx11-full-vgprs",
+ "HasGFX11FullVGPRs",
+ "true",
+ "GFX11 with 50% more physical VGPRs and 50% larger allocation granule than GFX10"
+>;
+
class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
"nsa-max-size-"#Value,
"NSAMaxSize",
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureUserSGPRInit16Bug])>;
+ [FeatureGFX11FullVGPRs,
+ FeatureUserSGPRInit16Bug])>;
def FeatureISAVersion11_0_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [])>;
+ [FeatureGFX11FullVGPRs])>;
def FeatureISAVersion11_0_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
+ bool HasGFX11FullVGPRs = false;
bool HasVOPDInsts = false;
// Dummy feature to use for assembler in tablegen.
/// target.
bool hasNullExportTarget() const { return !GFX11Insts; }
+ bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
+
bool hasVOPDInsts() const { return HasVOPDInsts; }
bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
+ if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs))
+ return IsWave32 ? 24 : 12;
+
if (hasGFX10_3Insts(*STI))
return IsWave32 ? 16 : 8;
return 512;
if (!isGFX10Plus(*STI))
return 256;
- return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
+ bool IsWave32 = STI->getFeatureBits().test(FeatureWavefrontSize32);
+ if (STI->getFeatureBits().test(FeatureGFX11FullVGPRs))
+ return IsWave32 ? 1536 : 768;
+ return IsWave32 ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v18, v12, v14, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v19, v13, v15, s0
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, v4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, v5, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v3
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x8
+; GFX11-NEXT: s_clause 0xa
+; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
+; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
+; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160
; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1]
; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[52:55], v64, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112
-; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7
-; GFX11-NEXT: s_clause 0x6
-; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:128
-; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:160
-; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:176
-; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:192
-; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:208
-; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:224
-; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:240
-; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176
+; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:192
+; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:208
+; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:224
+; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:240
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:128
+; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128
; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144
-; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:160
-; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:176
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:192
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160
+; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: global_store_b128 v64, v[44:47], s[2:3] offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: global_store_b128 v64, v[48:51], s[2:3] offset:64
+; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80
+; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:208
+; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:208
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:224
+; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:224
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:240
+; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:240
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s14, 0
; GFX11-NEXT: s_mov_b32 s15, 0x40200000
+; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
; GFX11-NEXT: s_mov_b32 s13, 0x401c0000
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s11, 0x40180000
; GFX11-NEXT: s_mov_b32 s5, 0x40080000
; GFX11-NEXT: s_mov_b32 s4, s14
; GFX11-NEXT: s_mov_b64 s[2:3], 2.0
-; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
+; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2
; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0
+; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1
; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc
; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6
; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 7, v0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s1
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v0
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0
; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo
; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s14, s16
; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
+; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 2, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 3, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v2
; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v1, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0
+; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1
; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc
; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v14 :: v_dual_cndmask_b32 v5, v5, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-NEXT: v_readfirstlane_b32 s10, v10
+; GFX11-NEXT: v_readfirstlane_b32 s11, v11
; GFX11-NEXT: v_readfirstlane_b32 s4, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_cndmask_b32 v7, v7, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
+; GFX11-NEXT: v_readfirstlane_b32 s12, v12
+; GFX11-NEXT: v_readfirstlane_b32 s13, v13
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_cndmask_b32 v9, v9, v15
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX11-NEXT: v_readfirstlane_b32 s8, v8
; GFX11-NEXT: v_readfirstlane_b32 s9, v9
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_cndmask_b32 v11, v11, v15
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
-; GFX11-NEXT: v_readfirstlane_b32 s10, v10
-; GFX11-NEXT: v_readfirstlane_b32 s11, v11
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_cndmask_b32 v13, v13, v15
-; GFX11-NEXT: v_readfirstlane_b32 s12, v12
-; GFX11-NEXT: v_readfirstlane_b32 s13, v13
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <7 x double> %vec, double %val, i32 %idx
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 4
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 3
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
-; GFX11-NEXT: v_readfirstlane_b32 s8, v8
-; GFX11-NEXT: v_readfirstlane_b32 s9, v9
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
+; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_readfirstlane_b32 s4, v4
+; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <5 x double> %vec, double %val, i32 %idx
; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
-; GFX11-NEXT: v_readfirstlane_b32 s8, v8
-; GFX11-NEXT: v_readfirstlane_b32 s9, v9
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-NEXT: v_readfirstlane_b32 s7, v7
+; GFX11-NEXT: v_readfirstlane_b32 s8, v8
+; GFX11-NEXT: v_readfirstlane_b32 s4, v4
+; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s9, v9
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <5 x double> %vec, double %val, i32 %idx
; GFX10CU-WAVE64: NumVgprs: 128
; GFX11WGP-WAVE32: NumVgprs: 256
; GFX11WGP-WAVE64: NumVgprs: 256
-; GFX11CU-WAVE32: NumVgprs: 128
-; GFX11CU-WAVE64: NumVgprs: 128
+; GFX11CU-WAVE32: NumVgprs: 192
+; GFX11CU-WAVE64: NumVgprs: 192
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
; GFX10WGP-WAVE64: NumVgprs: 128
; GFX10CU-WAVE32: NumVgprs: 64
; GFX10CU-WAVE64: NumVgprs: 64
-; GFX11WGP-WAVE32: NumVgprs: 128
-; GFX11WGP-WAVE64: NumVgprs: 128
-; GFX11CU-WAVE32: NumVgprs: 64
-; GFX11CU-WAVE64: NumVgprs: 64
+; GFX11WGP-WAVE32: NumVgprs: 192
+; GFX11WGP-WAVE64: NumVgprs: 192
+; GFX11CU-WAVE32: NumVgprs: 96
+; GFX11CU-WAVE64: NumVgprs: 96
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1101 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1101 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1102 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
; GCN-LABEL: {{^}}max_occupancy:
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @max_occupancy() {
ret void
}
; GFX9: ; Occupancy: 3
; GFX10W64: ; Occupancy: 3
; GFX10W32: ; Occupancy: 4
+; GFX1100W64: ; Occupancy: 3
+; GFX1100W32: ; Occupancy: 5
define amdgpu_kernel void @limited_occupancy_3() #0 {
ret void
}
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 18
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @limited_occupancy_18() #1 {
ret void
}
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 18
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @limited_occupancy_19() #2 {
ret void
}
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_24_vgprs() {
call void asm sideeffect "", "~{v23}" ()
ret void
; GFX1010W64: ; Occupancy: 18
; GFX1010W32: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_28_vgprs() {
call void asm sideeffect "", "~{v27}" ()
ret void
; GFX10W64: ; Occupancy: 16
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_32_vgprs() {
call void asm sideeffect "", "~{v31}" ()
ret void
; GFX1010W32: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 12
; GFX1030W32: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_36_vgprs() {
call void asm sideeffect "", "~{v35}" ()
ret void
; GFX10W64: ; Occupancy: 12
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_40_vgprs() {
call void asm sideeffect "", "~{v39}" ()
ret void
; GFX1010W32: ; Occupancy: 20
; GFX1030W64: ; Occupancy: 10
; GFX1030W32: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_44_vgprs() {
call void asm sideeffect "", "~{v43}" ()
ret void
; GFX10W64: ; Occupancy: 10
; GFX1010W32: ; Occupancy: 20
; GFX1030W32: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_48_vgprs() {
call void asm sideeffect "", "~{v47}" ()
ret void
; GFX10W64: ; Occupancy: 9
; GFX1010W32: ; Occupancy: 18
; GFX1030W32: ; Occupancy: 16
+; GFX1100W64: ; Occupancy: 12
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_56_vgprs() {
call void asm sideeffect "", "~{v55}" ()
ret void
; GFX9: ; Occupancy: 4
; GFX10W64: ; Occupancy: 8
; GFX10W32: ; Occupancy: 16
+; GFX1100W64: ; Occupancy: 10
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_64_vgprs() {
call void asm sideeffect "", "~{v63}" ()
ret void
; GFX10W64: ; Occupancy: 7
; GFX1010W32: ; Occupancy: 14
; GFX1030W32: ; Occupancy: 12
+; GFX1100W64: ; Occupancy: 10
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_72_vgprs() {
call void asm sideeffect "", "~{v71}" ()
ret void
; GFX9: ; Occupancy: 3
; GFX10W64: ; Occupancy: 6
; GFX10W32: ; Occupancy: 12
+; GFX1100W64: ; Occupancy: 9
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_80_vgprs() {
call void asm sideeffect "", "~{v79}" ()
ret void
; GFX1010W32: ; Occupancy: 11
; GFX1030W64: ; Occupancy: 5
; GFX1030W32: ; Occupancy: 10
+; GFX1100W64: ; Occupancy: 9
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_84_vgprs() {
call void asm sideeffect "", "~{v83}" ()
ret void
; GFX10W64: ; Occupancy: 5
; GFX1010W32: ; Occupancy: 11
; GFX1030W32: ; Occupancy: 10
+; GFX1100W64: ; Occupancy: 8
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_88_vgprs() {
call void asm sideeffect "", "~{v87}" ()
ret void
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 5
; GFX10W32: ; Occupancy: 10
+; GFX1100W64: ; Occupancy: 8
+; GFX1100W32: ; Occupancy: 16
define amdgpu_kernel void @used_96_vgprs() {
call void asm sideeffect "", "~{v95}" ()
ret void
; GFX1010W64: ; Occupancy: 5
; GFX1030W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 9
+; GFX1100W64: ; Occupancy: 7
+; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_100_vgprs() {
call void asm sideeffect "", "~{v99}" ()
ret void
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 9
+; GFX1100W64: ; Occupancy: 6
+; GFX1100W32: ; Occupancy: 12
define amdgpu_kernel void @used_112_vgprs() {
call void asm sideeffect "", "~{v111}" ()
ret void
; GFX9: ; Occupancy: 2
; GFX10W64: ; Occupancy: 4
; GFX10W32: ; Occupancy: 8
+; GFX1100W64: ; Occupancy: 5
+; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_128_vgprs() {
call void asm sideeffect "", "~{v127}" ()
ret void
; GFX9: ; Occupancy: 1
; GFX10W64: ; Occupancy: 3
; GFX10W32: ; Occupancy: 7
+; GFX1100W64: ; Occupancy: 5
+; GFX1100W32: ; Occupancy: 10
define amdgpu_kernel void @used_144_vgprs() {
call void asm sideeffect "", "~{v143}" ()
ret void
; GFX10W64: ; Occupancy: 3
; GFX1010W32: ; Occupancy: 6
; GFX1030W32: ; Occupancy: 5
+; GFX1100W64: ; Occupancy: 4
+; GFX1100W32: ; Occupancy: 9
define amdgpu_kernel void @used_168_vgprs() {
call void asm sideeffect "", "~{v167}" ()
ret void
; GFX10W64: ; Occupancy: 2
; GFX1010W32: ; Occupancy: 5
; GFX1030W32: ; Occupancy: 4
+; GFX1100W64: ; Occupancy: 3
+; GFX1100W32: ; Occupancy: 7
define amdgpu_kernel void @used_200_vgprs() {
call void asm sideeffect "", "~{v199}" ()
ret void
; GFX9: ; Occupancy: 1
; GFX10W64: ; Occupancy: 2
; GFX10W32: ; Occupancy: 4
+; GFX1100W64: ; Occupancy: 2
+; GFX1100W32: ; Occupancy: 5
define amdgpu_kernel void @used_256_vgprs() {
call void asm sideeffect "", "~{v255}" ()
ret void
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_80_sgprs() {
call void asm sideeffect "", "~{s79}" ()
ret void
; GFX9: ; Occupancy: 9
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_88_sgprs() {
call void asm sideeffect "", "~{s87}" ()
ret void
; GFX9: ; Occupancy: 8
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_100_sgprs() {
call void asm sideeffect "", "~{s99}" ()
ret void
; GFX9: ; Occupancy: 7
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
define amdgpu_kernel void @used_101_sgprs() {
call void asm sideeffect "", "~{s100}" ()
ret void
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
@lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6552() {
%p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)*
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6556() {
%p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)*
; GFX9: ; Occupancy: 10
; GFX1010: ; Occupancy: 20
; GFX1030: ; Occupancy: 16
+; GFX1100: ; Occupancy: 16
@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
define amdgpu_kernel void @used_lds_13112() {
%p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
; GFX9: ; Occupancy: 7{{$}}
; GFX10W64: ; Occupancy: 7{{$}}
; GFX10W32: ; Occupancy: 14{{$}}
+; GFX1100W64: ; Occupancy: 7{{$}}
+; GFX1100W32: ; Occupancy: 14{{$}}
@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
+; GFX1100W64: ; Occupancy: 14{{$}}
+; GFX1100W32: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GFX10W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
; GFX1030W32: ; Occupancy: 16{{$}}
+; GFX1100W64: ; Occupancy: 14{{$}}
+; GFX1100W32: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
+; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
+; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
+; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GFX9: ; Occupancy: 10{{$}}
; GFX1010: ; Occupancy: 20{{$}}
; GFX1030: ; Occupancy: 16{{$}}
+; GFX1100: ; Occupancy: 16{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
; GFX9: ; Occupancy: 7{{$}}
; GFX10: ; Occupancy: 7{{$}}
+; GFX1100: ; Occupancy: 7{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p