struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
struct ir3_shader_variant *v = ir3_kernel->v;
+ const unsigned *local_size = kernel->local_size;
const struct ir3_info *i = &v->info;
enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
- OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1);
- OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
+ OUT_REG(ring, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
+ .isammode = ISAMMODE_GL,
+ .shared_consts_enable = false));
OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_ENABLE, 1);
OUT_RING(ring, A6XX_SP_PERFCTR_ENABLE_CS);
COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
COND(ir3_kernel->info.early_preamble, A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE) |
A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
+ if (CHIP == A7XX) { /* A7XX-only HLSQ setup: HLSQ_FS_CNTL_0 is programmed
+  * with a fixed THREAD64 thread size (not the variant-derived thrsz),
+  * and HLSQ_CONTROL_2..5 are written with constant dwords.
+  * NOTE(review): the 0xfc bytes look like "unused register" ids
+  * (i.e. regid(63, 0)) -- confirm against the register database. */
+ OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
+
+ OUT_REG(ring, HLSQ_CONTROL_2_REG(CHIP, .dword = 0xfcfcfcfc),
+ HLSQ_CONTROL_3_REG(CHIP, .dword = 0xfcfcfcfc),
+ HLSQ_CONTROL_4_REG(CHIP, .dword = 0xfcfcfcfc),
+ HLSQ_CONTROL_5_REG(CHIP, .dword = 0x0000fc00), );
+ }
OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(1) |
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
+ } else { /* Branch that writes the A7XX layout of HLSQ_CS_CNTL_1
+  * (presumably the non-A6XX path; the governing condition is above).
+  * YALIGN is set to the largest of 8, 4, 2, 1 that evenly divides
+  * local_size[1]; thrsz is the same value used for the other
+  * THREADSIZE fields in this function. */
+ enum a7xx_cs_yalign yalign = (local_size[1] % 8 == 0) ? CS_YALIGN_8
+ : (local_size[1] % 4 == 0) ? CS_YALIGN_4
+ : (local_size[1] % 2 == 0) ? CS_YALIGN_2
+ : CS_YALIGN_1;
+
+ OUT_REG(ring, A7XX_HLSQ_CS_CNTL_1(.linearlocalidregid = regid(63, 0),
+ .threadsize = thrsz,
+ .unk11 = true,
+ .unk22 = true,
+ .yalign = yalign, ));
}
if (CHIP == A7XX || a6xx_backend->info->a6xx.has_lpac) {
- OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
+ OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 1); /* single dword: SP_CS_CNTL_1 is
+  * no longer part of this packet; it is written separately below
+  * through the CHIP-parameterized SP_CS_CNTL_1() packer. */
OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
+ OUT_REG(ring,
+ SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
+ .threadsize = thrsz, ));
}
OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
.localsizey = local_size[1] - 1,
.localsizez = local_size[2] - 1,
));
+ if (CHIP == A7XX) {
+ OUT_REG(ring, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = local_size[0] - 1,
+ .localsizey = local_size[1] - 1,
+ .localsizez = local_size[2] - 1, ));
+ }
+
OUT_REG(ring, HLSQ_CS_NDRANGE_1(CHIP,
.globalsize_x = local_size[0] * num_groups[0],
));
<bitfield name="LOCALIDREGID" low="24" high="31" type="a3xx_regid"/>
</reg32>
<!-- new in a6xx gen4, matches HLSQ_CS_CNTL_1 -->
- <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" usage="cmd">
+ <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A6XX" usage="cmd">
<!-- gl_LocalInvocationIndex -->
<bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
<!-- a650 has 6 "SP cores" (but 3 "SP"). this makes it use only
<bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
<!-- 1 thread per wave (ignored if bit9 set) -->
<bitfield name="THREADSIZE_SCALAR" pos="10" type="boolean"/>
+ </reg32>
+
+ <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A7XX-" usage="cmd">
+ <!-- gl_LocalInvocationIndex -->
+ <bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
+ <!-- Must match SP_CS_CTRL -->
+ <bitfield name="THREADSIZE" pos="8" type="a6xx_threadsize"/>
+ <!-- 1 thread per wave (would hang if THREAD128 is also set) -->
+ <bitfield name="THREADSIZE_SCALAR" pos="9" type="boolean"/>
<!-- Affects getone. If enabled, getone sometimes executed 1? less times
than there are subgroups.
-->
- <bitfield name="UNK15" pos="15" type="boolean" variants="A7XX"/>
+ <bitfield name="UNK15" pos="15" type="boolean"/>
</reg32>
<!-- TODO: two 64kb aligned addresses at a9d0/a9d2 -->
<reg32 offset="0xa9dd" name="HLSQ_CS_KERNEL_GROUP_Y" variants="A7XX-" usage="rp_blit"/>
<reg32 offset="0xa9de" name="HLSQ_CS_KERNEL_GROUP_Z" variants="A7XX-" usage="rp_blit"/>
- <reg32 offset="0xa9db" name="HLSQ_CS_UNKNOWN_A9DB" variants="A7XX-" usage="rp_blit">
+ <enum name="a7xx_cs_yalign"> <!-- Values for the YALIGN bitfield. Note the
+      encoding is inverted relative to the name: CS_YALIGN_1 is 8,
+      CS_YALIGN_2 is 4, CS_YALIGN_4 is 2 and CS_YALIGN_8 is 1. -->
+ <value name="CS_YALIGN_1" value="8"/>
+ <value name="CS_YALIGN_2" value="4"/>
+ <value name="CS_YALIGN_4" value="2"/>
+ <value name="CS_YALIGN_8" value="1"/>
+ </enum>
+
+ <reg32 offset="0xa9db" name="HLSQ_CS_CNTL_1" variants="A7XX-" usage="rp_blit">
+ <!-- gl_LocalInvocationIndex -->
<bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
+ <!-- Must match SP_CS_CTRL -->
<bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
<bitfield name="UNK11" pos="11" type="boolean"/>
<bitfield name="UNK22" pos="22" type="boolean"/>
- <bitfield name="UNK27" low="27" high="30" type="uint" variants="A7XX"/>
- <!-- TODO: other bits -->
+ <bitfield name="UNK26" pos="26" type="boolean"/>
+ <bitfield name="YALIGN" low="27" high="30" type="a7xx_cs_yalign"/>
</reg32>
<reg32 offset="0xa9df" name="HLSQ_CS_LOCAL_SIZE" variants="A7XX-" usage="cmd">