freedreno/computerator: Fix remaining issues with A7XX
author Danylo Piliaiev <dpiliaiev@igalia.com>
Mon, 17 Apr 2023 13:39:09 +0000 (15:39 +0200)
committer Marge Bot <emma+marge@anholt.net>
Tue, 5 Sep 2023 16:19:29 +0000 (16:19 +0000)
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23217>

src/freedreno/computerator/a6xx.cc
src/freedreno/computerator/meson.build
src/freedreno/registers/adreno/a6xx.xml

index d3ff32d..93a3de2 100644 (file)
@@ -119,11 +119,13 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
    struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
    struct ir3_shader_variant *v = ir3_kernel->v;
+   const unsigned *local_size = kernel->local_size;
    const struct ir3_info *i = &v->info;
    enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
 
-   OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1);
-   OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
+   OUT_REG(ring, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
+                                      .isammode = ISAMMODE_GL,
+                                      .shared_consts_enable = false));
 
    OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_ENABLE, 1);
    OUT_RING(ring, A6XX_SP_PERFCTR_ENABLE_CS);
@@ -168,6 +170,14 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
                COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
                COND(ir3_kernel->info.early_preamble, A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE) |
                A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
+   if (CHIP == A7XX) {
+      OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
+
+      OUT_REG(ring, HLSQ_CONTROL_2_REG(CHIP, .dword = 0xfcfcfcfc),
+              HLSQ_CONTROL_3_REG(CHIP, .dword = 0xfcfcfcfc),
+              HLSQ_CONTROL_4_REG(CHIP, .dword = 0xfcfcfcfc),
+              HLSQ_CONTROL_5_REG(CHIP, .dword = 0x0000fc00), );
+   }
 
    OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
    OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(1) |
@@ -192,16 +202,28 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
                         A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
       OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                         A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
+   } else {
+      enum a7xx_cs_yalign yalign = (local_size[1] % 8 == 0)   ? CS_YALIGN_8
+                                   : (local_size[1] % 4 == 0) ? CS_YALIGN_4
+                                   : (local_size[1] % 2 == 0) ? CS_YALIGN_2
+                                                              : CS_YALIGN_1;
+
+      OUT_REG(ring, A7XX_HLSQ_CS_CNTL_1(.linearlocalidregid = regid(63, 0),
+                                        .threadsize = thrsz,
+                                        .unk11 = true,
+                                        .unk22 = true,
+                                        .yalign = yalign, ));
    }
 
    if (CHIP == A7XX || a6xx_backend->info->a6xx.has_lpac) {
-      OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
+      OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 1);
       OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
                         A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
                         A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
                         A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
-      OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
-                        A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
+      OUT_REG(ring,
+         SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
+                            .threadsize = thrsz, ));
    }
 
    OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
@@ -463,6 +485,12 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
                     .localsizey = local_size[1] - 1,
                     .localsizez = local_size[2] - 1,
                  ));
+   if (CHIP == A7XX) {
+      OUT_REG(ring, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = local_size[0] - 1,
+                                            .localsizey = local_size[1] - 1,
+                                            .localsizez = local_size[2] - 1, ));
+   }
+
    OUT_REG(ring, HLSQ_CS_NDRANGE_1(CHIP,
                     .globalsize_x = local_size[0] * num_groups[0],
                  ));
index dcdfd80..d0d21e5 100644 (file)
@@ -29,8 +29,13 @@ computerator_files = [
 
 computerator_cpp_args = cpp.get_supported_arguments([
   '-Wno-sign-compare',
+  '-Wno-array-bounds',
 ])
 
+if meson.is_cross_build()
+  computerator_cpp_args += '-Wno-array-bounds'
+endif
+
 computerator = executable(
   'computerator',
   computerator_files,
index 9fb5728..c01a5b3 100644 (file)
@@ -3684,7 +3684,7 @@ to upconvert to 32b float internally?
                <bitfield name="LOCALIDREGID" low="24" high="31" type="a3xx_regid"/>
        </reg32>
        <!-- new in a6xx gen4, matches HLSQ_CS_CNTL_1 -->
-       <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" usage="cmd">
+       <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A6XX" usage="cmd">
                <!-- gl_LocalInvocationIndex -->
                <bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
                <!-- a650 has 6 "SP cores" (but 3 "SP"). this makes it use only
@@ -3694,11 +3694,20 @@ to upconvert to 32b float internally?
                <bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
                <!-- 1 thread per wave (ignored if bit9 set) -->
                <bitfield name="THREADSIZE_SCALAR" pos="10" type="boolean"/>
+       </reg32>
+
+       <reg32 offset="0xa9c3" name="SP_CS_CNTL_1" variants="A7XX-" usage="cmd">
+               <!-- gl_LocalInvocationIndex -->
+               <bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
+               <!-- Must match SP_CS_CTRL -->
+               <bitfield name="THREADSIZE" pos="8" type="a6xx_threadsize"/>
+               <!-- 1 thread per wave (would hang if THREAD128 is also set) -->
+               <bitfield name="THREADSIZE_SCALAR" pos="9" type="boolean"/>
 
                <!-- Affects getone. If enabled, getone sometimes executed 1? less times
                     than there are subgroups.
                 -->
-               <bitfield name="UNK15" pos="15" type="boolean" variants="A7XX"/>
+               <bitfield name="UNK15" pos="15" type="boolean"/>
        </reg32>
 
        <!-- TODO: two 64kb aligned addresses at a9d0/a9d2 -->
@@ -4146,13 +4155,22 @@ to upconvert to 32b float internally?
        <reg32 offset="0xa9dd" name="HLSQ_CS_KERNEL_GROUP_Y" variants="A7XX-" usage="rp_blit"/>
        <reg32 offset="0xa9de" name="HLSQ_CS_KERNEL_GROUP_Z" variants="A7XX-" usage="rp_blit"/>
 
-       <reg32 offset="0xa9db" name="HLSQ_CS_UNKNOWN_A9DB" variants="A7XX-" usage="rp_blit">
+       <enum name="a7xx_cs_yalign">
+               <value name="CS_YALIGN_1" value="8"/>
+               <value name="CS_YALIGN_2" value="4"/>
+               <value name="CS_YALIGN_4" value="2"/>
+               <value name="CS_YALIGN_8" value="1"/>
+       </enum>
+
+       <reg32 offset="0xa9db" name="HLSQ_CS_CNTL_1" variants="A7XX-" usage="rp_blit">
+               <!-- gl_LocalInvocationIndex -->
                <bitfield name="LINEARLOCALIDREGID" low="0" high="7" type="a3xx_regid"/>
+               <!-- Must match SP_CS_CTRL -->
                <bitfield name="THREADSIZE" pos="9" type="a6xx_threadsize"/>
                <bitfield name="UNK11" pos="11" type="boolean"/>
                <bitfield name="UNK22" pos="22" type="boolean"/>
-               <bitfield name="UNK27" low="27" high="30" type="uint" variants="A7XX"/>
-               <!-- TODO: other bits -->
+               <bitfield name="UNK26" pos="26" type="boolean"/>
+               <bitfield name="YALIGN" low="27" high="30" type="a7xx_cs_yalign"/>
        </reg32>
 
        <reg32 offset="0xa9df" name="HLSQ_CS_LOCAL_SIZE" variants="A7XX-" usage="cmd">