From e39b6e2b9b332ebc367b71fad2e4d4e51ca330c4 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev <dpiliaiev@igalia.com>
Date: Tue, 5 Sep 2023 15:44:49 +0200
Subject: [PATCH] freedreno: Make it possible to specify A7XX feature flags

Previously the idea was for each generation to have a unique list
of feature flags; now it makes more sense for a new generation
to define only its new flags and "inherit" the older gen flags.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25086>
---
 src/freedreno/common/freedreno_dev_info.h | 257 +++++++++++++++---------------
 src/freedreno/common/freedreno_devices.py |  45 ++++--
 2 files changed, 162 insertions(+), 140 deletions(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index defd43a..5dda81a 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -62,135 +62,136 @@ struct fd_dev_info {
       uint32_t num_ccu;
    };
 
-   union {
+   struct {
+      uint32_t reg_size_vec4;
+
+      /* The size (in instrlen units (128 bytes)) of instruction cache where
+       * we preload a shader. Loading more than this could trigger a hang
+       * on gen3 and later.
+       */
+      uint32_t instr_cache_size;
+
+      bool has_hw_multiview;
+
+      bool has_fs_tex_prefetch;
+
+      /* Whether the PC_MULTIVIEW_MASK register exists. */
+      bool supports_multiview_mask;
+
+      /* info for setting RB_CCU_CNTL */
+      bool concurrent_resolve;
+      bool has_z24uint_s8uint;
+
+      bool tess_use_shared;
+
+      /* Does the hw support GL_QCOM_shading_rate? */
+      bool has_shading_rate;
+
+      /* newer a6xx allows using 16-bit descriptor for both 16-bit
+       * and 32-bit access
+       */
+      bool storage_16bit;
+
+      /* The latest known a630_sqe.fw fails to wait for WFI before
+       * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
+       * so we have to fall back to CP_WAIT_FOR_ME except for a650
+       * which has a fixed firmware.
+       *
+       * TODO: There may be newer a630_sqe.fw released in the future
+       * which fixes this, if so we should detect it and avoid this
+       * workaround.  Once we have uapi to query fw version, we can
+       * replace this with minimum fw version.
+       */
+      bool indirect_draw_wfm_quirk;
+
+      /* On some GPUs, the depth test needs to be enabled when the
+       * depth bounds test is enabled and the depth attachment uses UBWC.
+       */
+      bool depth_bounds_require_depth_test_quirk;
+
+      bool has_tex_filter_cubic;
+      bool has_separate_chroma_filter;
+
+      bool has_sample_locations;
+
+      /* The firmware on newer a6xx drops CP_REG_WRITE support as we
+       * can now use direct register writes for these regs.
+       */
+      bool has_cp_reg_write;
+
+      bool has_8bpp_ubwc;
+
+      bool has_lpac;
+
+      bool has_getfiberid;
+
+      bool has_dp2acc;
+      bool has_dp4acc;
+
+      /* LRZ fast-clear works on all gens, however blob disables it on
+       * gen1 and gen2. We also elect to disable fast-clear on these gens
+       * because for close to none gains it adds complexity and seem to work
+       * a bit differently from gen3+. Which creates at least one edge case:
+       * if first draw which uses LRZ fast-clear doesn't lock LRZ direction
+       * the fast-clear value is undefined. For details see
+       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
+       */
+      bool enable_lrz_fast_clear;
+      bool has_lrz_dir_tracking;
+      bool lrz_track_quirk;
+
+      /* Some generations have a bit to add the multiview index to the
+       * viewport index, which lets us implement different scaling for
+       * different views.
+       */
+      bool has_per_view_viewport;
+      bool has_gmem_fast_clear;
+
+      /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
+       * in sysmem rendering. */
+      uint32_t sysmem_per_ccu_cache_size;
+      /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
+       * which require color cache (non-BLIT event case).
+       * The size is expressed as a fraction of ccu cache used by sysmem
+       * rendering. If a GMEM resolve requires color cache, the driver needs
+       * to make sure it will not overwrite pixel data in GMEM that is still
+       * needed.
+       */
+      /* see enum a6xx_ccu_color_cache_size */
+      uint32_t gmem_ccu_color_cache_fraction;
+
+      /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
+      uint32_t prim_alloc_threshold;
+
+      uint32_t vs_max_inputs_count;
+
+      bool supports_double_threadsize;
+
+      bool has_sampler_minmax;
       struct {
-         uint32_t reg_size_vec4;
-
-         /* The size (in instrlen units (128 bytes)) of instruction cache where
-          * we preload a shader. Loading more than this could trigger a hang
-          * on gen3 and later.
-          */
-         uint32_t instr_cache_size;
-
-         bool has_hw_multiview;
-
-         bool has_fs_tex_prefetch;
-
-         /* Whether the PC_MULTIVIEW_MASK register exists. */
-         bool supports_multiview_mask;
-
-         /* info for setting RB_CCU_CNTL */
-         bool concurrent_resolve;
-         bool has_z24uint_s8uint;
-
-         bool tess_use_shared;
-
-         /* Does the hw support GL_QCOM_shading_rate? */
-         bool has_shading_rate;
-
-         /* newer a6xx allows using 16-bit descriptor for both 16-bit
-          * and 32-bit access
-          */
-         bool storage_16bit;
-
-         /* The latest known a630_sqe.fw fails to wait for WFI before
-          * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
-          * so we have to fall back to CP_WAIT_FOR_ME except for a650
-          * which has a fixed firmware.
-          *
-          * TODO: There may be newer a630_sqe.fw released in the future
-          * which fixes this, if so we should detect it and avoid this
-          * workaround.  Once we have uapi to query fw version, we can
-          * replace this with minimum fw version.
-          */
-         bool indirect_draw_wfm_quirk;
-
-         /* On some GPUs, the depth test needs to be enabled when the
-          * depth bounds test is enabled and the depth attachment uses UBWC.
-          */
-         bool depth_bounds_require_depth_test_quirk;
-
-         bool has_tex_filter_cubic;
-         bool has_separate_chroma_filter;
-
-         bool has_sample_locations;
-
-         /* The firmware on newer a6xx drops CP_REG_WRITE support as we
-          * can now use direct register writes for these regs.
-          */
-         bool has_cp_reg_write;
-
-         bool has_8bpp_ubwc;
-
-         bool has_lpac;
-
-         bool has_getfiberid;
-
-         bool has_dp2acc;
-         bool has_dp4acc;
-
-         /* LRZ fast-clear works on all gens, however blob disables it on
-          * gen1 and gen2. We also elect to disable fast-clear on these gens
-          * because for close to none gains it adds complexity and seem to work
-          * a bit differently from gen3+. Which creates at least one edge case:
-          * if first draw which uses LRZ fast-clear doesn't lock LRZ direction
-          * the fast-clear value is undefined. For details see
-          * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
-          */
-         bool enable_lrz_fast_clear;
-         bool has_lrz_dir_tracking;
-         bool lrz_track_quirk;
-
-         /* Some generations have a bit to add the multiview index to the
-          * viewport index, which lets us implement different scaling for
-          * different views.
-          */
-         bool has_per_view_viewport;
-         bool has_gmem_fast_clear;
-
-         /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
-          * in sysmem rendering. */
-         uint32_t sysmem_per_ccu_cache_size;
-         /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
-          * which require color cache (non-BLIT event case).
-          * The size is expressed as a fraction of ccu cache used by sysmem
-          * rendering. If a GMEM resolve requires color cache, the driver needs
-          * to make sure it will not overwrite pixel data in GMEM that is still
-          * needed.
-          */
-         /* see enum a6xx_ccu_color_cache_size */
-         uint32_t gmem_ccu_color_cache_fraction;
-
-         /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
-         uint32_t prim_alloc_threshold;
-
-         uint32_t vs_max_inputs_count;
-
-         bool supports_double_threadsize;
-
-         bool has_sampler_minmax;
-         struct {
-            uint32_t PC_POWER_CNTL;
-            uint32_t TPL1_DBG_ECO_CNTL;
-            uint32_t GRAS_DBG_ECO_CNTL;
-            uint32_t SP_CHICKEN_BITS;
-            uint32_t UCHE_CLIENT_PF;
-            uint32_t PC_MODE_CNTL;
-            uint32_t SP_DBG_ECO_CNTL;
-            uint32_t RB_DBG_ECO_CNTL;
-            uint32_t RB_DBG_ECO_CNTL_blit;
-            uint32_t HLSQ_DBG_ECO_CNTL;
-            uint32_t RB_UNKNOWN_8E01;
-            uint32_t VPC_DBG_ECO_CNTL;
-            uint32_t UCHE_UNKNOWN_0E12;
-         } magic;
-
-         struct {
-               uint32_t reg;
-               uint32_t value;
-         } magic_raw[32];
-      } a6xx;
-   };
+         uint32_t PC_POWER_CNTL;
+         uint32_t TPL1_DBG_ECO_CNTL;
+         uint32_t GRAS_DBG_ECO_CNTL;
+         uint32_t SP_CHICKEN_BITS;
+         uint32_t UCHE_CLIENT_PF;
+         uint32_t PC_MODE_CNTL;
+         uint32_t SP_DBG_ECO_CNTL;
+         uint32_t RB_DBG_ECO_CNTL;
+         uint32_t RB_DBG_ECO_CNTL_blit;
+         uint32_t HLSQ_DBG_ECO_CNTL;
+         uint32_t RB_UNKNOWN_8E01;
+         uint32_t VPC_DBG_ECO_CNTL;
+         uint32_t UCHE_UNKNOWN_0E12;
+      } magic;
+
+      struct {
+            uint32_t reg;
+            uint32_t value;
+      } magic_raw[32];
+   } a6xx;
+
+   struct {
+   } a7xx;
 };
 
 struct fd_dev_id {
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 07eb244..04695d3 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -158,6 +158,8 @@ class A6xxGPUInfo(GPUInfo):
         self.num_ccu = num_ccu
 
         self.a6xx = Struct()
+        self.a7xx = Struct()
+
         self.a6xx.magic = Struct()
 
         for name, val in magic_regs.items():
@@ -183,10 +185,10 @@ class A6xxGPUInfo(GPUInfo):
 
         self.a6xx.vs_max_inputs_count = 32
 
-        for name, val in template.items():
-            if name == "magic": # handled above
-                continue
-            setattr(self.a6xx, name, val)
+        templates = template if type(template) is list else [template]
+        for template in templates:
+            template.apply_props(self)
+
 
     def __str__(self):
      return super(A6xxGPUInfo, self).__str__().replace('[', '{').replace("]", "}")
@@ -296,12 +298,27 @@ add_gpus([
         fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
     ))
 
+
+class A6XXProps(dict):
+    def apply_props(self, gpu_info):
+        for name, val in self.items():
+            if name == "magic":
+                continue
+            setattr(gpu_info.a6xx, name, val)
+
+
+class A7XXProps(dict):
+    def apply_props(self, gpu_info):
+        for name, val in self.items():
+            setattr(gpu_info.a7xx, name, val)
+
+
 # a6xx can be divided into distinct sub-generations, where certain device-
 # info parameters are keyed to the sub-generation.  These templates reduce
 # the copypaste
 
 # a615, a616, a618, a619, a620 and a630:
-a6xx_gen1 = dict(
+a6xx_gen1 = A6XXProps(
         reg_size_vec4 = 96,
         instr_cache_size = 64,
         concurrent_resolve = False,
@@ -311,7 +328,7 @@ a6xx_gen1 = dict(
     )
 
 # a605, a608, a610, 612
-a6xx_gen1_low = {**a6xx_gen1, **dict(
+a6xx_gen1_low = A6XXProps({**a6xx_gen1, **A6XXProps(
         has_gmem_fast_clear = False,
         reg_size_vec4 = 48,
         has_hw_multiview = False,
@@ -321,10 +338,10 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
         gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
         vs_max_inputs_count = 16,
         supports_double_threadsize = False,
-)}
+)})
 
 # a640, a680:
-a6xx_gen2 = dict(
+a6xx_gen2 = A6XXProps(
         reg_size_vec4 = 96,
         instr_cache_size = 64, # TODO
         supports_multiview_mask = True,
@@ -337,7 +354,7 @@ a6xx_gen2 = dict(
     )
 
 # a650:
-a6xx_gen3 = dict(
+a6xx_gen3 = A6XXProps(
         reg_size_vec4 = 64,
         # Blob limits it to 128 but we hang with 128
         instr_cache_size = 127,
@@ -358,7 +375,7 @@ a6xx_gen3 = dict(
     )
 
 # a635, a660:
-a6xx_gen4 = dict(
+a6xx_gen4 = A6XXProps(
         reg_size_vec4 = 64,
         # Blob limits it to 128 but we hang with 128
         instr_cache_size = 127,
@@ -685,12 +702,16 @@ add_gpus([
         )
     ))
 
+a7xx_730 = A7XXProps()
+
+a7xx_740 = A7XXProps()
+
 add_gpus([
         GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data
         GPUId(chip_id=0xffff07030001, name="FD730"), # Default no-speedbin fallback
     ], A6xxGPUInfo(
         CHIP.A7XX,
-        a6xx_gen4,
+        [a6xx_gen4, a7xx_730],
         num_ccu = 4,
         tile_align_w = 64,
         tile_align_h = 32,
@@ -746,7 +767,7 @@ add_gpus([
         GPUId(chip_id=0xffff43050a01, name="FD740"), # Default no-speedbin fallback
     ], A6xxGPUInfo(
         CHIP.A7XX,
-        a6xx_gen4,
+        [a6xx_gen4, a7xx_740],
         num_ccu = 6,
         tile_align_w = 64,
         tile_align_h = 32,
-- 
2.7.4