freedreno: Make it possible to specify A7XX feature flags

author Danylo Piliaiev <dpiliaiev@igalia.com>

Tue, 5 Sep 2023 13:44:49 +0000 (15:44 +0200)

committer Marge Bot <emma+marge@anholt.net>

Wed, 4 Oct 2023 15:51:54 +0000 (15:51 +0000)
author Danylo Piliaiev <dpiliaiev@igalia.com>
Tue, 5 Sep 2023 13:44:49 +0000 (15:44 +0200)
committer Marge Bot <emma+marge@anholt.net>
Wed, 4 Oct 2023 15:51:54 +0000 (15:51 +0000)
diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h

index defd43a..5dda81a 100644 (file)
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -62,135 +62,136 @@ struct fd_dev_info {
        uint32_t num_ccu;
     };
  
-   union {
+   struct {
+      uint32_t reg_size_vec4;
+
+      /* The size (in instrlen units (128 bytes)) of instruction cache where
+       * we preload a shader. Loading more than this could trigger a hang
+       * on gen3 and later.
+       */
+      uint32_t instr_cache_size;
+
+      bool has_hw_multiview;
+
+      bool has_fs_tex_prefetch;
+
+      /* Whether the PC_MULTIVIEW_MASK register exists. */
+      bool supports_multiview_mask;
+
+      /* info for setting RB_CCU_CNTL */
+      bool concurrent_resolve;
+      bool has_z24uint_s8uint;
+
+      bool tess_use_shared;
+
+      /* Does the hw support GL_QCOM_shading_rate? */
+      bool has_shading_rate;
+
+      /* newer a6xx allows using 16-bit descriptor for both 16-bit
+       * and 32-bit access
+       */
+      bool storage_16bit;
+
+      /* The latest known a630_sqe.fw fails to wait for WFI before
+       * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
+       * so we have to fall back to CP_WAIT_FOR_ME except for a650
+       * which has a fixed firmware.
+       *
+       * TODO: There may be newer a630_sqe.fw released in the future
+       * which fixes this, if so we should detect it and avoid this
+       * workaround.  Once we have uapi to query fw version, we can
+       * replace this with minimum fw version.
+       */
+      bool indirect_draw_wfm_quirk;
+
+      /* On some GPUs, the depth test needs to be enabled when the
+       * depth bounds test is enabled and the depth attachment uses UBWC.
+       */
+      bool depth_bounds_require_depth_test_quirk;
+
+      bool has_tex_filter_cubic;
+      bool has_separate_chroma_filter;
+
+      bool has_sample_locations;
+
+      /* The firmware on newer a6xx drops CP_REG_WRITE support as we
+       * can now use direct register writes for these regs.
+       */
+      bool has_cp_reg_write;
+
+      bool has_8bpp_ubwc;
+
+      bool has_lpac;
+
+      bool has_getfiberid;
+
+      bool has_dp2acc;
+      bool has_dp4acc;
+
+      /* LRZ fast-clear works on all gens, however blob disables it on
+       * gen1 and gen2. We also elect to disable fast-clear on these gens
+       * because for close to none gains it adds complexity and seem to work
+       * a bit differently from gen3+. Which creates at least one edge case:
+       * if first draw which uses LRZ fast-clear doesn't lock LRZ direction
+       * the fast-clear value is undefined. For details see
+       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
+       */
+      bool enable_lrz_fast_clear;
+      bool has_lrz_dir_tracking;
+      bool lrz_track_quirk;
+
+      /* Some generations have a bit to add the multiview index to the
+       * viewport index, which lets us implement different scaling for
+       * different views.
+       */
+      bool has_per_view_viewport;
+      bool has_gmem_fast_clear;
+
+      /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
+       * in sysmem rendering. */
+      uint32_t sysmem_per_ccu_cache_size;
+      /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
+       * which require color cache (non-BLIT event case).
+       * The size is expressed as a fraction of ccu cache used by sysmem
+       * rendering. If a GMEM resolve requires color cache, the driver needs
+       * to make sure it will not overwrite pixel data in GMEM that is still
+       * needed.
+       */
+      /* see enum a6xx_ccu_color_cache_size */
+      uint32_t gmem_ccu_color_cache_fraction;
+
+      /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
+      uint32_t prim_alloc_threshold;
+
+      uint32_t vs_max_inputs_count;
+
+      bool supports_double_threadsize;
+
+      bool has_sampler_minmax;
        struct {
-         uint32_t reg_size_vec4;
-
-         /* The size (in instrlen units (128 bytes)) of instruction cache where
-          * we preload a shader. Loading more than this could trigger a hang
-          * on gen3 and later.
-          */
-         uint32_t instr_cache_size;
-
-         bool has_hw_multiview;
-
-         bool has_fs_tex_prefetch;
-
-         /* Whether the PC_MULTIVIEW_MASK register exists. */
-         bool supports_multiview_mask;
-
-         /* info for setting RB_CCU_CNTL */
-         bool concurrent_resolve;
-         bool has_z24uint_s8uint;
-
-         bool tess_use_shared;
-
-         /* Does the hw support GL_QCOM_shading_rate? */
-         bool has_shading_rate;
-
-         /* newer a6xx allows using 16-bit descriptor for both 16-bit
-          * and 32-bit access
-          */
-         bool storage_16bit;
-
-         /* The latest known a630_sqe.fw fails to wait for WFI before
-          * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
-          * so we have to fall back to CP_WAIT_FOR_ME except for a650
-          * which has a fixed firmware.
-          *
-          * TODO: There may be newer a630_sqe.fw released in the future
-          * which fixes this, if so we should detect it and avoid this
-          * workaround.  Once we have uapi to query fw version, we can
-          * replace this with minimum fw version.
-          */
-         bool indirect_draw_wfm_quirk;
-
-         /* On some GPUs, the depth test needs to be enabled when the
-          * depth bounds test is enabled and the depth attachment uses UBWC.
-          */
-         bool depth_bounds_require_depth_test_quirk;
-
-         bool has_tex_filter_cubic;
-         bool has_separate_chroma_filter;
-
-         bool has_sample_locations;
-
-         /* The firmware on newer a6xx drops CP_REG_WRITE support as we
-          * can now use direct register writes for these regs.
-          */
-         bool has_cp_reg_write;
-
-         bool has_8bpp_ubwc;
-
-         bool has_lpac;
-
-         bool has_getfiberid;
-
-         bool has_dp2acc;
-         bool has_dp4acc;
-
-         /* LRZ fast-clear works on all gens, however blob disables it on
-          * gen1 and gen2. We also elect to disable fast-clear on these gens
-          * because for close to none gains it adds complexity and seem to work
-          * a bit differently from gen3+. Which creates at least one edge case:
-          * if first draw which uses LRZ fast-clear doesn't lock LRZ direction
-          * the fast-clear value is undefined. For details see
-          * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
-          */
-         bool enable_lrz_fast_clear;
-         bool has_lrz_dir_tracking;
-         bool lrz_track_quirk;
-
-         /* Some generations have a bit to add the multiview index to the
-          * viewport index, which lets us implement different scaling for
-          * different views.
-          */
-         bool has_per_view_viewport;
-         bool has_gmem_fast_clear;
-
-         /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
-          * in sysmem rendering. */
-         uint32_t sysmem_per_ccu_cache_size;
-         /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
-          * which require color cache (non-BLIT event case).
-          * The size is expressed as a fraction of ccu cache used by sysmem
-          * rendering. If a GMEM resolve requires color cache, the driver needs
-          * to make sure it will not overwrite pixel data in GMEM that is still
-          * needed.
-          */
-         /* see enum a6xx_ccu_color_cache_size */
-         uint32_t gmem_ccu_color_cache_fraction;
-
-         /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
-         uint32_t prim_alloc_threshold;
-
-         uint32_t vs_max_inputs_count;
-
-         bool supports_double_threadsize;
-
-         bool has_sampler_minmax;
-         struct {
-            uint32_t PC_POWER_CNTL;
-            uint32_t TPL1_DBG_ECO_CNTL;
-            uint32_t GRAS_DBG_ECO_CNTL;
-            uint32_t SP_CHICKEN_BITS;
-            uint32_t UCHE_CLIENT_PF;
-            uint32_t PC_MODE_CNTL;
-            uint32_t SP_DBG_ECO_CNTL;
-            uint32_t RB_DBG_ECO_CNTL;
-            uint32_t RB_DBG_ECO_CNTL_blit;
-            uint32_t HLSQ_DBG_ECO_CNTL;
-            uint32_t RB_UNKNOWN_8E01;
-            uint32_t VPC_DBG_ECO_CNTL;
-            uint32_t UCHE_UNKNOWN_0E12;
-         } magic;
-
-         struct {
-               uint32_t reg;
-               uint32_t value;
-         } magic_raw[32];
-      } a6xx;
-   };
+         uint32_t PC_POWER_CNTL;
+         uint32_t TPL1_DBG_ECO_CNTL;
+         uint32_t GRAS_DBG_ECO_CNTL;
+         uint32_t SP_CHICKEN_BITS;
+         uint32_t UCHE_CLIENT_PF;
+         uint32_t PC_MODE_CNTL;
+         uint32_t SP_DBG_ECO_CNTL;
+         uint32_t RB_DBG_ECO_CNTL;
+         uint32_t RB_DBG_ECO_CNTL_blit;
+         uint32_t HLSQ_DBG_ECO_CNTL;
+         uint32_t RB_UNKNOWN_8E01;
+         uint32_t VPC_DBG_ECO_CNTL;
+         uint32_t UCHE_UNKNOWN_0E12;
+      } magic;
+
+      struct {
+            uint32_t reg;
+            uint32_t value;
+      } magic_raw[32];
+   } a6xx;
+
+   struct {
+   } a7xx;
  };
  
  struct fd_dev_id {
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py

index 07eb244..04695d3 100644 (file)
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -158,6 +158,8 @@ class A6xxGPUInfo(GPUInfo):
          self.num_ccu = num_ccu
  
          self.a6xx = Struct()
+        self.a7xx = Struct()
+
          self.a6xx.magic = Struct()
  
          for name, val in magic_regs.items():
@@ -183,10 +185,10 @@ class A6xxGPUInfo(GPUInfo):
  
          self.a6xx.vs_max_inputs_count = 32
  
-        for name, val in template.items():
-            if name == "magic": # handled above
-                continue
-            setattr(self.a6xx, name, val)
+        templates = template if type(template) is list else [template]
+        for template in templates:
+            template.apply_props(self)
+
  
      def __str__(self):
       return super(A6xxGPUInfo, self).__str__().replace('[', '{').replace("]", "}")
@@ -296,12 +298,27 @@ add_gpus([
          fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
      ))
  
+
+class A6XXProps(dict):
+    def apply_props(self, gpu_info):
+        for name, val in self.items():
+            if name == "magic":
+                continue
+            setattr(gpu_info.a6xx, name, val)
+
+
+class A7XXProps(dict):
+    def apply_props(self, gpu_info):
+        for name, val in self.items():
+            setattr(gpu_info.a7xx, name, val)
+
+
  # a6xx can be divided into distinct sub-generations, where certain device-
  # info parameters are keyed to the sub-generation.  These templates reduce
  # the copypaste
  
  # a615, a616, a618, a619, a620 and a630:
-a6xx_gen1 = dict(
+a6xx_gen1 = A6XXProps(
          reg_size_vec4 = 96,
          instr_cache_size = 64,
          concurrent_resolve = False,
@@ -311,7 +328,7 @@ a6xx_gen1 = dict(
      )
  
  # a605, a608, a610, 612
-a6xx_gen1_low = {**a6xx_gen1, **dict(
+a6xx_gen1_low = A6XXProps({**a6xx_gen1, **A6XXProps(
          has_gmem_fast_clear = False,
          reg_size_vec4 = 48,
          has_hw_multiview = False,
@@ -321,10 +338,10 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
          gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
          vs_max_inputs_count = 16,
          supports_double_threadsize = False,
-)}
+)})
  
  # a640, a680:
-a6xx_gen2 = dict(
+a6xx_gen2 = A6XXProps(
          reg_size_vec4 = 96,
          instr_cache_size = 64, # TODO
          supports_multiview_mask = True,
@@ -337,7 +354,7 @@ a6xx_gen2 = dict(
      )
  
  # a650:
-a6xx_gen3 = dict(
+a6xx_gen3 = A6XXProps(
          reg_size_vec4 = 64,
          # Blob limits it to 128 but we hang with 128
          instr_cache_size = 127,
@@ -358,7 +375,7 @@ a6xx_gen3 = dict(
      )
  
  # a635, a660:
-a6xx_gen4 = dict(
+a6xx_gen4 = A6XXProps(
          reg_size_vec4 = 64,
          # Blob limits it to 128 but we hang with 128
          instr_cache_size = 127,
@@ -685,12 +702,16 @@ add_gpus([
          )
      ))
  
+a7xx_730 = A7XXProps()
+
+a7xx_740 = A7XXProps()
+
  add_gpus([
          GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data
          GPUId(chip_id=0xffff07030001, name="FD730"), # Default no-speedbin fallback
      ], A6xxGPUInfo(
          CHIP.A7XX,
-        a6xx_gen4,
+        [a6xx_gen4, a7xx_730],
          num_ccu = 4,
          tile_align_w = 64,
          tile_align_h = 32,
@@ -746,7 +767,7 @@ add_gpus([
          GPUId(chip_id=0xffff43050a01, name="FD740"), # Default no-speedbin fallback
      ], A6xxGPUInfo(
          CHIP.A7XX,
-        a6xx_gen4,
+        [a6xx_gen4, a7xx_740],
          num_ccu = 6,
          tile_align_w = 64,
          tile_align_h = 32,
author	Danylo Piliaiev <dpiliaiev@igalia.com>
	Tue, 5 Sep 2023 13:44:49 +0000 (15:44 +0200)
committer	Marge Bot <emma+marge@anholt.net>
	Wed, 4 Oct 2023 15:51:54 +0000 (15:51 +0000)
src/freedreno/common/freedreno_dev_info.h		patch \| blob \| history
src/freedreno/common/freedreno_devices.py		patch \| blob \| history