uint32_t num_ccu;
};
- union {
+ struct {
+ uint32_t reg_size_vec4;
+
+ /* The size (in instrlen units (128 bytes)) of instruction cache where
+ * we preload a shader. Loading more than this could trigger a hang
+ * on gen3 and later.
+ */
+ uint32_t instr_cache_size;
+
+ bool has_hw_multiview;
+
+ bool has_fs_tex_prefetch;
+
+ /* Whether the PC_MULTIVIEW_MASK register exists. */
+ bool supports_multiview_mask;
+
+ /* info for setting RB_CCU_CNTL */
+ bool concurrent_resolve;
+ bool has_z24uint_s8uint;
+
+ bool tess_use_shared;
+
+ /* Does the hw support GL_QCOM_shading_rate? */
+ bool has_shading_rate;
+
+ /* newer a6xx allows using 16-bit descriptor for both 16-bit
+ * and 32-bit access
+ */
+ bool storage_16bit;
+
+ /* The latest known a630_sqe.fw fails to wait for WFI before
+ * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
+ * so we have to fall back to CP_WAIT_FOR_ME except for a650
+ * which has a fixed firmware.
+ *
+ * TODO: There may be newer a630_sqe.fw released in the future
+ * which fixes this, if so we should detect it and avoid this
+ * workaround. Once we have uapi to query fw version, we can
+ * replace this with minimum fw version.
+ */
+ bool indirect_draw_wfm_quirk;
+
+ /* On some GPUs, the depth test needs to be enabled when the
+ * depth bounds test is enabled and the depth attachment uses UBWC.
+ */
+ bool depth_bounds_require_depth_test_quirk;
+
+ bool has_tex_filter_cubic;
+ bool has_separate_chroma_filter;
+
+ bool has_sample_locations;
+
+ /* The firmware on newer a6xx drops CP_REG_WRITE support as we
+ * can now use direct register writes for these regs.
+ */
+ bool has_cp_reg_write;
+
+ bool has_8bpp_ubwc;
+
+ bool has_lpac;
+
+ bool has_getfiberid;
+
+ bool has_dp2acc;
+ bool has_dp4acc;
+
+ /* LRZ fast-clear works on all gens, however blob disables it on
+ * gen1 and gen2. We also elect to disable fast-clear on these gens
+ * because for close to no gain it adds complexity and seems to work
+ * a bit differently from gen3+, which creates at least one edge case:
+ * if the first draw which uses LRZ fast-clear doesn't lock LRZ direction
+ * the fast-clear value is undefined. For details see
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
+ */
+ bool enable_lrz_fast_clear;
+ bool has_lrz_dir_tracking;
+ bool lrz_track_quirk;
+
+ /* Some generations have a bit to add the multiview index to the
+ * viewport index, which lets us implement different scaling for
+ * different views.
+ */
+ bool has_per_view_viewport;
+ bool has_gmem_fast_clear;
+
+ /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
+ * in sysmem rendering. */
+ uint32_t sysmem_per_ccu_cache_size;
+ /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
+ * which require color cache (non-BLIT event case).
+ * The size is expressed as a fraction of ccu cache used by sysmem
+ * rendering. If a GMEM resolve requires color cache, the driver needs
+ * to make sure it will not overwrite pixel data in GMEM that is still
+ * needed.
+ */
+ /* see enum a6xx_ccu_color_cache_size */
+ uint32_t gmem_ccu_color_cache_fraction;
+
+ /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
+ uint32_t prim_alloc_threshold;
+
+ uint32_t vs_max_inputs_count;
+
+ bool supports_double_threadsize;
+
+ bool has_sampler_minmax;
struct {
- uint32_t reg_size_vec4;
-
- /* The size (in instrlen units (128 bytes)) of instruction cache where
- * we preload a shader. Loading more than this could trigger a hang
- * on gen3 and later.
- */
- uint32_t instr_cache_size;
-
- bool has_hw_multiview;
-
- bool has_fs_tex_prefetch;
-
- /* Whether the PC_MULTIVIEW_MASK register exists. */
- bool supports_multiview_mask;
-
- /* info for setting RB_CCU_CNTL */
- bool concurrent_resolve;
- bool has_z24uint_s8uint;
-
- bool tess_use_shared;
-
- /* Does the hw support GL_QCOM_shading_rate? */
- bool has_shading_rate;
-
- /* newer a6xx allows using 16-bit descriptor for both 16-bit
- * and 32-bit access
- */
- bool storage_16bit;
-
- /* The latest known a630_sqe.fw fails to wait for WFI before
- * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
- * so we have to fall back to CP_WAIT_FOR_ME except for a650
- * which has a fixed firmware.
- *
- * TODO: There may be newer a630_sqe.fw released in the future
- * which fixes this, if so we should detect it and avoid this
- * workaround. Once we have uapi to query fw version, we can
- * replace this with minimum fw version.
- */
- bool indirect_draw_wfm_quirk;
-
- /* On some GPUs, the depth test needs to be enabled when the
- * depth bounds test is enabled and the depth attachment uses UBWC.
- */
- bool depth_bounds_require_depth_test_quirk;
-
- bool has_tex_filter_cubic;
- bool has_separate_chroma_filter;
-
- bool has_sample_locations;
-
- /* The firmware on newer a6xx drops CP_REG_WRITE support as we
- * can now use direct register writes for these regs.
- */
- bool has_cp_reg_write;
-
- bool has_8bpp_ubwc;
-
- bool has_lpac;
-
- bool has_getfiberid;
-
- bool has_dp2acc;
- bool has_dp4acc;
-
- /* LRZ fast-clear works on all gens, however blob disables it on
- * gen1 and gen2. We also elect to disable fast-clear on these gens
- * because for close to none gains it adds complexity and seem to work
- * a bit differently from gen3+. Which creates at least one edge case:
- * if first draw which uses LRZ fast-clear doesn't lock LRZ direction
- * the fast-clear value is undefined. For details see
- * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
- */
- bool enable_lrz_fast_clear;
- bool has_lrz_dir_tracking;
- bool lrz_track_quirk;
-
- /* Some generations have a bit to add the multiview index to the
- * viewport index, which lets us implement different scaling for
- * different views.
- */
- bool has_per_view_viewport;
- bool has_gmem_fast_clear;
-
- /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
- * in sysmem rendering. */
- uint32_t sysmem_per_ccu_cache_size;
- /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
- * which require color cache (non-BLIT event case).
- * The size is expressed as a fraction of ccu cache used by sysmem
- * rendering. If a GMEM resolve requires color cache, the driver needs
- * to make sure it will not overwrite pixel data in GMEM that is still
- * needed.
- */
- /* see enum a6xx_ccu_color_cache_size */
- uint32_t gmem_ccu_color_cache_fraction;
-
- /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
- uint32_t prim_alloc_threshold;
-
- uint32_t vs_max_inputs_count;
-
- bool supports_double_threadsize;
-
- bool has_sampler_minmax;
- struct {
- uint32_t PC_POWER_CNTL;
- uint32_t TPL1_DBG_ECO_CNTL;
- uint32_t GRAS_DBG_ECO_CNTL;
- uint32_t SP_CHICKEN_BITS;
- uint32_t UCHE_CLIENT_PF;
- uint32_t PC_MODE_CNTL;
- uint32_t SP_DBG_ECO_CNTL;
- uint32_t RB_DBG_ECO_CNTL;
- uint32_t RB_DBG_ECO_CNTL_blit;
- uint32_t HLSQ_DBG_ECO_CNTL;
- uint32_t RB_UNKNOWN_8E01;
- uint32_t VPC_DBG_ECO_CNTL;
- uint32_t UCHE_UNKNOWN_0E12;
- } magic;
-
- struct {
- uint32_t reg;
- uint32_t value;
- } magic_raw[32];
- } a6xx;
- };
+ uint32_t PC_POWER_CNTL;
+ uint32_t TPL1_DBG_ECO_CNTL;
+ uint32_t GRAS_DBG_ECO_CNTL;
+ uint32_t SP_CHICKEN_BITS;
+ uint32_t UCHE_CLIENT_PF;
+ uint32_t PC_MODE_CNTL;
+ uint32_t SP_DBG_ECO_CNTL;
+ uint32_t RB_DBG_ECO_CNTL;
+ uint32_t RB_DBG_ECO_CNTL_blit;
+ uint32_t HLSQ_DBG_ECO_CNTL;
+ uint32_t RB_UNKNOWN_8E01;
+ uint32_t VPC_DBG_ECO_CNTL;
+ uint32_t UCHE_UNKNOWN_0E12;
+ } magic;
+
+ struct {
+ uint32_t reg;
+ uint32_t value;
+ } magic_raw[32];
+ } a6xx;
+
+ struct {
+ } a7xx;
};
struct fd_dev_id {
self.num_ccu = num_ccu
self.a6xx = Struct()
+ self.a7xx = Struct()
+
self.a6xx.magic = Struct()
for name, val in magic_regs.items():
self.a6xx.vs_max_inputs_count = 32
- for name, val in template.items():
- if name == "magic": # handled above
- continue
- setattr(self.a6xx, name, val)
+ templates = template if type(template) is list else [template]
+ for template in templates:
+ template.apply_props(self)
+
def __str__(self):
return super(A6xxGPUInfo, self).__str__().replace('[', '{').replace("]", "}")
fibers_per_sp = 64 * 16, # Lowest number that didn't fault on spillall fs-varying-array-mat4-col-row-rd.
))
+
+class A6XXProps(dict):  # dict of a6xx property name -> value; apply_props() copies them onto gpu_info.a6xx
+ def apply_props(self, gpu_info):  # gpu_info: object exposing an `a6xx` attribute that accepts setattr()
+ for name, val in self.items():
+ if name == "magic":  # "magic" is never copied here; presumably the caller fills a6xx.magic itself -- verify
+ continue
+ setattr(gpu_info.a6xx, name, val)  # every other key becomes (or overwrites) an attribute of the same name
+
+
+class A7XXProps(dict):  # dict of a7xx property name -> value; apply_props() copies them onto gpu_info.a7xx
+ def apply_props(self, gpu_info):  # gpu_info: object exposing an `a7xx` attribute that accepts setattr()
+ for name, val in self.items():
+ setattr(gpu_info.a7xx, name, val)  # no key is skipped here, unlike A6XXProps.apply_props
+
+
# a6xx can be divided into distinct sub-generations, where certain device-
# info parameters are keyed to the sub-generation. These templates reduce
# the copypaste
# a615, a616, a618, a619, a620 and a630:
-a6xx_gen1 = dict(
+a6xx_gen1 = A6XXProps(
reg_size_vec4 = 96,
instr_cache_size = 64,
concurrent_resolve = False,
)
# a605, a608, a610, 612
-a6xx_gen1_low = {**a6xx_gen1, **dict(
+a6xx_gen1_low = A6XXProps({**a6xx_gen1, **A6XXProps(
has_gmem_fast_clear = False,
reg_size_vec4 = 48,
has_hw_multiview = False,
gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
vs_max_inputs_count = 16,
supports_double_threadsize = False,
-)}
+)})
# a640, a680:
-a6xx_gen2 = dict(
+a6xx_gen2 = A6XXProps(
reg_size_vec4 = 96,
instr_cache_size = 64, # TODO
supports_multiview_mask = True,
)
# a650:
-a6xx_gen3 = dict(
+a6xx_gen3 = A6XXProps(
reg_size_vec4 = 64,
# Blob limits it to 128 but we hang with 128
instr_cache_size = 127,
)
# a635, a660:
-a6xx_gen4 = dict(
+a6xx_gen4 = A6XXProps(
reg_size_vec4 = 64,
# Blob limits it to 128 but we hang with 128
instr_cache_size = 127,
)
))
+a7xx_730 = A7XXProps()
+
+a7xx_740 = A7XXProps()
+
add_gpus([
GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data
GPUId(chip_id=0xffff07030001, name="FD730"), # Default no-speedbin fallback
], A6xxGPUInfo(
CHIP.A7XX,
- a6xx_gen4,
+ [a6xx_gen4, a7xx_730],
num_ccu = 4,
tile_align_w = 64,
tile_align_h = 32,
GPUId(chip_id=0xffff43050a01, name="FD740"), # Default no-speedbin fallback
], A6xxGPUInfo(
CHIP.A7XX,
- a6xx_gen4,
+ [a6xx_gen4, a7xx_740],
num_ccu = 6,
tile_align_w = 64,
tile_align_h = 32,