From f6ea7c3a99744db6e76b5fe1de03d5f23b2f5434 Mon Sep 17 00:00:00 2001
From: Emma Anholt
Date: Thu, 27 Jul 2023 11:11:26 -0700
Subject: [PATCH] freedreno/devices: Move fibers_per_sp to the common info
 struct.

We'll need it for pvt mem on other GPUs.

Part-of:
---
 src/freedreno/common/freedreno_dev_info.h         |  6 ++--
 src/freedreno/common/freedreno_devices.py         | 35 ++++++++++++++++-------
 src/freedreno/computerator/a6xx.cc                |  2 +-
 src/freedreno/vulkan/tu_pipeline.cc               |  2 +-
 src/gallium/drivers/freedreno/a6xx/fd6_program.cc |  2 +-
 5 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 4599a8a..49f0d20 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -53,6 +53,9 @@ struct fd_dev_info {
 
    int wave_granularity;
 
+   /* Information for private memory calculations */
+   uint32_t fibers_per_sp;
+
    /* number of CCU is always equal to the number of SP */
    union {
       uint32_t num_sp_cores;
@@ -61,9 +64,6 @@
 
    union {
      struct {
-         /* Information for private memory calculations */
-         uint32_t fibers_per_sp;
-
          uint32_t reg_size_vec4;
 
          /* The size (in instrlen units (128 bytes)) of instruction cache where
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index dd14d24..ebe48c7 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -109,7 +109,7 @@ class GPUInfo(Struct):
     def __init__(self, chip, gmem_align_w, gmem_align_h,
                  tile_align_w, tile_align_h,
                  tile_max_w, tile_max_h, num_vsc_pipes,
-                 cs_shared_mem_size, wave_granularity):
+                 cs_shared_mem_size, wave_granularity, fibers_per_sp):
         self.chip = chip.value
         self.gmem_align_w = gmem_align_w
         self.gmem_align_h = gmem_align_h
@@ -120,6 +120,7 @@ class GPUInfo(Struct):
         self.num_vsc_pipes = num_vsc_pipes
         self.cs_shared_mem_size = cs_shared_mem_size
         self.wave_granularity = wave_granularity
+        self.fibers_per_sp = fibers_per_sp
 
         s.gpu_infos.append(self)
 
@@ -129,7 +130,7 @@ class A6xxGPUInfo(GPUInfo):
        into distinct sub-generations.  The template parameter avoids
        duplication of parameters that are unique to the sub-generation.
""" - def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, wave_granularity, magic_regs): + def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, wave_granularity, fibers_per_sp, magic_regs): super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4, tile_align_w = tile_align_w, tile_align_h = tile_align_h, @@ -137,7 +138,8 @@ class A6xxGPUInfo(GPUInfo): tile_max_h = max_bitfield_val(14, 8, 4), num_vsc_pipes = num_vsc_pipes, cs_shared_mem_size = cs_shared_mem_size, - wave_granularity = wave_granularity) + wave_granularity = wave_granularity, + fibers_per_sp = fibers_per_sp) # The # of SP cores seems to always match # of CCU self.num_sp_cores = num_ccu self.num_ccu = num_ccu @@ -185,7 +187,8 @@ add_gpus([ tile_max_h = ~0, # TODO num_vsc_pipes = 8, cs_shared_mem_size = 0, - wave_granularity = 2 + wave_granularity = 2, + fibers_per_sp = 0, # TODO )) add_gpus([ @@ -201,7 +204,8 @@ add_gpus([ tile_max_h = max_bitfield_val(9, 5, 5), num_vsc_pipes = 8, cs_shared_mem_size = 32 * 1024, - wave_granularity = 2 + wave_granularity = 2, + fibers_per_sp = 0, # TODO )) add_gpus([ @@ -216,7 +220,8 @@ add_gpus([ tile_max_h = max_bitfield_val(9, 5, 5), num_vsc_pipes = 8, cs_shared_mem_size = 32 * 1024, - wave_granularity = 2 + wave_granularity = 2, + fibers_per_sp = 0, # TODO )) add_gpus([ @@ -235,7 +240,8 @@ add_gpus([ tile_max_h = max_bitfield_val(16, 9, 5), num_vsc_pipes = 16, cs_shared_mem_size = 32 * 1024, - wave_granularity = 2 + wave_granularity = 2, + fibers_per_sp = 0, # TODO )) # a6xx can be divided into distinct sub-generations, where certain device- @@ -244,7 +250,6 @@ add_gpus([ # a615, a616, a618, a619, a620 and a630: a6xx_gen1 = dict( - fibers_per_sp = 128 * 16, reg_size_vec4 = 96, instr_cache_size = 64, concurrent_resolve = False, @@ -268,7 +273,6 @@ a6xx_gen1_low = {**a6xx_gen1, **dict( # a640, a680: a6xx_gen2 = dict( - fibers_per_sp = 128 * 4 * 16, reg_size_vec4 = 96, instr_cache_size = 64, # TODO supports_multiview_mask = True, @@ -282,7 +286,6 @@ a6xx_gen2 = dict( # a650: a6xx_gen3 = dict( - fibers_per_sp = 128 * 2 * 16, reg_size_vec4 = 64, # Blob limits it to 128 but we hang with 128 instr_cache_size = 127, @@ -304,7 +307,6 @@ a6xx_gen3 = dict( # a635, a660: a6xx_gen4 = dict( - fibers_per_sp = 128 * 2 * 16, reg_size_vec4 = 64, # Blob limits it to 128 but we hang with 128 instr_cache_size = 127, @@ -342,6 +344,7 @@ add_gpus([ num_vsc_pipes = 16, cs_shared_mem_size = 16 * 1024, wave_granularity = 1, + fibers_per_sp = 128 * 16, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0, @@ -373,6 +376,7 @@ add_gpus([ num_vsc_pipes = 32, cs_shared_mem_size = 32 * 1024, wave_granularity = 2, + fibers_per_sp = 128 * 16, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -401,6 +405,7 @@ add_gpus([ num_vsc_pipes = 32, cs_shared_mem_size = 32 * 1024, wave_granularity = 2, + fibers_per_sp = 128 * 16, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x01008000, @@ -429,6 +434,7 @@ add_gpus([ num_vsc_pipes = 32, cs_shared_mem_size = 32 * 1024, wave_granularity = 2, + fibers_per_sp = 128 * 16, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -457,6 +463,7 @@ add_gpus([ num_vsc_pipes = 32, cs_shared_mem_size = 32 * 1024, wave_granularity = 2, + fibers_per_sp = 128 * 4 * 16, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00008000, @@ -485,6 +492,7 @@ add_gpus([ num_vsc_pipes = 32, cs_shared_mem_size = 32 * 1024, 
         wave_granularity = 2,
+        fibers_per_sp = 128 * 4 * 16,
         magic_regs = dict(
             PC_POWER_CNTL = 3,
             TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -513,6 +521,7 @@ add_gpus([
         num_vsc_pipes = 32,
         cs_shared_mem_size = 32 * 1024,
         wave_granularity = 2,
+        fibers_per_sp = 128 * 2 * 16,
         magic_regs = dict(
             PC_POWER_CNTL = 2,
             # this seems to be a chicken bit that fixes cubic filtering:
@@ -546,6 +555,7 @@ add_gpus([
         num_vsc_pipes = 32,
         cs_shared_mem_size = 32 * 1024,
         wave_granularity = 2,
+        fibers_per_sp = 128 * 2 * 16,
         magic_regs = dict(
             PC_POWER_CNTL = 1,
             TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -574,6 +584,7 @@ add_gpus([
         num_vsc_pipes = 32,
         cs_shared_mem_size = 32 * 1024,
         wave_granularity = 2,
+        fibers_per_sp = 128 * 2 * 16,
         magic_regs = dict(
             PC_POWER_CNTL = 2,
             TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -602,6 +613,7 @@ add_gpus([
         num_vsc_pipes = 32,
         cs_shared_mem_size = 32 * 1024,
         wave_granularity = 2,
+        fibers_per_sp = 128 * 2 * 16,
         magic_regs = dict(
             PC_POWER_CNTL = 7,
             TPL1_DBG_ECO_CNTL = 0x01008000,
@@ -632,6 +644,7 @@ add_gpus([
         num_vsc_pipes = 32,
         cs_shared_mem_size = 32 * 1024,
         wave_granularity = 2,
+        fibers_per_sp = 128 * 2 * 16,
         magic_regs = dict()
     ))
 
diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc
index ce6d01a..b36ddc9 100644
--- a/src/freedreno/computerator/a6xx.cc
+++ b/src/freedreno/computerator/a6xx.cc
@@ -217,7 +217,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    if (v->pvtmem_size > 0) {
       uint32_t per_fiber_size = v->pvtmem_size;
       uint32_t per_sp_size =
-         ALIGN(per_fiber_size * a6xx_backend->info->a6xx.fibers_per_sp, 1 << 12);
+         ALIGN(per_fiber_size * a6xx_backend->info->fibers_per_sp, 1 << 12);
       uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores;
 
       struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem");
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index e67c458..b9ef9de 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -1983,7 +1983,7 @@ tu_setup_pvtmem(struct tu_device *dev,
       util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
 
    pvtmem_bo->per_sp_size =
       ALIGN(pvtmem_bo->per_fiber_size *
-               dev->physical_device->info->a6xx.fibers_per_sp,
+               dev->physical_device->info->fibers_per_sp,
             1 << 12);
    uint32_t total_size =
       dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
index af8033c..e9012e6 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc
@@ -104,7 +104,7 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
    fd_emit_string5(ring, name, strlen(name));
 #endif
 
-   uint32_t fibers_per_sp = ctx->screen->info->a6xx.fibers_per_sp;
+   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
    uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;
 
    uint32_t per_fiber_size = so->pvtmem_size;
-- 
2.7.4
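
Reviewer note (after the trailer, so git-am ignores it): all three call
sites touched above size private memory the same way, which is what makes
fibers_per_sp a property of the device rather than of the a6xx
sub-generation: the per-SP slice is per_fiber_size * fibers_per_sp rounded
up to a 4 KB page, and the BO holds one slice per SP. Below is a minimal
standalone C sketch of that arithmetic. ALIGN_POT is a local stand-in for
Mesa's ALIGN macro, and the fibers_per_sp/num_sp_cores/per_fiber_size
values are illustrative, not taken from any specific device entry.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for Mesa's ALIGN(): round v up to a multiple of the
 * power-of-two alignment a. */
#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   /* Illustrative values only: a gen1-like part (128 * 16 fibers per SP,
    * 2 SPs) running a shader with 512 bytes of private memory per fiber. */
   uint32_t fibers_per_sp = 128 * 16;
   uint32_t num_sp_cores = 2;
   uint32_t per_fiber_size = 512;

   /* Each SP's slice covers all of its fibers, padded out to a 4 KB page;
    * the buffer then holds one slice per SP. */
   uint32_t per_sp_size = ALIGN_POT(per_fiber_size * fibers_per_sp, 1u << 12);
   uint32_t total_size = per_sp_size * num_sp_cores;

   printf("per_sp_size = %" PRIu32 ", total_size = %" PRIu32 "\n",
          per_sp_size, total_size);
   return 0;
}

With those inputs, per_sp_size comes out to 1048576 bytes (already
page-aligned) and total_size to 2097152 bytes. Nothing in the formula is
a6xx-specific, which is why hoisting fibers_per_sp into the common info
struct lets other GPUs reuse the same pvtmem sizing, per the commit
message.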