From: Danylo Piliaiev Date: Thu, 2 Feb 2023 16:19:54 +0000 (+0100) Subject: freedreno,turnip: Make CS shared memory size configurable X-Git-Tag: upstream/23.3.3~5675 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4a43ab3019232f323117c9d067bf57b5955096c5;p=platform%2Fupstream%2Fmesa.git freedreno,turnip: Make CS shared memory size configurable a610 and similar models have a smaller shared memory size. Signed-off-by: Danylo Piliaiev Part-of: --- diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index 29ff2bb..5a8cd3a 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -49,6 +49,8 @@ struct fd_dev_info { uint32_t num_vsc_pipes; + uint32_t cs_shared_mem_size; + /* number of CCU is always equal to the number of SP */ union { uint32_t num_sp_cores; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index b445c23..14db3f8 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -108,7 +108,8 @@ class GPUInfo(Struct): """ def __init__(self, chip, gmem_align_w, gmem_align_h, tile_align_w, tile_align_h, - tile_max_w, tile_max_h, num_vsc_pipes): + tile_max_w, tile_max_h, num_vsc_pipes, + cs_shared_mem_size): self.chip = chip.value self.gmem_align_w = gmem_align_w self.gmem_align_h = gmem_align_h @@ -117,6 +118,7 @@ class GPUInfo(Struct): self.tile_max_w = tile_max_w self.tile_max_h = tile_max_h self.num_vsc_pipes = num_vsc_pipes + self.cs_shared_mem_size = cs_shared_mem_size s.gpu_infos.append(self) @@ -126,13 +128,14 @@ class A6xxGPUInfo(GPUInfo): into distinct sub-generations. The template parameter avoids duplication of parameters that are unique to the sub-generation. 
""" - def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs): + def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, magic_regs): super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4, tile_align_w = tile_align_w, tile_align_h = tile_align_h, tile_max_w = 1024, # max_bitfield_val(5, 0, 5) tile_max_h = max_bitfield_val(14, 8, 4), - num_vsc_pipes = num_vsc_pipes) + num_vsc_pipes = num_vsc_pipes, + cs_shared_mem_size = cs_shared_mem_size) # The # of SP cores seems to always match # of CCU self.num_sp_cores = num_ccu self.num_ccu = num_ccu @@ -174,6 +177,7 @@ add_gpus([ tile_max_w = 512, tile_max_h = ~0, # TODO num_vsc_pipes = 8, + cs_shared_mem_size = 0, )) add_gpus([ @@ -188,6 +192,7 @@ add_gpus([ tile_max_w = 992, # max_bitfield_val(4, 0, 5) tile_max_h = max_bitfield_val(9, 5, 5), num_vsc_pipes = 8, + cs_shared_mem_size = 32 * 1024, )) add_gpus([ @@ -201,6 +206,7 @@ add_gpus([ tile_max_w = 1024, # max_bitfield_val(4, 0, 5) tile_max_h = max_bitfield_val(9, 5, 5), num_vsc_pipes = 8, + cs_shared_mem_size = 32 * 1024, )) add_gpus([ @@ -218,6 +224,7 @@ add_gpus([ tile_max_w = 1024, # max_bitfield_val(7, 0, 5) tile_max_h = max_bitfield_val(16, 9, 5), num_vsc_pipes = 16, + cs_shared_mem_size = 32 * 1024, )) # a6xx can be divided into distinct sub-generations, where certain device- @@ -307,6 +314,7 @@ add_gpus([ tile_align_w = 32, tile_align_h = 32, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -333,6 +341,7 @@ add_gpus([ tile_align_w = 32, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x01008000, @@ -359,6 +368,7 @@ add_gpus([ tile_align_w = 32, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -385,6 +395,7 @@ 
add_gpus([ tile_align_w = 32, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00008000, @@ -411,6 +422,7 @@ add_gpus([ tile_align_w = 64, tile_align_h = 32, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 3, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -437,6 +449,7 @@ add_gpus([ tile_align_w = 96, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 2, # this seems to be a chicken bit that fixes cubic filtering: @@ -468,6 +481,7 @@ add_gpus([ tile_align_w = 32, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x05008000, @@ -494,6 +508,7 @@ add_gpus([ tile_align_w = 96, tile_align_h = 16, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 2, TPL1_DBG_ECO_CNTL = 0x05008000, @@ -520,6 +535,7 @@ add_gpus([ tile_align_w = 64, tile_align_h = 32, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict( PC_POWER_CNTL = 7, TPL1_DBG_ECO_CNTL = 0x01008000, @@ -548,6 +564,7 @@ add_gpus([ tile_align_w = 64, tile_align_h = 32, num_vsc_pipes = 32, + cs_shared_mem_size = 32 * 1024, magic_regs = dict() )) diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index ffeb564..272a308 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -145,8 +145,6 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->is_64bit = fd_dev_64b(dev_id); compiler->options = *options; - /* All known GPU's have 32k local memory (aka shared) */ - compiler->local_mem_size = 32 * 1024; /* TODO see if older GPU's were different here */ compiler->branchstack_size = 64; compiler->wave_granularity = 2; @@ -156,6 +154,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, const struct 
fd_dev_info *dev_info = fd_dev_info(compiler->dev_id); + compiler->local_mem_size = dev_info->cs_shared_mem_size; + if (compiler->gen >= 6) { compiler->samgq_workaround = true; /* a6xx split the pipeline state into geometry and fragment state, in diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index dad9b97..09ce58e 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1092,7 +1092,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxFragmentOutputAttachments = 8, .maxFragmentDualSrcAttachments = 1, .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2, - .maxComputeSharedMemorySize = 32768, + .maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, .maxComputeWorkGroupInvocations = 2048, .maxComputeWorkGroupSize = { 1024, 1024, 1024 }, diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index eb48868..1503330 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -823,7 +823,7 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type, RET((uint64_t[]){screen->ram_size}); case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - RET((uint64_t[]){32768}); + RET((uint64_t[]){screen->info->cs_shared_mem_size}); case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: