freedreno,turnip: Make CS shared memory size configurable
author     Danylo Piliaiev <dpiliaiev@igalia.com>
           Thu, 2 Feb 2023 16:19:54 +0000 (17:19 +0100)
committer  Marge Bot <emma+marge@anholt.net>
           Thu, 13 Jul 2023 18:06:36 +0000 (18:06 +0000)
a610 and similar models have a smaller shared memory size, so stop
hard-coding 32 KiB and take the value from the per-GPU device info
instead.
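
For illustration, a hypothetical entry in freedreno_devices.py for such a
GPU could then declare the smaller size directly. This is only a sketch:
the chip id, sub-generation template, CCU/VSC counts, and the 16 KiB value
below are assumptions, not part of this change:

    add_gpus([
            GPUId(610),  # assumed chip id; a610 support is not added here
        ], A6xxGPUInfo(
            CHIP.A6XX,
            a6xx_gen1,               # assumed sub-generation template
            num_ccu = 1,
            tile_align_w = 32,
            tile_align_h = 16,
            num_vsc_pipes = 16,
            # assumed: half of the 32 KiB used by the other a6xx entries
            cs_shared_mem_size = 16 * 1024,
            magic_regs = dict(),
        ))

Every place that previously hard-coded 32768 then picks the value up from
fd_dev_info, as the hunks below show.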

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>

src/freedreno/common/freedreno_dev_info.h
src/freedreno/common/freedreno_devices.py
src/freedreno/ir3/ir3_compiler.c
src/freedreno/vulkan/tu_device.cc
src/gallium/drivers/freedreno/freedreno_screen.c

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 29ff2bb..5a8cd3a 100644
@@ -49,6 +49,8 @@ struct fd_dev_info {
 
    uint32_t num_vsc_pipes;
 
+   uint32_t cs_shared_mem_size;
+
    /* number of CCU is always equal to the number of SP */
    union {
       uint32_t num_sp_cores;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index b445c23..14db3f8 100644
@@ -108,7 +108,8 @@ class GPUInfo(Struct):
     """
     def __init__(self, chip, gmem_align_w, gmem_align_h,
                  tile_align_w, tile_align_h,
-                 tile_max_w, tile_max_h, num_vsc_pipes):
+                 tile_max_w, tile_max_h, num_vsc_pipes,
+                 cs_shared_mem_size):
         self.chip          = chip.value
         self.gmem_align_w  = gmem_align_w
         self.gmem_align_h  = gmem_align_h
@@ -117,6 +118,7 @@ class GPUInfo(Struct):
         self.tile_max_w    = tile_max_w
         self.tile_max_h    = tile_max_h
         self.num_vsc_pipes = num_vsc_pipes
+        self.cs_shared_mem_size = cs_shared_mem_size
 
         s.gpu_infos.append(self)
 
@@ -126,13 +128,14 @@ class A6xxGPUInfo(GPUInfo):
        into distinct sub-generations.  The template parameter avoids
        duplication of parameters that are unique to the sub-generation.
     """
-    def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs):
+    def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, magic_regs):
         super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
                          tile_align_w = tile_align_w,
                          tile_align_h = tile_align_h,
                          tile_max_w   = 1024, # max_bitfield_val(5, 0, 5)
                          tile_max_h   = max_bitfield_val(14, 8, 4),
-                         num_vsc_pipes = num_vsc_pipes)
+                         num_vsc_pipes = num_vsc_pipes,
+                         cs_shared_mem_size = cs_shared_mem_size)
         # The # of SP cores seems to always match # of CCU
         self.num_sp_cores = num_ccu
         self.num_ccu = num_ccu
@@ -174,6 +177,7 @@ add_gpus([
         tile_max_w   = 512,
         tile_max_h   = ~0, # TODO
         num_vsc_pipes = 8,
+        cs_shared_mem_size = 0,
     ))
 
 add_gpus([
@@ -188,6 +192,7 @@ add_gpus([
         tile_max_w   = 992, # max_bitfield_val(4, 0, 5)
         tile_max_h   = max_bitfield_val(9, 5, 5),
         num_vsc_pipes = 8,
+        cs_shared_mem_size = 32 * 1024,
     ))
 
 add_gpus([
@@ -201,6 +206,7 @@ add_gpus([
         tile_max_w   = 1024, # max_bitfield_val(4, 0, 5)
         tile_max_h   = max_bitfield_val(9, 5, 5),
         num_vsc_pipes = 8,
+        cs_shared_mem_size = 32 * 1024,
     ))
 
 add_gpus([
@@ -218,6 +224,7 @@ add_gpus([
         tile_max_w   = 1024, # max_bitfield_val(7, 0, 5)
         tile_max_h   = max_bitfield_val(16, 9, 5),
         num_vsc_pipes = 16,
+        cs_shared_mem_size = 32 * 1024,
     ))
 
 # a6xx can be divided into distinct sub-generations, where certain device-
@@ -307,6 +314,7 @@ add_gpus([
         tile_align_w = 32,
         tile_align_h = 32,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 0,
             TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -333,6 +341,7 @@ add_gpus([
         tile_align_w = 32,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 0,
             TPL1_DBG_ECO_CNTL = 0x01008000,
@@ -359,6 +368,7 @@ add_gpus([
         tile_align_w = 32,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 1,
             TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -385,6 +395,7 @@ add_gpus([
         tile_align_w = 32,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 1,
             TPL1_DBG_ECO_CNTL = 0x00008000,
@@ -411,6 +422,7 @@ add_gpus([
         tile_align_w = 64,
         tile_align_h = 32,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 3,
             TPL1_DBG_ECO_CNTL = 0x00108000,
@@ -437,6 +449,7 @@ add_gpus([
         tile_align_w = 96,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 2,
             # this seems to be a chicken bit that fixes cubic filtering:
@@ -468,6 +481,7 @@ add_gpus([
         tile_align_w = 32,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 1,
             TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -494,6 +508,7 @@ add_gpus([
         tile_align_w = 96,
         tile_align_h = 16,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 2,
             TPL1_DBG_ECO_CNTL = 0x05008000,
@@ -520,6 +535,7 @@ add_gpus([
         tile_align_w = 64,
         tile_align_h = 32,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict(
             PC_POWER_CNTL = 7,
             TPL1_DBG_ECO_CNTL = 0x01008000,
@@ -548,6 +564,7 @@ add_gpus([
         tile_align_w = 64,
         tile_align_h = 32,
         num_vsc_pipes = 32,
+        cs_shared_mem_size = 32 * 1024,
         magic_regs = dict()
     ))
 
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index ffeb564..272a308 100644
@@ -145,8 +145,6 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
    compiler->is_64bit = fd_dev_64b(dev_id);
    compiler->options = *options;
 
-   /* All known GPU's have 32k local memory (aka shared) */
-   compiler->local_mem_size = 32 * 1024;
    /* TODO see if older GPU's were different here */
    compiler->branchstack_size = 64;
    compiler->wave_granularity = 2;
@@ -156,6 +154,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
 
    const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);
 
+   compiler->local_mem_size = dev_info->cs_shared_mem_size;
+
    if (compiler->gen >= 6) {
       compiler->samgq_workaround = true;
       /* a6xx split the pipeline state into geometry and fragment state, in
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index dad9b97..09ce58e 100644
@@ -1092,7 +1092,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .maxFragmentOutputAttachments = 8,
       .maxFragmentDualSrcAttachments = 1,
       .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
-      .maxComputeSharedMemorySize = 32768,
+      .maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
       .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
       .maxComputeWorkGroupInvocations = 2048,
       .maxComputeWorkGroupSize = { 1024, 1024, 1024 },
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index eb48868..1503330 100644
@@ -823,7 +823,7 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
       RET((uint64_t[]){screen->ram_size});
 
    case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
-      RET((uint64_t[]){32768});
+      RET((uint64_t[]){screen->info->cs_shared_mem_size});
 
    case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
    case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: