Revert "radv,aco: allow unaligned LDS access on GFX9+"

author Rhys Perry <pendingchaos02@gmail.com>

Mon, 1 Mar 2021 10:43:01 +0000 (10:43 +0000)

committer Marge Bot <eric+marge@anholt.net>

Tue, 2 Mar 2021 13:13:59 +0000 (13:13 +0000)
author Rhys Perry <pendingchaos02@gmail.com>
Mon, 1 Mar 2021 10:43:01 +0000 (10:43 +0000)
committer Marge Bot <eric+marge@anholt.net>
Tue, 2 Mar 2021 13:13:59 +0000 (13:13 +0000)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 2fcfdaa..3e49c88 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3466,35 +3466,31 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
     bool large_ds_read = bld.program->chip_class >= GFX7;
     bool usable_read2 = bld.program->chip_class >= GFX7;
  
-   bool aligned2 = align % 2 == 0;
-   bool aligned4 = align % 4 == 0;
-   bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 8 == 0);
-   bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 16 == 0);
-
     bool read2 = false;
     unsigned size = 0;
     aco_opcode op;
-   if (bytes_needed >= 16 && aligned16 && large_ds_read) {
+   //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
+   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
        size = 16;
        op = aco_opcode::ds_read_b128;
-   } else if (bytes_needed >= 16 && aligned8 && const_offset % 8 == 0 && usable_read2) {
+   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
        size = 16;
        read2 = true;
        op = aco_opcode::ds_read2_b64;
-   } else if (bytes_needed >= 12 && aligned16 && large_ds_read) {
+   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
        size = 12;
        op = aco_opcode::ds_read_b96;
-   } else if (bytes_needed >= 8 && aligned8) {
+   } else if (bytes_needed >= 8 && align % 8 == 0) {
        size = 8;
        op = aco_opcode::ds_read_b64;
-   } else if (bytes_needed >= 8 && aligned4 && const_offset % 4 == 0) {
+   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
        size = 8;
        read2 = true;
        op = aco_opcode::ds_read2_b32;
-   } else if (bytes_needed >= 4 && aligned4) {
+   } else if (bytes_needed >= 4 && align % 4 == 0) {
        size = 4;
        op = aco_opcode::ds_read_b32;
-   } else if (bytes_needed >= 2 && aligned2) {
+   } else if (bytes_needed >= 2 && align % 2 == 0) {
        size = 2;
        op = aco_opcode::ds_read_u16;
     } else {
@@ -3858,8 +3854,8 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t
  
        bool aligned2 = offset % 2 == 0 && align % 2 == 0;
        bool aligned4 = offset % 4 == 0 && align % 4 == 0;
-      bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 8 == 0 && align % 8 == 0);
-      bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 16 == 0 && align % 16 == 0);
+      bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+      bool aligned16 = offset % 16 == 0 && align % 16 == 0;
  
        //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
        aco_opcode op = aco_opcode::num_opcodes;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp

index 4e248a9..3525433 100644 (file)
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -98,10 +98,6 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
     program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
     /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
     program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
-   /* GFX10 WGP has a bug making naturally aligned access required. The LLVM
-    * subtarget feature is called "FeatureLdsMisalignedBug".
-    */
-   program->dev.has_unaligned_lds_access = chip_class >= GFX9 && !(chip_class == GFX10 && wgp_mode);
  
     program->dev.vgpr_limit = 256;
     program->dev.physical_vgprs = 256;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index f44dacd..548e71b 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1799,7 +1799,6 @@ struct DeviceInfo {
     uint16_t lds_alloc_granule;
     uint32_t lds_limit; /* in bytes */
     bool has_16bank_lds;
-   bool has_unaligned_lds_access;
     uint16_t physical_sgprs;
     uint16_t physical_vgprs;
     uint16_t vgpr_limit;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c

index 907fa4e..3e8bfbe 100644 (file)
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3053,9 +3053,6 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                         void *data)
  {
-       struct radv_device *device = data;
-       enum chip_class chip = device->physical_device->rad_info.chip_class;
-
         if (num_components > 4)
                 return false;
  
@@ -3084,9 +3081,9 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                 FALLTHROUGH;
         case nir_intrinsic_load_shared:
         case nir_intrinsic_store_shared:
-               if (chip < GFX9 && bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment on GFX6-8 and are split otherwise */
+               if (bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment and are split otherwise */
                         return align % 16 == 0;
-               else if (chip < GFX9 && bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment on GFX6-8 and are split otherwise */
+               else if (bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment and are split otherwise */
                         return align % 8 == 0;
                 else
                         return align % (bit_size == 8 ? 2 : 4) == 0;
@@ -3335,7 +3332,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline,
                                          nir_var_mem_push_const | nir_var_mem_shared |
                                          nir_var_mem_global,
                                 .callback = mem_vectorize_callback,
-                               .cb_data = device,
                                 .robust_modes = 0,
                         };
author	Rhys Perry <pendingchaos02@gmail.com>
	Mon, 1 Mar 2021 10:43:01 +0000 (10:43 +0000)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 2 Mar 2021 13:13:59 +0000 (13:13 +0000)
src/amd/compiler/aco_instruction_selection.cpp		patch \| blob \| history
src/amd/compiler/aco_ir.cpp		patch \| blob \| history
src/amd/compiler/aco_ir.h		patch \| blob \| history
src/amd/vulkan/radv_pipeline.c		patch \| blob \| history