radv,aco: implement 64-bit vertex inputs

author Rhys Perry <pendingchaos02@gmail.com>

Tue, 2 Aug 2022 14:10:50 +0000 (15:10 +0100)

committer Marge Bot <emma+marge@anholt.net>

Tue, 30 Aug 2022 19:02:11 +0000 (19:02 +0000)
author Rhys Perry <pendingchaos02@gmail.com>
Tue, 2 Aug 2022 14:10:50 +0000 (15:10 +0100)
committer Marge Bot <emma+marge@anholt.net>
Tue, 30 Aug 2022 19:02:11 +0000 (19:02 +0000)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 135e63a..0d8ee6e 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5450,17 +5450,23 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
                    "Unimplemented non-zero nir_intrinsic_load_input offset");
  
        unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
-      unsigned component = nir_intrinsic_component(instr);
        unsigned bitsize = instr->dest.ssa.bit_size;
+      unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0);
        unsigned num_components = instr->dest.ssa.num_components;
  
-      Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
-
        aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
           aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
        std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
        for (unsigned i = 0; i < num_components; i++) {
-         elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
+         if (bitsize == 64) {
+            Temp input = get_arg(ctx, ctx->args->vs_inputs[location + (component + i) / 2]);
+            elems[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
+                                  emit_extract_vector(ctx, input, (component + i) * 2 % 4, v1),
+                                  emit_extract_vector(ctx, input, (component + i) * 2 % 4 + 1, v1));
+         } else {
+            Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
+            elems[i] = emit_extract_vector(ctx, input, component + i, v1);
+         }
           if (bitsize == 16) {
              if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float)
                 elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]);
@@ -5483,8 +5489,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
           convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
  
        unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
-      unsigned component = nir_intrinsic_component(instr);
        unsigned bitsize = instr->dest.ssa.bit_size;
+      unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0);
        unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
        uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
        uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
@@ -5639,8 +5645,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
              channels[channel_start] = fetch_dst;
           } else {
              for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
-               channels[channel_start + i] =
-                  emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
+               channels[channel_start + i] = emit_extract_vector(
+                  ctx, fetch_dst, i, RegClass::get(RegType::vgpr, bitsize / 8u));
           }
  
           channel_start += fetch_component;
@@ -5664,6 +5670,12 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
  
                 num_temp++;
                 elems[i] = channel;
+            } else if (bitsize == 64) {
+               /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
+                * For 64-bit data types, no default attribute values are provided. Input variables
+                * must not use more components than provided by the attribute.
+                */
+               vec->operands[i] = Operand(v2);
              } else if (is_float && idx == 3) {
                 vec->operands[i] = bitsize == 16 ? Operand::c16(0x3c00u) : Operand::c32(0x3f800000u);
              } else if (!is_float && idx == 3) {
@@ -11477,7 +11489,7 @@ add_startpgm(struct isel_context* ctx)
     }
  
     if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) {
-      unsigned num_attributes = util_last_bit(ctx->program->info.vs.vb_desc_usage_mask);
+      unsigned num_attributes = util_last_bit(ctx->program->info.vs.input_slot_usage_mask);
        for (unsigned i = 0; i < num_attributes; i++) {
           Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
  
@@ -12262,7 +12274,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
  
        bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
  
-      for (unsigned i = 0; i < num_descs; i++, loc++) {
+      for (unsigned i = 0; i < num_descs;) {
           PhysReg dest(attributes_start.reg() + loc * 4u);
  
           /* calculate index */
@@ -12307,6 +12319,10 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
                    bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
                              Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
                              false, true);
+               else if (vtx_info->chan_byte_size == 8)
+                  bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
+                            Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
+                            fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
                 else
                    bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
                              Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
@@ -12316,13 +12332,23 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
                 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
                    ? 1u
                    : 0x3f800000u;
-            for (unsigned j = vtx_info->num_channels; j < 4; j++) {
+            /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
+             * For 64-bit data types, no default attribute values are provided. Input variables must
+             * not use more components than provided by the attribute.
+             */
+            for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
                 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
                          Operand::c32(j == 3 ? one : 0u));
              }
+
+            unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
+            loc += slots;
+            i += slots;
           } else {
              bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
                        Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
+            loc++;
+            i++;
           }
        }
     }
diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h

index 57182cc..973624a 100644 (file)
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h
@@ -115,6 +115,7 @@ struct aco_shader_info {
        uint64_t tcs_temp_only_input_mask;
        bool use_per_attribute_vb_descs;
        uint32_t vb_desc_usage_mask;
+      uint32_t input_slot_usage_mask;
        bool has_prolog;
        bool dynamic_inputs;
     } vs;
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c

index 3b7d971..08e3e89 100644 (file)
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -3405,8 +3405,11 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr *
     case 32:
        break;
     case 64:
-      unreachable("64-bit IO should have been lowered");
-      return NULL;
+      if (ctx->stage != MESA_SHADER_VERTEX || is_output) {
+         unreachable("64-bit IO should have been lowered");
+         return NULL;
+      }
+      break;
     default:
        unreachable("unhandled load type");
        return NULL;
diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h

index a9fcd4b..909be9a 100644 (file)
--- a/src/amd/vulkan/radv_aco_shader_info.h
+++ b/src/amd/vulkan/radv_aco_shader_info.h
@@ -82,6 +82,7 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info,
     ASSIGN_FIELD(vs.tcs_temp_only_input_mask);
     ASSIGN_FIELD(vs.use_per_attribute_vb_descs);
     ASSIGN_FIELD(vs.vb_desc_usage_mask);
+   ASSIGN_FIELD(vs.input_slot_usage_mask);
     ASSIGN_FIELD(vs.has_prolog);
     ASSIGN_FIELD(vs.dynamic_inputs);
     ASSIGN_FIELD_CP(gs.output_usage_mask);
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c

index a79c25e..a5fd3f3 100644 (file)
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -355,10 +355,10 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign
  
  static LLVMValueRef
  radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
-                                unsigned num_channels, bool is_float)
+                                unsigned num_channels, bool is_float, bool is_64bit)
  {
-   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
-   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
+   LLVMValueRef zero = is_64bit ? ctx->ac.i64_0 : (is_float ? ctx->ac.f32_0 : ctx->ac.i32_0);
+   LLVMValueRef one = is_64bit ? ctx->ac.i64_0 : (is_float ? ctx->ac.f32_1 : ctx->ac.i32_1);
     LLVMValueRef chan[4];
  
     if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
@@ -446,8 +446,10 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp
      * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and
      * VBO offset is 2 for R16G16B16A16_SNORM).
      */
+   unsigned chan_dwords = vtx_info->chan_byte_size == 8 ? 2 : 1;
     if (((ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10) && vtx_info->chan_byte_size) ||
-       !(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1))) {
+       !(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)) ||
+       vtx_info->element_size > 16) {
        unsigned chan_format = vtx_info->hw_format[0] & 0xf;
        LLVMValueRef values[4];
  
@@ -466,7 +468,7 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp
  
           values[chan] = ac_build_struct_tbuffer_load(
              &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
-            ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
+            ctx->ac.i32_0, chan_dwords, chan_format, num_format, 0, true);
        }
  
        input = ac_build_gather_values(&ctx->ac, values, num_channels);
@@ -482,10 +484,15 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp
  
        input = ac_build_struct_tbuffer_load(
           &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
-         ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
+         ctx->ac.i32_0, num_channels * chan_dwords, data_format, num_format, 0, true);
     }
  
-   input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);
+   if (vtx_info->chan_byte_size == 8)
+      input =
+         LLVMBuildBitCast(ctx->ac.builder, input, LLVMVectorType(ctx->ac.i64, num_channels), "");
+
+   input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float,
+                                           vtx_info->chan_byte_size == 8);
  
     for (unsigned chan = 0; chan < 4; chan++) {
        LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c

index c2a1715..1653bae 100644 (file)
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1143,8 +1143,14 @@ radv_lower_io(struct radv_device *device, nir_shader *nir)
        nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT);
     }
  
-   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4,
-            nir_lower_io_lower_64bit_to_32);
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
+      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
+               nir_lower_io_lower_64bit_to_32);
+   } else {
+      NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4,
+               nir_lower_io_lower_64bit_to_32);
+   }
  
     /* This pass needs actual constants */
     NIR_PASS(_, nir, nir_opt_constant_folding);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h

index 137048e..d5c083e 100644 (file)
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -260,6 +260,7 @@ struct radv_shader_info {
        bool needs_base_instance;
        bool use_per_attribute_vb_descs;
        uint32_t vb_desc_usage_mask;
+      uint32_t input_slot_usage_mask;
        bool has_prolog;
        bool dynamic_inputs;
     } vs;
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c

index b04cca3..4dfdedc 100644 (file)
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -340,7 +340,7 @@ declare_vs_input_vgprs(enum amd_gfx_level gfx_level, const struct radv_shader_in
  
     if (info->vs.dynamic_inputs) {
        assert(info->vs.use_per_attribute_vb_descs);
-      unsigned num_attributes = util_last_bit(info->vs.vb_desc_usage_mask);
+      unsigned num_attributes = util_last_bit(info->vs.input_slot_usage_mask);
        for (unsigned i = 0; i < num_attributes; i++)
           ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]);
        /* Ensure the main shader doesn't use less vgprs than the prolog. The prolog requires one
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c

index 537d2ce..4b3535a 100644 (file)
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -42,8 +42,11 @@ gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_inst
        unsigned idx = nir_intrinsic_io_semantics(instr).location;
        unsigned component = nir_intrinsic_component(instr);
        unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
+      mask = (instr->dest.ssa.bit_size == 64 ? util_widen_mask(mask, 2) : mask) << component;
  
-      info->vs.input_usage_mask[idx] |= mask << component;
+      info->vs.input_usage_mask[idx] |= mask & 0xf;
+      if (mask >> 4)
+         info->vs.input_usage_mask[idx + 1] |= mask >> 4;
        break;
     }
     default:
@@ -313,6 +316,40 @@ assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask,
  }
  
  static void
+gather_info_input_decl_vs(const nir_shader *nir, unsigned location, const struct glsl_type *type,
+                          const struct radv_pipeline_key *key, struct radv_shader_info *info)
+{ /* Recursively walks one VS input declaration and records what it uses in info->vs. */
+   if (glsl_type_is_scalar(type) || glsl_type_is_vector(type)) { /* leaf: a single attribute */
+      if (key->vs.instance_rate_inputs & BITFIELD_BIT(location)) {
+         info->vs.needs_instance_id = true;
+         info->vs.needs_base_instance = true;
+      }
+      /* Descriptor bit: the attribute location with per-attribute descs, else its binding. */
+      if (info->vs.use_per_attribute_vb_descs)
+         info->vs.vb_desc_usage_mask |= BITFIELD_BIT(location);
+      else
+         info->vs.vb_desc_usage_mask |= BITFIELD_BIT(key->vs.vertex_attribute_bindings[location]);
+      /* Mark every slot this attribute occupies (presumably >1 for wide 64-bit vectors). */
+      info->vs.input_slot_usage_mask |=
+         BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false));
+   } else if (glsl_type_is_matrix(type) || glsl_type_is_array(type)) {
+      const struct glsl_type *elem = glsl_get_array_element(type);
+      unsigned stride = glsl_count_attribute_slots(elem, false);
+      /* Element i starts at location + i * (slot count of one element). */
+      for (unsigned i = 0; i < glsl_get_length(type); ++i)
+         gather_info_input_decl_vs(nir, location + i * stride, elem, key, info);
+   } else {
+      assert(glsl_type_is_struct_or_ifc(type));
+      /* Fields are visited in order; location advances by each field's slot count. */
+      for (unsigned i = 0; i < glsl_get_length(type); i++) {
+         const struct glsl_type *field = glsl_get_struct_field(type, i);
+         gather_info_input_decl_vs(nir, location, field, key, info);
+         location += glsl_count_attribute_slots(field, false);
+      }
+   }
+}
+
+static void
  gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
                        const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
  {
@@ -331,25 +368,9 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
     info->vs.needs_base_instance |= info->vs.has_prolog;
     info->vs.needs_draw_id |= info->vs.has_prolog;
  
-   nir_foreach_shader_in_variable(var, nir) {
-      unsigned attrib_count = glsl_count_attribute_slots(var->type, true);
-
-      for (unsigned i = 0; i < attrib_count; ++i) {
-         unsigned attrib_index = var->data.location + i - VERT_ATTRIB_GENERIC0;
-
-         if (pipeline_key->vs.instance_rate_inputs & (1u << attrib_index)) {
-            info->vs.needs_instance_id = true;
-            info->vs.needs_base_instance = true;
-         }
-
-         if (info->vs.use_per_attribute_vb_descs) {
-            info->vs.vb_desc_usage_mask |= 1u << attrib_index;
-         } else {
-            info->vs.vb_desc_usage_mask |=
-               1u << pipeline_key->vs.vertex_attribute_bindings[attrib_index];
-         }
-      }
-   }
+   nir_foreach_shader_in_variable(var, nir)
+      gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type,
+                                pipeline_key, info);
  }
  
  static void
author	Rhys Perry <pendingchaos02@gmail.com>
	Tue, 2 Aug 2022 14:10:50 +0000 (15:10 +0100)
committer	Marge Bot <emma+marge@anholt.net>
	Tue, 30 Aug 2022 19:02:11 +0000 (19:02 +0000)
src/amd/compiler/aco_instruction_selection.cpp		patch \| blob \| history
src/amd/compiler/aco_shader_info.h		patch \| blob \| history
src/amd/llvm/ac_nir_to_llvm.c		patch \| blob \| history
src/amd/vulkan/radv_aco_shader_info.h		patch \| blob \| history
src/amd/vulkan/radv_nir_to_llvm.c		patch \| blob \| history
src/amd/vulkan/radv_shader.c		patch \| blob \| history
src/amd/vulkan/radv_shader.h		patch \| blob \| history
src/amd/vulkan/radv_shader_args.c		patch \| blob \| history
src/amd/vulkan/radv_shader_info.c		patch \| blob \| history