From 1165758b8bc3b8621257189c1a786d157906000b Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Mon, 20 Mar 2023 12:33:26 +0800 Subject: [PATCH] ac/llvm,radeonsi: remove abi->load_inputs implementation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit No nir_load_input in VS now. Reviewed-by: Marek Olšák Signed-off-by: Qiang Yu Part-of: --- src/amd/llvm/ac_nir_to_llvm.c | 3 - src/amd/llvm/ac_shader_abi.h | 5 - src/gallium/drivers/radeonsi/si_shader_internal.h | 1 - src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 260 ---------------------- 4 files changed, 269 deletions(-) diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index bf51487..e7db715 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3342,9 +3342,6 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * /* No indirect indexing is allowed after this point. */ assert(!indir_index); - if (ctx->stage == MESA_SHADER_VERTEX && !is_output) - return ctx->abi->load_inputs(ctx->abi, base, component, count, 0, component_type); - /* Other non-fragment cases have outputs in temporaries. */ if (is_output && (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { assert(is_output); diff --git a/src/amd/llvm/ac_shader_abi.h b/src/amd/llvm/ac_shader_abi.h index ee56be0..5a9494c 100644 --- a/src/amd/llvm/ac_shader_abi.h +++ b/src/amd/llvm/ac_shader_abi.h @@ -66,11 +66,6 @@ struct ac_shader_abi { void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx, LLVMValueRef *addrs); - LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi, - unsigned driver_location, unsigned component, - unsigned num_components, unsigned vertex_index, - LLVMTypeRef type); - LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, LLVMTypeRef type, LLVMValueRef vertex_index, LLVMValueRef param_index, unsigned driver_location, unsigned component, diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 00c0a93..c03dad7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -239,6 +239,5 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key, bool separate_prolog); -void si_llvm_init_vs_callbacks(struct si_shader_context *ctx); #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index bb5fb2d..96e297b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -28,17 +28,6 @@ #include "util/u_memory.h" #include "ac_nir.h" -static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) -{ - assert(index <= 1); - - if (index == 1) - return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""), - ctx->ac.i32, ""); -} - static LLVMValueRef get_vertex_index(struct si_shader_context *ctx, struct si_vs_prolog_bits *key, unsigned input_index, LLVMValueRef instance_divisor_constbuf, @@ -86,250 +75,6 @@ static LLVMValueRef get_vertex_index(struct si_shader_context *ctx, return index; } -static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4]) -{ - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned vs_blit_property = info->base.vs.blit_sgprs_amd; - - if (vs_blit_property) { - LLVMValueRef vertex_id = ctx->abi.vertex_id; - LLVMValueRef sel_x1 = - LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, ""); - /* Use LLVMIntNE, because we have 3 vertices and only - * the middle one should use y2. - */ - LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, ""); - - unsigned param_vs_blit_inputs = ctx->args->vs_blit_inputs.arg_index; - if (input_index == 0) { - /* Position: */ - LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs); - LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 1); - - LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); - LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); - LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); - LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); - - LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); - LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); - - out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); - out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); - out[2] = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 2); - out[3] = ctx->ac.f32_1; - return; - } - - /* Color or texture coordinates: */ - assert(input_index == 1); - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - for (int i = 0; i < 4; i++) { - out[i] = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 3 + i); - } - } else { - assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); - LLVMValueRef x1 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 3); - LLVMValueRef y1 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 4); - LLVMValueRef x2 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 5); - LLVMValueRef y2 = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 6); - - out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); - out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); - out[2] = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 7); - out[3] = LLVMGetParam(ctx->main_fn.value, param_vs_blit_inputs + 8); - } - return; - } - - /* Set can_speculate=false to help keep all loads grouped together - * for better latency hiding. If it was true, LLVM could move the loads forward - * and accidentally double memory latency by doing: - * - * buffer_load_dword_xyzw - * s_waitcnt vmcnt(0) - * buffer_load_dword_xyzw - * s_waitcnt vmcnt(0) - * - * ... which is what we must prevent at all cost. - */ - const bool can_speculate = false; - unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32; - LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; - LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32; - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs; - union si_vs_fix_fetch fix_fetch; - LLVMValueRef vb_desc; - LLVMValueRef vertex_index = NULL; - LLVMValueRef tmp; - - if (input_index < num_vbos_in_user_sgprs) { - vb_desc = ac_get_arg(&ctx->ac, ctx->args->vb_descriptors[input_index]); - } else { - unsigned index = input_index - num_vbos_in_user_sgprs; - vb_desc = ac_build_load_to_sgpr( - &ctx->ac, ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->ac.vertex_buffers), - LLVMConstInt(ctx->ac.i32, index, 0)); - } - - if (ctx->abi.vertex_id_replaced) { - /* Only ngg culling will replace vertex_id, and ngg culling is an optimization key - * field, so the shader must be monolithic. - */ - assert(ctx->shader->is_monolithic); - assert(ctx->abi.instance_id_replaced); - - vertex_index = get_vertex_index(ctx, &ctx->shader->key.ge.part.vs.prolog, - input_index, ctx->instance_divisor_constbuf, - ctx->args->ac.start_instance.arg_index, - ctx->args->ac.base_vertex.arg_index); - } else { - vertex_index = LLVMGetParam(ctx->main_fn.value, - ctx->args->vertex_index0.arg_index + input_index); - } - - /* Use the open-coded implementation for all loads of doubles and - * of dword-sized data that needs fixups. We need to insert conversion - * code anyway, and the amd/common code does it for us. - */ - bool opencode = ctx->shader->key.ge.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.ge.mono.vs_fix_fetch[input_index].bits; - if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || - (fix_fetch.u.log_size == 2)) { - tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size, - fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format, - fix_fetch.u.reverse, !opencode, vb_desc, vertex_index, - ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate); - for (unsigned i = 0; i < 4; ++i) - out[i] = - LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); - - if (bit_size == 16) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) { - for (unsigned i = 0; i < 4; i++) - out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, ""); - } else { - for (unsigned i = 0; i < 4; i++) { - out[i] = ac_to_float(&ctx->ac, out[i]); - out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, ""); - } - } - } - return; - } - - unsigned required_channels = util_last_bit(info->input[input_index].usage_mask); - if (required_channels == 0) { - for (unsigned i = 0; i < 4; ++i) - out[i] = LLVMGetUndef(ctx->ac.f32); - return; - } - - /* Do multiple loads for special formats. */ - LLVMValueRef fetches[4]; - unsigned num_fetches; - unsigned fetch_stride; - unsigned channels_per_fetch; - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { - num_fetches = MIN2(required_channels, 3); - fetch_stride = 1 << fix_fetch.u.log_size; - channels_per_fetch = 1; - } else { - num_fetches = 1; - fetch_stride = 0; - channels_per_fetch = required_channels; - } - - for (unsigned i = 0; i < num_fetches; ++i) { - LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, - channels_per_fetch, 0, can_speculate, - bit_size == 16, false); - } - - if (num_fetches == 1 && channels_per_fetch > 1) { - LLVMValueRef fetch = fetches[0]; - for (unsigned i = 0; i < channels_per_fetch; ++i) { - tmp = LLVMConstInt(ctx->ac.i32, i, false); - fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, ""); - } - num_fetches = channels_per_fetch; - channels_per_fetch = 1; - } - - for (unsigned i = num_fetches; i < 4; ++i) - fetches[i] = LLVMGetUndef(float_type); - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) - fetches[3] = LLVMConstInt(int_type, 1, 0); - else - fetches[3] = LLVMConstReal(float_type, 1); - } else if (fix_fetch.u.log_size == 3 && - (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || - fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && - required_channels == 4) { - - /* For 2_10_10_10, the hardware returns an unsigned value; - * convert it to a signed one. - */ - LLVMValueRef tmp = fetches[3]; - LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0); - - /* First, recover the sign-extended signed integer value. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) - tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, ""); - else - tmp = ac_to_integer(&ctx->ac, tmp); - - /* For the integer-like cases, do a natural sign extension. - * - * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 - * and happen to contain 0, 1, 2, 3 as the two LSBs of the - * exponent. - */ - tmp = LLVMBuildShl( - ctx->ac.builder, tmp, - fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); - - /* Convert back to the right type. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { - LLVMValueRef clamp; - LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0); - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, ""); - clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, ""); - } - - fetches[3] = tmp; - } - - for (unsigned i = 0; i < 4; ++i) - out[i] = ac_to_float(&ctx->ac, fetches[i]); -} - -static LLVMValueRef si_load_vs_input(struct ac_shader_abi *abi, unsigned driver_location, - unsigned component, unsigned num_components, - unsigned vertex_index, LLVMTypeRef type) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef values[4]; - - load_input_vs(ctx, driver_location, values); - - for (unsigned i = 0; i < 4; i++) - values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, ""); - - return ac_build_varying_gather_values(&ctx->ac, values, num_components, component); -} - /** * Build the vertex shader prolog function. * @@ -463,8 +208,3 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part si_llvm_build_ret(ctx, ret); } - -void si_llvm_init_vs_callbacks(struct si_shader_context *ctx) -{ - ctx->abi.load_inputs = si_load_vs_input; -} -- 2.7.4