From 182d9b1e6072bec190cf0a52e9d93dbbdbaa850d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 29 Apr 2021 13:33:45 +0200 Subject: [PATCH] aco: Implement NGG culling related intrinsics. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit These are very straightforward as they just copy data from the newly added shader arguments. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 86 ++++++++++++++++++++++ .../compiler/aco_instruction_selection_setup.cpp | 14 +++- 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 43f39d5..85f5ae2 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8949,8 +8949,94 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) bool_to_vector_condition(ctx, shader_query_enabled)); break; } + case nir_intrinsic_load_cull_front_face_enabled_amd: + case nir_intrinsic_load_cull_back_face_enabled_amd: + case nir_intrinsic_load_cull_ccw_amd: + case nir_intrinsic_load_cull_small_primitives_enabled_amd: { + unsigned cmp_bit; + if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd) + cmp_bit = 0; + else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd) + cmp_bit = 1; + else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd) + cmp_bit = 2; + else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd) + cmp_bit = 3; + else + unreachable("unimplemented culling intrinsic"); + + Builder::Result enabled = + bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), + get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit)); + enabled.instr->definitions[0].setNoCSE(true); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bool_to_vector_condition(ctx, 
enabled)); + break; + } case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break; case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; + case nir_intrinsic_load_cull_any_enabled_amd: { + Builder::Result cull_any_enabled = + bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0x00ffffffu)); + cull_any_enabled.instr->definitions[1].setNoCSE(true); + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp())); + break; + } + case nir_intrinsic_load_cull_small_prim_precision_amd: { + /* Exponent is a signed 8-bit int in the top byte of ngg_culling_settings; the arithmetic shift sign-extends it into a signed 32-bit int. */ + Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u)); + /* small_prim_precision = 1.0 * 2^X */ + bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand::c32(0x3f800000u), Operand(exponent)); + break; + } + case nir_intrinsic_load_viewport_x_scale: { + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ngg_viewport_scale[0])); + break; + } + case nir_intrinsic_load_viewport_y_scale: { + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ngg_viewport_scale[1])); + break; + } + case nir_intrinsic_load_viewport_x_offset: { + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ngg_viewport_translate[0])); + break; + } + case nir_intrinsic_load_viewport_y_offset: { + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + get_arg(ctx, ctx->args->ngg_viewport_translate[1])); + break; + } + case nir_intrinsic_overwrite_vs_arguments_amd: { + ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); + ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, 
instr->src[1].ssa); + break; + } + case nir_intrinsic_overwrite_tes_arguments_amd: { + ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); + ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); + ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] = + get_ssa_temp(ctx, instr->src[2].ssa); + ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa); + break; + } + case nir_intrinsic_overwrite_subgroup_num_vertices_and_primitives_amd: { + Temp old_merged_wave_info = get_arg(ctx, ctx->args->ac.merged_wave_info); + Temp num_vertices = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); + Temp num_primitives = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); + Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), num_primitives, + Operand::c32(8u)); + tmp = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), tmp, num_vertices); + ctx->arg_temps[ctx->args->ac.merged_wave_info.arg_index] = + bld.sop2(aco_opcode::s_pack_lh_b32_b16, bld.def(s1), tmp, old_merged_wave_info); + break; + } default: isel_err(&instr->instr, "Unimplemented intrinsic instr"); abort(); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 430f9f6..689c4bb 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -671,7 +671,16 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_has_input_primitive_amd: case nir_intrinsic_load_workgroup_num_input_vertices_amd: case nir_intrinsic_load_workgroup_num_input_primitives_amd: - case nir_intrinsic_load_shader_query_enabled_amd: type = RegType::sgpr; break; + case nir_intrinsic_load_shader_query_enabled_amd: + case nir_intrinsic_load_cull_front_face_enabled_amd: + case nir_intrinsic_load_cull_back_face_enabled_amd: + case nir_intrinsic_load_cull_ccw_amd: + 
case nir_intrinsic_load_cull_small_primitives_enabled_amd: + case nir_intrinsic_load_cull_any_enabled_amd: + case nir_intrinsic_load_viewport_x_scale: + case nir_intrinsic_load_viewport_y_scale: + case nir_intrinsic_load_viewport_x_offset: + case nir_intrinsic_load_viewport_y_offset: type = RegType::sgpr; break; case nir_intrinsic_load_sample_id: case nir_intrinsic_load_sample_mask_in: case nir_intrinsic_load_input: @@ -753,7 +762,8 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_packed_passthrough_primitive_amd: case nir_intrinsic_gds_atomic_add_amd: case nir_intrinsic_load_sbt_amd: - case nir_intrinsic_bvh64_intersect_ray_amd: type = RegType::vgpr; break; + case nir_intrinsic_bvh64_intersect_ray_amd: + case nir_intrinsic_load_cull_small_prim_precision_amd: type = RegType::vgpr; break; case nir_intrinsic_load_shared: /* When the result of these loads is only used by cross-lane instructions, * it is beneficial to use a VGPR destination. This is because this allows -- 2.7.4