From 5ad52ac90630e344650cf9a1b48820432af22680 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 24 Aug 2020 19:43:26 +0100 Subject: [PATCH] aco: create helpers to emit vop3p instructions Also make get_alu_src() capable to return unswizzled multi-component SGPR sources. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_builder_h.py | 1 + src/amd/compiler/aco_instruction_selection.cpp | 68 +++++++++++++++++++++----- src/amd/compiler/aco_opcodes.py | 3 ++ 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index a08eafd..af0eacd 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -535,6 +535,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("vop2_sdwa", [Format.VOP2, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2, 3])), ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])), ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]), + ("vop3p", [Format.VOP3P], 'VOP3P_instruction', [(1, 2), (1, 3)]), ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]), ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]), ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])), diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9c6bbe0..0dad702 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -657,21 +657,20 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) { - if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1) + if (src.src.ssa->num_components == 1 && size == 1) return get_ssa_temp(ctx, src.src.ssa); - if (src.src.ssa->num_components == size) { - bool identity_swizzle = true; - for (unsigned i = 0; identity_swizzle && i < size; i++) { - if (src.swizzle[i] != i) - identity_swizzle = false; - } - if (identity_swizzle) - return get_ssa_temp(ctx, src.src.ssa); - } - Temp vec = get_ssa_temp(ctx, src.src.ssa); unsigned elem_size = vec.bytes() / src.src.ssa->num_components; + bool identity_swizzle = true; + + for (unsigned i = 0; identity_swizzle && i < size; i++) { + if (src.swizzle[i] != i) + identity_swizzle = false; + } + if (identity_swizzle) + return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size)); + assert(elem_size > 0); assert(vec.bytes() % elem_size == 0); @@ -701,6 +700,33 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) } } +Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src) +{ + /* returns v2b or v1 for vop3p usage. + * The source expects exactly 2 16bit components + * which are within the same dword + */ + assert(src.src.ssa->bit_size == 16); + assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1); + + Temp tmp = get_ssa_temp(ctx, src.src.ssa); + if (tmp.size() == 1) + return tmp; + + /* the size is larger than 1 dword: check the swizzle */ + unsigned dword = src.swizzle[0] >> 1; + + /* extract a full dword if possible */ + if (tmp.bytes() >= (dword + 1) * 4) { + return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1)); + } else { + /* This must be a swizzled access to %a.zz where %a is v6b */ + assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0); + assert(tmp.regClass() == v6b && dword == 1); + return emit_extract_vector(ctx, tmp, dword * 2, v2b); + } +} + uint32_t get_alu_src_ub(isel_context *ctx, nir_alu_instr *instr, int src_idx) { nir_ssa_scalar scalar = nir_ssa_scalar{instr->src[src_idx].src.ssa, @@ -848,6 +874,26 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode } } +Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr, + aco_opcode op, Temp dst, bool swap_srcs=false) +{ + Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); + Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); + if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) + src1 = as_vgpr(ctx, src1); + assert(instr->dest.dest.ssa.num_components == 2); + + /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ + unsigned opsel_lo = (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); + unsigned opsel_hi = (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); + + Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi); + emit_split_vector(ctx, dst, 2); + return res; +} + void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 82b47e8..2a8bc8c 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -134,6 +134,9 @@ class Format(Enum): ('uint8_t', 'row_mask', '0xF'), ('uint8_t', 'bank_mask', '0xF'), ('bool', 'bound_ctrl', 'true')] + elif self == Format.VOP3P: + return [('uint8_t', 'opsel_lo', None), + ('uint8_t', 'opsel_hi', None)] elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: return [('uint16_t', 'offset', 0), ('memory_sync_info', 'sync', 'memory_sync_info()'), -- 2.7.4