From 6320e37d4be16407cd237c2049a46360405599d5 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 26 Sep 2019 10:32:00 -0700 Subject: [PATCH] nir: add amul instruction Used for address/offset calculation (ie. array derefs), where we can potentially use less than 32b for the multiply of array idx by element size. For backends that support `imul24`, this gives a lowering pass an easy way to find multiplies that potentially can be converted to `imul24`. Signed-off-by: Rob Clark Reviewed-by: Kristian H. Kristensen Reviewed-by: Eduardo Lima Mitev --- src/compiler/glsl/gl_nir_lower_buffers.c | 2 +- src/compiler/nir/nir_builder.h | 16 +++++++++++++++- src/compiler/nir/nir_deref.c | 2 +- src/compiler/nir/nir_lower_io.c | 6 +++--- src/compiler/nir/nir_lower_io_to_vector.c | 2 +- src/compiler/nir/nir_opcodes.py | 12 ++++++++++++ src/compiler/nir/nir_opt_algebraic.py | 5 +++++ 7 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/compiler/glsl/gl_nir_lower_buffers.c b/src/compiler/glsl/gl_nir_lower_buffers.c index c7b5858..59bb582 100644 --- a/src/compiler/glsl/gl_nir_lower_buffers.c +++ b/src/compiler/glsl/gl_nir_lower_buffers.c @@ -59,7 +59,7 @@ get_block_array_index(nir_builder *b, nir_deref_instr *deref, } else { nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); arr_index = nir_umin(b, arr_index, nir_imm_int(b, arr_size - 1)); - nir_ssa_def *arr_offset = nir_imul_imm(b, arr_index, array_elements); + nir_ssa_def *arr_offset = nir_amul_imm(b, arr_index, array_elements); if (nonconst_index) nonconst_index = nir_iadd(b, nonconst_index, arr_offset); else diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index cad0e13..5714ea0 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -646,7 +646,7 @@ nir_iadd_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) } static inline nir_ssa_def * -nir_imul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +_nir_mul_imm(nir_builder *build, nir_ssa_def 
*x, uint64_t y, bool amul) { assert(x->bit_size <= 64); if (x->bit_size < 64) @@ -658,12 +658,26 @@ nir_imul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) return x; } else if (util_is_power_of_two_or_zero64(y)) { return nir_ishl(build, x, nir_imm_int(build, ffsll(y) - 1)); + } else if (amul) { + return nir_amul(build, x, nir_imm_intN_t(build, y, x->bit_size)); } else { return nir_imul(build, x, nir_imm_intN_t(build, y, x->bit_size)); } } static inline nir_ssa_def * +nir_imul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +{ + return _nir_mul_imm(build, x, y, false); +} + +static inline nir_ssa_def * +nir_amul_imm(nir_builder *build, nir_ssa_def *x, uint64_t y) +{ + return _nir_mul_imm(build, x, y, true); +} + +static inline nir_ssa_def * nir_fadd_imm(nir_builder *build, nir_ssa_def *x, double y) { return nir_fadd(build, x, nir_imm_floatN_t(build, y, x->bit_size)); diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c index f7a07c6..7ec4601 100644 --- a/src/compiler/nir/nir_deref.c +++ b/src/compiler/nir/nir_deref.c @@ -297,7 +297,7 @@ nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref, if ((*p)->deref_type == nir_deref_type_array) { nir_ssa_def *index = nir_ssa_for_src(b, (*p)->arr.index, 1); int stride = type_get_array_stride((*p)->type, size_align); - offset = nir_iadd(b, offset, nir_imul_imm(b, index, stride)); + offset = nir_iadd(b, offset, nir_amul_imm(b, index, stride)); } else if ((*p)->deref_type == nir_deref_type_struct) { /* p starts at path[1], so this is safe */ nir_deref_instr *parent = *(p - 1); diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 8e20f5e..6f257d9 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -206,7 +206,7 @@ get_io_offset(nir_builder *b, nir_deref_instr *deref, unsigned size = type_size((*p)->type, bts); nir_ssa_def *mul = - nir_imul_imm(b, nir_ssa_for_src(b, (*p)->arr.index, 1), size); + nir_amul_imm(b, nir_ssa_for_src(b, 
(*p)->arr.index, 1), size); offset = nir_iadd(b, offset, mul); } else if ((*p)->deref_type == nir_deref_type_struct) { @@ -1094,7 +1094,7 @@ nir_explicit_io_address_from_deref(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); index = nir_i2i(b, index, base_addr->bit_size); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_ptr_as_array: { @@ -1102,7 +1102,7 @@ nir_explicit_io_address_from_deref(nir_builder *b, nir_deref_instr *deref, index = nir_i2i(b, index, base_addr->bit_size); unsigned stride = nir_deref_instr_ptr_as_array_stride(deref); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_array_wildcard: diff --git a/src/compiler/nir/nir_lower_io_to_vector.c b/src/compiler/nir/nir_lower_io_to_vector.c index 84bd941..0ef4021 100644 --- a/src/compiler/nir/nir_lower_io_to_vector.c +++ b/src/compiler/nir/nir_lower_io_to_vector.c @@ -319,7 +319,7 @@ build_array_index(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *base, deref->dest.ssa.bit_size); return nir_iadd( b, build_array_index(b, nir_deref_instr_parent(deref), base, vs_in), - nir_imul_imm(b, index, glsl_count_attribute_slots(deref->type, vs_in))); + nir_amul_imm(b, index, glsl_count_attribute_slots(deref->type, vs_in))); } default: unreachable("Invalid deref instruction type"); diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 4bb21ea..e74e07b 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -1047,6 +1047,18 @@ dst.z = src2.x; dst.w = src3.x; """) +# An integer multiply instruction for address calculation. This is +# similar to imul, except that the results are undefined in case of +# overflow. Overflow is defined according to the size of the variable +# being dereferenced. 
+# +# This relaxed definition, compared to imul, allows an optimization +# pass to propagate bounds (i.e., from a load/store intrinsic) to the +# sources, such that lower precision integer multiplies can be used. +# This is useful on hw that has 24b or perhaps 16b integer multiply +# instructions. +binop("amul", tint, _2src_commutative + associative, "src0 * src1") + # ir3-specific instruction that maps directly to mul-add shift high mix, # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer # multiplication (imul) on Freedreno backend.. diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 2e4d792..7bac217 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -1112,6 +1112,11 @@ optimizations.extend([ (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'), + + # Address/offset calculations: + # for now, unconditionally convert amul to imul, this will + # change in the following patch + (('amul', a, b), ('imul', a, b)), ]) # bit_size dependent lowerings -- 2.7.4