From: Rhys Perry
Date: Tue, 14 Sep 2021 17:02:01 +0000 (+0100)
Subject: nir/algebraic: optimize expressions using fmulz/ffmaz
X-Git-Tag: upstream/22.3.5~13710
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=495debebad2a86b12c27da173a545ca44b0f5cff;p=platform%2Fupstream%2Fmesa.git

nir/algebraic: optimize expressions using fmulz/ffmaz

Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Part-of:
---

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 075fc1b..8c6cd77 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -146,10 +146,15 @@ optimizations = [
    (('usadd_4x8_vc4', a, 0), a),
    (('usadd_4x8_vc4', a, ~0), ~0),
    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
+   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
    (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
    (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
    (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
    (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
+   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
+   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
+   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
+   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
    (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
    (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
@@ -164,10 +169,18 @@ optimizations = [
    # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
    (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
    (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
+   (('fmulz', a, 0.0), 0.0),
+   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
+   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
+   (('fmulz', a, a), ('fmul', a, a)),
+   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
+   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
+   (('ffmaz', a, a, b), ('ffma', a, a, b)),
    (('imul', a, 0), 0),
    (('umul_unorm_4x8_vc4', a, 0), 0),
    (('umul_unorm_4x8_vc4', a, ~0), a),
    (('~fmul', a, 1.0), a),
+   (('~fmulz', a, 1.0), a),
    # The only effect a*1.0 can have is flushing denormals. If it's only used by
    # a floating point instruction, they should flush any input denormals and
    # this multiplication isn't needed.
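All of the fmulz/ffmaz rules above follow from one property: fmulz is the legacy (DX9-style) multiply, where a zero operand forces a zero result even when the other operand is ±Inf or NaN, and which otherwise behaves like an ordinary IEEE multiply. A rough Python model of the semantics these rules assume (illustrative only, not part of the patch; sign-of-zero details are simplified):

import math

def fmulz(a, b):
    # Legacy multiply: any zero operand yields zero, so 0 * Inf == 0 and
    # 0 * NaN == 0 instead of NaN; otherwise an ordinary IEEE multiply.
    if a == 0.0 or b == 0.0:
        return 0.0
    return a * b

assert fmulz(0.0, math.inf) == 0.0  # plain fmul would produce NaN here
assert fmulz(0.0, math.nan) == 0.0  # likewise
assert fmulz(3.0, 4.0) == 12.0      # finite, non-zero: identical to fmul

This is why (('fmulz', a, 0.0), 0.0) needs no condition at all, and why proving one operand finite and non-zero (the is_finite_not_zero helper added at the end of this patch) lets fmulz decay to fmul: the two can then differ only in the sign of a zero result, which the '!'+signed_zero_inf_nan_preserve_32 condition accounts for.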
@@ -184,12 +197,17 @@ optimizations = [
    (('~ffma', 0.0, a, b), b),
    (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
    (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
+   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
    (('~ffma', a, b, 0.0), ('fmul', a, b)),
    (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
    (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
+   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('ffma', 1.0, a, b), ('fadd', a, b)),
+   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
+   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
    (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
+   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
    (('~flrp', a, b, 0.0), a),
    (('~flrp', a, b, 1.0), b),
    (('~flrp', a, a, b), a),
@@ -769,6 +787,7 @@ optimizations.extend([
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
    (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
+   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),
    (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
    (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
    (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
@@ -1228,6 +1247,7 @@ optimizations.extend([
    (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
    (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
    (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
+   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
    # Division and reciprocal
    (('~fdiv', 1.0, a), ('frcp', a)),
    (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
@@ -1546,29 +1566,42 @@ optimizations.extend([
    # Propagate negation up multiplication chains
    (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+   (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32),
    (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
+   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
    (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),

    # Propagate constants up multiplication chains
    (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
+   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
+   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
    (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
    (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
+   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
+   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),

    # Prefer moving out a multiplication for more MAD/FMA-friendly code
    (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
    (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
    (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
+   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
    (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),

    # Reassociate constants in add/mul chains so they can be folded together.
    # For now, we mostly only handle cases where the constants are separated by
    # a single non-constant. We could do better eventually.
    (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
+   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
+   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
    (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
+   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
+   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
    (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
    (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
    (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
    (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))),
    (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
+   (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))),
+   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
    (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
    (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
    (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),
@@ -1593,6 +1626,8 @@ optimizations.extend([

    (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
    (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
+   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
+   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
    (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),

    (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
@@ -2297,7 +2332,7 @@ for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
 #   mix(1, (a-1)+1, condition)
 #
 # Other optimizations will rearrange the constants.
-for op in ['fadd', 'fmul', 'iadd', 'imul']:
+for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
    optimizations += [
      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
    ]
@@ -2339,7 +2374,7 @@ for op in ['flrp']:
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
    ]

-for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
+for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
    optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
      (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
@@ -2622,6 +2657,13 @@ late_optimizations = [
    (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
     ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+
+   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
+    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)')), 'e(is_not_const)'),
+    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
+    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

    # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
    #
    #    If bits is zero, the result will be zero.
@@ -2716,7 +2758,7 @@ for op in ['fadd']:
      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
    ]

-for op in ['ffma']:
+for op in ['ffma', 'ffmaz']:
    late_optimizations += [
      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
diff --git a/src/compiler/nir/nir_range_analysis.c b/src/compiler/nir/nir_range_analysis.c
index 8cdefb8..24e3af1 100644
--- a/src/compiler/nir/nir_range_analysis.c
+++ b/src/compiler/nir/nir_range_analysis.c
@@ -858,7 +858,8 @@ analyze_expression(const nir_alu_instr *instr, unsigned src,
       break;
    }

-   case nir_op_fmul: {
+   case nir_op_fmul:
+   case nir_op_fmulz: {
       const struct ssa_result_range left =
          analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
       const struct ssa_result_range right =
@@ -878,14 +879,19 @@ analyze_expression(const nir_alu_instr *instr, unsigned src,
       } else
          r.range = fmul_table[left.range][right.range];

-      /* Mulitpliation produces NaN for X * NaN and for 0 * ±Inf.  If both
-       * operands are numbers and either both are finite or one is finite and
-       * the other cannot be zero, then the result must be a number.
-       */
-      r.is_a_number = (left.is_a_number && right.is_a_number) &&
-         ((left.is_finite && right.is_finite) ||
-          (!is_not_zero(left.range) && right.is_finite) ||
-          (left.is_finite && !is_not_zero(right.range)));
+      if (alu->op == nir_op_fmul) {
+         /* Multiplication produces NaN for X * NaN and for 0 * ±Inf.  If both
+          * operands are numbers and either both are finite or one is finite
+          * and the other cannot be zero, then the result must be a number.
+          */
+         r.is_a_number = (left.is_a_number && right.is_a_number) &&
+            ((left.is_finite && right.is_finite) ||
+             (!is_not_zero(left.range) && right.is_finite) ||
+             (left.is_finite && !is_not_zero(right.range)));
+      } else {
+         /* nir_op_fmulz: unlike nir_op_fmul, 0 * ±Inf is a number. */
+         r.is_a_number = left.is_a_number && right.is_a_number;
+      }

       break;
    }
@@ -1466,6 +1472,7 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
    case nir_op_ubfe:
    case nir_op_bfm:
    case nir_op_fmul:
+   case nir_op_fmulz:
    case nir_op_extract_u8:
    case nir_op_extract_i8:
    case nir_op_extract_u16:
@@ -1584,6 +1591,7 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
       }
       break;
    case nir_op_fmul:
+   case nir_op_fmulz:
       /* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */
       if (src0 < 0x7f800000u && src1 < 0x7f800000u) {
          float src0_f, src1_f;
diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
index 2030dff..c35253f 100644
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -288,7 +288,7 @@ is_not_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
    if (src_alu->op == nir_op_fneg)
       return is_not_fmul(ht, src_alu, 0, 0, NULL);

-   return src_alu->op != nir_op_fmul;
+   return src_alu->op != nir_op_fmul && src_alu->op != nir_op_fmulz;
 }

 static inline bool
@@ -304,7 +304,7 @@ is_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
    if (src_alu->op == nir_op_fneg)
       return is_fmul(ht, src_alu, 0, 0, NULL);

-   return src_alu->op == nir_op_fmul;
+   return src_alu->op == nir_op_fmul || src_alu->op == nir_op_fmulz;
 }

 static inline bool
@@ -499,6 +499,17 @@ is_finite(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
    return v.is_finite;
 }

+static inline bool
+is_finite_not_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
+                   unsigned src, UNUSED unsigned num_components,
+                   UNUSED const uint8_t *swizzle)
+{
+   const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
+
+   return v.is_finite &&
+          (v.range == lt_zero || v.range == gt_zero || v.range == ne_zero);
+}
+
 #define RELATION(r) \
 static inline bool \
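A note on the '~' markers on the new rules: in nir_opt_algebraic they flag rewrites that are only valid when the instruction is not marked exact, i.e. under fast-math-style semantics. For fmulz this is unavoidable: distributing fmulz(a, b) + fmulz(a, c) into fmulz(a, ('fadd', b, c)) can turn a NaN into a zero when a is infinite and b == -c. A quick demonstration against a rough Python model of fmulz (hypothetical, for illustration only):

import math

def fmulz(a, b):
    # Rough fmulz model: any zero operand forces a zero result.
    return 0.0 if (a == 0.0 or b == 0.0) else a * b

a, b, c = math.inf, 3.0, -3.0
lhs = fmulz(a, b) + fmulz(a, c)  # inf + -inf == NaN
rhs = fmulz(a, b + c)            # fmulz(inf, 0.0) flushes to 0.0
assert math.isnan(lhs) and rhs == 0.0

Unmarked rules such as (('fmulz', a, 0.0), 0.0) hold for every input and may therefore fire even on instructions marked exact.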