From d9a77f9ca36177c3295036588f204a9e49e81a09 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Thu, 3 Sep 2020 05:31:36 -0400
Subject: [PATCH] ac/llvm: add better code for fsign

There are 2 improvements:
- better code for 16, 32, and 64 bits
- vector support for 16 and 32 bits

Totals:
SGPRS: 2639738 -> 2625882 (-0.52 %)
VGPRS: 1534120 -> 1533916 (-0.01 %)
Spilled SGPRs: 3541 -> 3557 (0.45 %)
Spilled VGPRs: 33 -> 33 (0.00 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 292 -> 292 (0.00 %) dwords per thread
Code Size: 55640332 -> 55384892 (-0.46 %) bytes
Max Waves: 964785 -> 964857 (0.01 %)

Totals from affected shaders:
SGPRS: 377352 -> 363496 (-3.67 %)
VGPRS: 209800 -> 209596 (-0.10 %)
Spilled SGPRs: 1979 -> 1995 (0.81 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 256 -> 256 (0.00 %) dwords per thread
Code Size: 12549300 -> 12293860 (-2.04 %) bytes
Max Waves: 105762 -> 105834 (0.07 %)

Reviewed-by: Bas Nieuwenhuizen
Part-of:
---
 src/amd/llvm/ac_llvm_build.c    | 64 +++++++++++++++++++++++++++--------------
 src/amd/llvm/ac_llvm_build.h    |  4 +--
 src/amd/llvm/ac_llvm_helper.cpp | 25 ++++++++++++++++
 src/amd/llvm/ac_llvm_util.h     |  3 ++
 src/amd/llvm/ac_nir_to_llvm.c   |  3 +-
 5 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index a3fa30f..861db0d 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -2776,31 +2776,53 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
    return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
 }
 
-LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
-                            unsigned bitsize)
+static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
 {
-   LLVMValueRef cmp, val, zero, one;
-   LLVMTypeRef type;
+   ac_enable_signed_zeros(ctx);
+   /* (val + 0) converts negative zero to positive zero. */
+   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
+   ac_disable_signed_zeros(ctx);
+   return val;
+}
 
-   if (bitsize == 16) {
-      type = ctx->f16;
-      zero = ctx->f16_0;
-      one = ctx->f16_1;
-   } else if (bitsize == 32) {
-      type = ctx->f32;
-      zero = ctx->f32_0;
-      one = ctx->f32_1;
-   } else {
-      type = ctx->f64;
-      zero = ctx->f64_0;
-      one = ctx->f64_1;
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
+{
+   LLVMTypeRef type = LLVMTypeOf(src);
+   LLVMValueRef pos, neg, dw[2], val;
+   unsigned bitsize = ac_get_elem_bits(ctx, type);
+
+   /* The standard version leads to this:
+    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0       ; D40B0000 00010004
+    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
+    *   v_cmp_le_f32_e32 vcc, 0, v4           ; 7C060880
+    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc   ; 020808F3
+    *
+    * The isign version:
+    *   v_add_f32_e64 v4, s4, 0               ; D5030004 00010004
+    *   v_med3_i32 v4, v4, -1, 1              ; D5580004 02058304
+    *   v_cvt_f32_i32_e32 v4, v4              ; 7E080B04
+    *
+    * (src0 + 0) converts negative zero to positive zero.
+    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
+    *
+    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
+    * reduction. (FP64 comparisons are as fast as int64 comparisons)
+    */
+   if (bitsize == 16 || bitsize == 32) {
+      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
+      val = ac_build_isign(ctx, val);
+      return LLVMBuildSIToFP(ctx->builder, val, type, "");
    }
 
-   cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
-   val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
-   cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
-   val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
-   return val;
+   assert(bitsize == 64);
+   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
+   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
+   dw[0] = ctx->i32_0;
+   dw[1] = LLVMBuildSelect(ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
+                           LLVMBuildSelect(ctx->builder, neg,
+                                           LLVMConstInt(ctx->i32, 0xBFF00000, 0),
+                                           ctx->i32_0, ""), "");
+   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
 }
 
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 7cb98d0..6491843 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -598,9 +598,7 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags);
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
                             unsigned bitsize);
 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0);
-LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
-                            unsigned bitsize);
-
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src);
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0);
 
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
diff --git a/src/amd/llvm/ac_llvm_helper.cpp b/src/amd/llvm/ac_llvm_helper.cpp
index 10cf301..184f76a 100644
--- a/src/amd/llvm/ac_llvm_helper.cpp
+++ b/src/amd/llvm/ac_llvm_helper.cpp
@@ -121,6 +121,31 @@ LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
    return builder;
 }
 
+void ac_enable_signed_zeros(struct ac_llvm_context *ctx)
+{
+   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
+      auto *b = llvm::unwrap(ctx->builder);
+      llvm::FastMathFlags flags = b->getFastMathFlags();
+
+      /* This disables the optimization of (x + 0), which is used
+       * to convert negative zero to positive zero.
+       */
+      flags.setNoSignedZeros(false);
+      b->setFastMathFlags(flags);
+   }
+}
+
+void ac_disable_signed_zeros(struct ac_llvm_context *ctx)
+{
+   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
+      auto *b = llvm::unwrap(ctx->builder);
+      llvm::FastMathFlags flags = b->getFastMathFlags();
+
+      flags.setNoSignedZeros();
+      b->setFastMathFlags(flags);
+   }
+}
+
 LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple)
 {
diff --git a/src/amd/llvm/ac_llvm_util.h b/src/amd/llvm/ac_llvm_util.h
index 8039c99..d44d4de 100644
--- a/src/amd/llvm/ac_llvm_util.h
+++ b/src/amd/llvm/ac_llvm_util.h
@@ -37,6 +37,7 @@ extern "C" {
 #endif
 
 struct ac_compiler_passes;
+struct ac_llvm_context;
 
 enum ac_func_attr {
    AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0),
@@ -109,6 +110,8 @@ LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx);
 LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
                                  enum ac_float_mode float_mode);
+void ac_enable_signed_zeros(struct ac_llvm_context *ctx);
+void ac_disable_signed_zeros(struct ac_llvm_context *ctx);
 
 void ac_llvm_add_target_dep_function_attr(LLVMValueRef F,
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index ddea781..4b696f2 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -826,8 +826,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
       break;
    case nir_op_fsign:
      src[0] = ac_to_float(&ctx->ac, src[0]);
-      result = ac_build_fsign(&ctx->ac, src[0],
-                              instr->dest.dest.ssa.bit_size);
+      result = ac_build_fsign(&ctx->ac, src[0]);
       break;
    case nir_op_ffloor:
       result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
-- 
2.7.4
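
For reference, a minimal standalone C sketch (not part of the patch) of the identity the new 16/32-bit path relies on: once an add with 0 has turned negative zero into positive zero, the float's bit pattern read as a signed integer has the same sign as the float, so fsign(x) == (float)isign(floatBitsToInt(x + 0)). The helper names isign_i32, fsign_ref and fsign_new are made up for this illustration; fsign_ref mirrors the old compare/select lowering, fsign_new mirrors the new add-then-isign lowering, and the FP64 check only confirms that a double with low dword 0 and high dword 0x3FF00000 / 0xBFF00000 is exactly 1.0 / -1.0, which is what the 64-bit path constructs.

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Integer sign clamped to [-1, 1]; v_med3_i32 does this in one instruction. */
static int32_t isign_i32(int32_t v)
{
   return v < -1 ? -1 : (v > 1 ? 1 : v);
}

/* Old lowering: two compare/select pairs (OGT, then OGE). */
static float fsign_ref(float x)
{
   float val = x > 0.0f ? 1.0f : x;
   return val >= 0.0f ? val : -1.0f;
}

/* New lowering: add 0 to eliminate -0, then isign on the bit pattern. */
static float fsign_new(float x)
{
   /* Plain C preserves signed zeros, so this add is not folded away;
    * the patch gets the same guarantee by temporarily clearing the
    * no-signed-zeros fast-math flag around the fadd. */
   float no_neg_zero = x + 0.0f;
   int32_t bits;
   memcpy(&bits, &no_neg_zero, sizeof(bits));
   return (float)isign_i32(bits);
}

int main(void)
{
   const float tests[] = { 0.0f, -0.0f, 1.5f, -2.25f, 1e-38f, -1e-38f, INFINITY, -INFINITY };
   for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
      assert(fsign_new(tests[i]) == fsign_ref(tests[i]));

   /* FP64 path: low dword 0 with high dword 0x3FF00000 is exactly 1.0,
    * and high dword 0xBFF00000 is exactly -1.0. */
   uint64_t pos_one_bits = (uint64_t)0x3FF00000u << 32;
   uint64_t neg_one_bits = (uint64_t)0xBFF00000u << 32;
   double d;
   memcpy(&d, &pos_one_bits, sizeof(d));
   assert(d == 1.0);
   memcpy(&d, &neg_one_bits, sizeof(d));
   assert(d == -1.0);
   return 0;
}

On the GPU, the clamp to [-1, 1] is a single v_med3_i32 and the final conversion a single v_cvt_f32_i32, which is where the code-size reduction in the shader-db stats above comes from.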