From 381de3c809fce5427308c696bbd313360194eff4 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 1 Dec 2022 15:05:49 +0000 Subject: [PATCH] aco: more carefully apply constant offsets into scratch accesses MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Death stranding does scratch_arr[80-idx]. This doesn't seem to work if we try to combine the subtraction into the access. fossil-db (navi21): Totals from 52 (0.04% of 135636) affected shaders: Instrs: 78560 -> 79036 (+0.61%) CodeSize: 427940 -> 431188 (+0.76%) Latency: 1313809 -> 1318142 (+0.33%) InvThroughput: 292833 -> 293842 (+0.34%) VClause: 2361 -> 2555 (+8.22%); split: -0.51%, +8.73% Copies: 8767 -> 8746 (-0.24%); split: -0.35%, +0.11% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Fixes: 0e783d687a3 ("aco: use scratch_* for scratch load/store on GFX9+") Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7735 Part-of: --- src/amd/compiler/aco_optimizer.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 4b8d77a..abf3e92 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1255,12 +1255,14 @@ is_op_canonicalized(opt_ctx& ctx, Operand op) } bool -is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int32_t offset) +is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1) { bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10; int32_t min = ctx.program->dev.scratch_global_offset_min; int32_t max = ctx.program->dev.scratch_global_offset_max; + int64_t offset = offset0 + offset1; + bool has_vgpr_offset = instr && !instr->operands[0].isUndefined(); if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4) return false; @@ -1467,15 +1469,28 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; - if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && + /* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means + * we can't combine the addition if the unsigned addition overflows and offset is + * positive. In theory, there is also issues if + * 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just + * replaces an already out-of-bounds access with a larger one since 'saddr + offset' + * would be larger than INT32_MAX. + */ + if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == instr->operands[i].regClass() && - is_scratch_offset_valid(ctx, instr.get(), scratch.offset + (int32_t)offset)) { + is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) { + instr->operands[i].setTemp(base); + scratch.offset += (int32_t)offset; + continue; + } else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && + base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 && + is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) { instr->operands[i].setTemp(base); scratch.offset += (int32_t)offset; continue; } else if (i <= 1 && info.is_constant_or_literal(32) && ctx.program->gfx_level >= GFX10_3 && - is_scratch_offset_valid(ctx, NULL, scratch.offset + (int32_t)info.val)) { + is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) { /* GFX10.3+ can disable both SADDR and ADDR. */ instr->operands[i] = Operand(instr->operands[i].regClass()); scratch.offset += (int32_t)info.val; -- 2.7.4