From 98b4cc7110d47ae2c8bab7c530cad757300a5323 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 16 Apr 2020 19:12:19 +0100 Subject: [PATCH] aco: refactor store_lds() to use new helpers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It should also work correctly for 8/16-bit stores Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 181 ++++++++++++------------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 411744e..1c2b172 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3637,67 +3637,6 @@ Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned si return res; } -void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align) -{ - Builder bld(ctx->program, ctx->block); - unsigned bytes_written = 0; - bool large_ds_write = ctx->options->chip_class >= GFX7; - bool usable_write2 = ctx->options->chip_class >= GFX7; - - while (bytes_written < total_size * 4) { - unsigned todo = total_size * 4 - bytes_written; - bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0; - bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0; - - aco_opcode op = aco_opcode::last_opcode; - bool write2 = false; - unsigned size = 0; - if (todo >= 16 && aligned16 && large_ds_write) { - op = aco_opcode::ds_write_b128; - size = 4; - } else if (todo >= 16 && aligned8 && usable_write2) { - op = aco_opcode::ds_write2_b64; - write2 = true; - size = 4; - } else if (todo >= 12 && aligned16 && large_ds_write) { - op = aco_opcode::ds_write_b96; - size = 3; - } else if (todo >= 8 && aligned8) { - op = aco_opcode::ds_write_b64; - size = 2; - } else if (todo >= 8 && usable_write2) { - op = aco_opcode::ds_write2_b32; - write2 = true; - size = 2; - } else if (todo >= 4) { - op = aco_opcode::ds_write_b32; - size = 1; - } else { - assert(false); - } - - unsigned offset = offset0 + offset1 + bytes_written; - unsigned max_offset = write2 ? 1020 : 65535; - Temp address_offset = address; - if (offset > max_offset) { - address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset); - offset = offset1 + bytes_written; - } - assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */ - - if (write2) { - Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr); - Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr); - bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1); - } else { - Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr); - bld.ds(op, address_offset, val, m, offset); - } - - bytes_written += size * 4; - } -} - void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src) { if (!count) @@ -3803,48 +3742,108 @@ void advance_write_mask(uint32_t *todo_mask, int start, int count) void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address, unsigned base_offset, unsigned align) { - assert(util_is_power_of_two_nonzero(align) && align >= 4); - assert(elem_size_bytes == 4 || elem_size_bytes == 8); + assert(util_is_power_of_two_nonzero(align)); + assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); Builder bld(ctx->program, ctx->block); - Operand m = load_lds_size_m0(bld); + bool large_ds_write = ctx->options->chip_class >= GFX7; + bool usable_write2 = ctx->options->chip_class >= GFX7; - /* we need at most two stores, assuming that the writemask is at most 4 bits wide */ - assert(wrmask <= 0x0f); - int start[2], count[2]; - u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]); - u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]); - assert(wrmask == 0); + unsigned write_count = 0; + Temp write_datas[32]; + unsigned offsets[32]; + aco_opcode opcodes[32]; - /* one combined store is sufficient */ - if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) { - Temp address_offset = address; - if ((base_offset / elem_size_bytes) + start[1] > 255) { - address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); - base_offset = 0; + wrmask = widen_mask(wrmask, elem_size_bytes); + + uint32_t todo = u_bit_consecutive(0, data.bytes()); + while (todo) { + int offset, bytes; + if (!scan_write_mask(wrmask, todo, &offset, &bytes)) { + offsets[write_count] = offset; + opcodes[write_count] = aco_opcode::num_opcodes; + write_count++; + advance_write_mask(&todo, offset, bytes); + continue; } - assert(count[0] == 1); - RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4); + bool aligned2 = offset % 2 == 0 && align % 2 == 0; + bool aligned4 = offset % 4 == 0 && align % 4 == 0; + bool aligned8 = offset % 8 == 0 && align % 8 == 0; + bool aligned16 = offset % 16 == 0 && align % 16 == 0; - Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc); - Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc); - aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; - base_offset = base_offset / elem_size_bytes; - bld.ds(op, address_offset, val0, val1, m, - base_offset + start[0], base_offset + start[1]); - return; + //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial + aco_opcode op = aco_opcode::num_opcodes; + if (bytes >= 16 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b128; + bytes = 16; + } else if (bytes >= 12 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b96; + bytes = 12; + } else if (bytes >= 8 && aligned8) { + op = aco_opcode::ds_write_b64; + bytes = 8; + } else if (bytes >= 4 && aligned4) { + op = aco_opcode::ds_write_b32; + bytes = 4; + } else if (bytes >= 2 && aligned2) { + op = aco_opcode::ds_write_b16; + bytes = 2; + } else if (bytes >= 1) { + op = aco_opcode::ds_write_b8; + bytes = 1; + } else { + assert(false); + } + + offsets[write_count] = offset; + opcodes[write_count] = op; + write_count++; + advance_write_mask(&todo, offset, bytes); } - for (unsigned i = 0; i < 2; i++) { - if (count[i] == 0) + Operand m = load_lds_size_m0(bld); + + split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data); + + for (unsigned i = 0; i < write_count; i++) { + aco_opcode op = opcodes[i]; + if (op == aco_opcode::num_opcodes) continue; - unsigned elem_size_words = elem_size_bytes / 4; - ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words, - base_offset, start[i] * elem_size_bytes, align); + Temp data = write_datas[i]; + + unsigned second = write_count; + if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) { + for (second = i + 1; second < write_count; second++) { + if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) { + op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; + opcodes[second] = aco_opcode::num_opcodes; + break; + } + } + } + + bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64; + unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes(); + + unsigned inline_offset = base_offset + offsets[i]; + unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535; + Temp address_offset = address; + if (inline_offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); + inline_offset = offsets[i]; + } + assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ + + if (write2) { + Temp second_data = write_datas[second]; + inline_offset /= data.bytes(); + bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); + } else { + bld.ds(op, address_offset, data, m, inline_offset); + } } - return; } unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset) -- 2.7.4