From ad9bc1ffb57e36d0c8f4e57c11f82785b1ef5bdc Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Fri, 13 Jan 2023 12:29:30 +0200 Subject: [PATCH] intel/fs: enable UBO accesses through bindless heap Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_eu_defines.h | 15 +++ src/intel/compiler/brw_fs.cpp | 14 ++- src/intel/compiler/brw_fs.h | 3 +- src/intel/compiler/brw_fs_nir.cpp | 29 +++--- src/intel/compiler/brw_lower_logical_sends.cpp | 132 +++++++++++++++---------- 5 files changed, 119 insertions(+), 74 deletions(-) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 824e54a..0cfe546 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -890,6 +890,8 @@ enum tex_logical_srcs { enum pull_uniform_constant_srcs { /** Surface binding table index */ PULL_UNIFORM_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE, /** Surface offset */ PULL_UNIFORM_CONSTANT_SRC_OFFSET, /** Pull size */ @@ -898,6 +900,19 @@ enum pull_uniform_constant_srcs { PULL_UNIFORM_CONSTANT_SRCS, }; +enum pull_varying_constant_srcs { + /** Surface binding table index */ + PULL_VARYING_CONSTANT_SRC_SURFACE, + /** Surface bindless handle */ + PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE, + /** Surface offset */ + PULL_VARYING_CONSTANT_SRC_OFFSET, + /** Pull alignment */ + PULL_VARYING_CONSTANT_SRC_ALIGNMENT, + + PULL_VARYING_CONSTANT_SRCS, +}; + enum get_buffer_size_srcs { /** Surface binding table index */ GET_BUFFER_SIZE_SRC_SURFACE, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index ff89c39..0eefc5f 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -168,7 +168,8 @@ fs_inst::resize_sources(uint8_t num_sources) void fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, const fs_reg &dst, - const fs_reg &surf_index, + const fs_reg &surface, + const fs_reg &surface_handle, const fs_reg &varying_offset, uint32_t const_offset, uint8_t alignment) @@ -194,9 +195,15 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * result. */ fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); + + fs_reg srcs[PULL_VARYING_CONSTANT_SRCS]; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = vec4_offset; + srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment); + fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, - vec4_result, surf_index, vec4_offset, - brw_imm_ud(alignment)); + vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS); inst->size_written = 4 * vec4_result.component_size(inst->exec_size); shuffle_from_32bit_read(bld, dst, vec4_result, @@ -2513,6 +2520,7 @@ fs_visitor::lower_constant_loads() VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, brw_imm_ud(index), + fs_reg() /* surface_handle */, inst->src[1], pull_index * 4, 4); inst->remove(block); diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index d928e2b..138ccbd 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -216,7 +216,8 @@ public: void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld, const fs_reg &dst, - const fs_reg &surf_index, + const fs_reg &surface, + const fs_reg &surface_handle, const fs_reg &varying_offset, uint32_t const_offset, uint8_t alignment); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 202bfea..2a76a9a 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4643,26 +4643,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr } case nir_intrinsic_load_ubo: { - fs_reg surf_index; - if (nir_src_is_const(instr->src[0])) { - const unsigned index = nir_src_as_uint(instr->src[0]); - surf_index = brw_imm_ud(index); - } else { - /* The block index is not a constant. Evaluate the index expression - * per-channel and add the base UBO index; we have to select a value - * from any live channel. - */ - surf_index = vgrf(glsl_type::uint_type); - bld.MOV(surf_index, get_nir_src(instr->src[0])); - surf_index = bld.emit_uniformize(surf_index); - } + fs_reg surface, surface_handle; + + if (get_nir_src_bindless(instr->src[0])) + surface_handle = get_nir_buffer_intrinsic_index(bld, instr); + else + surface = get_nir_buffer_intrinsic_index(bld, instr); if (!nir_src_is_const(instr->src[1])) { fs_reg base_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD); for (int i = 0; i < instr->num_components; i++) - VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, + VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), + surface, surface_handle, base_offset, i * type_sz(dest.type), nir_dest_bit_size(instr->dest) / 8); @@ -4717,9 +4711,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr (block_sz - base % block_sz) / type_size); fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; - srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surf_index; - srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); - srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, srcs, PULL_UNIFORM_CONSTANT_SRCS); diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 45e92a1..78bc0bc 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -2301,16 +2301,23 @@ lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, const intel_device_info *devinfo = bld.shader->devinfo; ASSERTED const brw_compiler *compiler = bld.shader->compiler; - fs_reg surface = inst->src[0]; + fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE]; + fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE]; + fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET]; + fs_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]; /* We are switching the instruction from an ALU-like instruction to a * send-from-grf instruction. Since sends can't handle strides or * source modifiers, we have to make a copy of the offset source. */ - fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1); + fs_reg ubo_offset = bld.move_to_vgrf(offset_B, 1); - assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); - unsigned alignment = inst->src[2].ud; + enum lsc_addr_surface_type surf_type = + surface_handle.file == BAD_FILE ? + LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS; + + assert(alignment_B.file == BRW_IMMEDIATE_VALUE); + unsigned alignment = alignment_B.ud; inst->opcode = SHADER_OPCODE_SEND; inst->sfid = GFX12_SFID_UGM; @@ -2318,31 +2325,39 @@ lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, assert(!compiler->indirect_ubos_use_sampler); + inst->src[0] = brw_imm_ud(0); inst->src[2] = ubo_offset; /* payload */ + if (alignment >= 4) { - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, - LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - 4 /* num_channels */, - false /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); + inst->desc = + lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size, + surf_type, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + 4 /* num_channels */, + false /* transpose */, + LSC_CACHE_LOAD_L1STATE_L3MOCS, + true /* has_dest */); inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - setup_lsc_surface_descriptors(bld, inst, inst->desc, surface); + setup_lsc_surface_descriptors(bld, inst, inst->desc, + surface.file != BAD_FILE ? + surface : surface_handle); } else { - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, - LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - 1 /* num_channels */, - false /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); + inst->desc = + lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size, + surf_type, LSC_ADDR_SIZE_A32, + 1 /* num_coordinates */, + LSC_DATA_SIZE_D32, + 1 /* num_channels */, + false /* transpose */, + LSC_CACHE_LOAD_L1STATE_L3MOCS, + true /* has_dest */); inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - setup_lsc_surface_descriptors(bld, inst, inst->desc, surface); + setup_lsc_surface_descriptors(bld, inst, inst->desc, + surface.file != BAD_FILE ? + surface : surface_handle); /* The byte scattered messages can only read one dword at a time so * we have to duplicate the message 4 times to read the full vec4. @@ -2375,55 +2390,56 @@ lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) const brw_compiler *compiler = bld.shader->compiler; if (devinfo->ver >= 7) { - fs_reg index = inst->src[0]; + fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE]; + fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE]; + fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET]; + /* We are switching the instruction from an ALU-like instruction to a * send-from-grf instruction. Since sends can't handle strides or * source modifiers, we have to make a copy of the offset source. */ fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); - bld.MOV(ubo_offset, inst->src[1]); + bld.MOV(ubo_offset, offset_B); - assert(inst->src[2].file == BRW_IMMEDIATE_VALUE); - unsigned alignment = inst->src[2].ud; + assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == BRW_IMMEDIATE_VALUE); + unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud; inst->opcode = SHADER_OPCODE_SEND; inst->mlen = inst->exec_size / 8; inst->resize_sources(3); - if (index.file == IMM) { - inst->desc = index.ud & 0xff; - inst->src[0] = brw_imm_ud(0); - } else { - inst->desc = 0; - const fs_builder ubld = bld.exec_all().group(1, 0); - fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.AND(tmp, index, brw_imm_ud(0xff)); - inst->src[0] = component(tmp, 0); - } - inst->src[1] = brw_imm_ud(0); /* ex_desc */ + /* src[0] & src[1] are filled by setup_surface_descriptors() */ inst->src[2] = ubo_offset; /* payload */ if (compiler->indirect_ubos_use_sampler) { const unsigned simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : BRW_SAMPLER_SIMD_MODE_SIMD16; + const uint32_t desc = brw_sampler_desc(devinfo, 0, 0, + GFX5_SAMPLER_MESSAGE_SAMPLE_LD, + simd_mode, 0); inst->sfid = BRW_SFID_SAMPLER; - inst->desc |= brw_sampler_desc(devinfo, 0, 0, - GFX5_SAMPLER_MESSAGE_SAMPLE_LD, - simd_mode, 0); + setup_surface_descriptors(bld, inst, desc, surface, surface_handle); } else if (alignment >= 4) { + const uint32_t desc = + brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, + 4, /* num_channels */ + false /* write */); + inst->sfid = (devinfo->verx10 >= 75 ? HSW_SFID_DATAPORT_DATA_CACHE_1 : GFX7_SFID_DATAPORT_DATA_CACHE); - inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, - 4, /* num_channels */ - false /* write */); + setup_surface_descriptors(bld, inst, desc, surface, surface_handle); } else { + const uint32_t desc = + brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, + 32, /* bit_size */ + false /* write */); + inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; - inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, - 32, /* bit_size */ - false /* write */); + setup_surface_descriptors(bld, inst, desc, surface, surface_handle); + /* The byte scattered messages can only read one dword at a time so * we have to duplicate the message 4 times to read the full vec4. * Hopefully, dead code will clean up the mess if some of them aren't @@ -2447,16 +2463,22 @@ lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) } } } else { + fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE]; + fs_reg offset = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET]; + assert(inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE].file == BAD_FILE); + const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver), BRW_REGISTER_TYPE_UD); - bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]); + bld.MOV(byte_offset(payload, REG_SIZE), offset); inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4; - inst->resize_sources(1); inst->base_mrf = payload.nr; inst->header_size = 1; inst->mlen = 1 + inst->exec_size / 8; + + inst->resize_sources(1); + inst->src[0] = surface; } } @@ -2965,8 +2987,10 @@ fs_visitor::lower_uniform_pull_constant_loads() continue; const fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE]; + const fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE]; const fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]; const fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE]; + assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE); assert(offset_B.file == IMM); assert(size_B.file == IMM); @@ -2980,7 +3004,9 @@ fs_visitor::lower_uniform_pull_constant_loads() inst->sfid = GFX12_SFID_UGM; inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, 1 /* simd_size */, - LSC_ADDR_SURFTYPE_BTI, + surface_handle.file == BAD_FILE ? + LSC_ADDR_SURFTYPE_BTI : + LSC_ADDR_SURFTYPE_BSS, LSC_ADDR_SIZE_A32, 1 /* num_coordinates */, LSC_DATA_SIZE_D32, @@ -3001,7 +3027,9 @@ fs_visitor::lower_uniform_pull_constant_loads() /* Finally, the payload */ inst->resize_sources(3); - setup_lsc_surface_descriptors(ubld, inst, inst->desc, surface); + setup_lsc_surface_descriptors(ubld, inst, inst->desc, + surface.file != BAD_FILE ? + surface : surface_handle); inst->src[2] = payload; invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); @@ -3025,9 +3053,7 @@ fs_visitor::lower_uniform_pull_constant_loads() inst->resize_sources(4); - setup_surface_descriptors(ubld, inst, desc, - inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE], - fs_reg() /* surface_handle */); + setup_surface_descriptors(ubld, inst, desc, surface, surface_handle); inst->src[2] = header; inst->src[3] = fs_reg(); /* unused for reads */ -- 2.7.4