From: Lionel Landwerlin Date: Tue, 23 May 2023 10:11:02 +0000 (+0300) Subject: intel/fs: add support for sparse accesses X-Git-Tag: upstream/23.3.3~5025 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d33aff783d9fcf60009f2979f697939093ef701d;p=platform%2Fupstream%2Fmesa.git intel/fs: add support for sparse accesses Purely from the backend point of view it's just an additional parameter to sampler messages. Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Part-of: --- diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 4b13f48..6e8f955 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -883,6 +883,8 @@ enum tex_logical_srcs { TEX_LOGICAL_SRC_COORD_COMPONENTS, /** REQUIRED: Number of derivative components (as UD immediate) */ TEX_LOGICAL_SRC_GRAD_COMPONENTS, + /** REQUIRED: request residency (as UD immediate) */ + TEX_LOGICAL_SRC_RESIDENCY, TEX_LOGICAL_NUM_SRCS, }; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 6a8bd03..839e5d1 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -720,7 +720,8 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_TG4_OFFSET_LOGICAL: case SHADER_OPCODE_SAMPLEINFO_LOGICAL: assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && - src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); + src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM && + src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); /* Texture coordinates. 
*/ if (i == TEX_LOGICAL_SRC_COORDINATE) return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; @@ -1085,6 +1086,28 @@ fs_inst::implied_mrf_writes() const } } +bool +fs_inst::has_sampler_residency() const +{ + switch (opcode) { + case SHADER_OPCODE_TEX_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case SHADER_OPCODE_TXD_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: + case SHADER_OPCODE_TG4_OFFSET_LOGICAL: + case SHADER_OPCODE_TG4_LOGICAL: + assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; + default: + return false; + } +} + fs_reg fs_visitor::vgrf(const glsl_type *const type) { @@ -5488,46 +5511,68 @@ emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after, /* Specified channel group from the destination region. */ const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group); - const unsigned dst_size = inst->size_written / - inst->dst.component_size(inst->exec_size); - if (needs_dst_copy(lbld_after, inst)) { - const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size); + if (!needs_dst_copy(lbld_after, inst)) { + /* No need to allocate a temporary for the lowered instruction, just + * take the right group of channels from the original region. + */ + return dst; + } + + /* Deal with the residency data part later */ + const unsigned residency_size = inst->has_sampler_residency() ? REG_SIZE : 0; + const unsigned dst_size = (inst->size_written - residency_size) / + inst->dst.component_size(inst->exec_size); - if (inst->predicate) { - /* Handle predication by copying the original contents of - * the destination into the temporary before emitting the - * lowered instruction. 
- */ - const fs_builder gbld_before = - lbld_before.group(MIN2(lbld_before.dispatch_width(), - inst->exec_size), 0); - for (unsigned k = 0; k < dst_size; ++k) { - gbld_before.MOV(offset(tmp, lbld_before, k), - offset(dst, inst->exec_size, k)); - } - } + const fs_reg tmp = lbld_after.vgrf(inst->dst.type, + dst_size + inst->has_sampler_residency()); - const fs_builder gbld_after = - lbld_after.group(MIN2(lbld_after.dispatch_width(), - inst->exec_size), 0); + if (inst->predicate) { + /* Handle predication by copying the original contents of the + * destination into the temporary before emitting the lowered + * instruction. + */ + const fs_builder gbld_before = + lbld_before.group(MIN2(lbld_before.dispatch_width(), + inst->exec_size), 0); for (unsigned k = 0; k < dst_size; ++k) { - /* Use a builder of the right width to perform the copy avoiding - * uninitialized data if the lowered execution size is greater than - * the original execution size of the instruction. - */ - gbld_after.MOV(offset(dst, inst->exec_size, k), - offset(tmp, lbld_after, k)); + gbld_before.MOV(offset(tmp, lbld_before, k), + offset(dst, inst->exec_size, k)); } + } - return tmp; + const fs_builder gbld_after = + lbld_after.group(MIN2(lbld_after.dispatch_width(), + inst->exec_size), 0); + for (unsigned k = 0; k < dst_size; ++k) { + /* Use a builder of the right width to perform the copy avoiding + * uninitialized data if the lowered execution size is greater than the + * original execution size of the instruction. + */ + gbld_after.MOV(offset(dst, inst->exec_size, k), + offset(tmp, lbld_after, k)); + } - } else { - /* No need to allocate a temporary for the lowered instruction, just - * take the right group of channels from the original region. + if (inst->has_sampler_residency()) { + /* Sampler messages with residency need a special attention. In the + * first lane of the last component are located the Pixel Null Mask + * (bits 0:15) & some upper bits we need to discard (bits 16:31). 
We + * have to build a single 32bit value for the SIMD32 message out of 2 + * SIMD16 16 bit values. */ - return dst; + const fs_builder rbld = gbld_after.exec_all().group(1, 0); + fs_reg local_res_reg = component( + retype(offset(tmp, lbld_before, dst_size), + BRW_REGISTER_TYPE_UW), 0); + fs_reg final_res_reg = + retype(byte_offset(inst->dst, + inst->size_written - residency_size + + gbld_after.group() / 8), + BRW_REGISTER_TYPE_UW); + rbld.MOV(final_res_reg, local_res_reg); } + + return tmp; } bool @@ -5553,7 +5598,10 @@ fs_visitor::lower_simd_width() * original or the lowered instruction, whichever is lower. */ const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width); - const unsigned dst_size = inst->size_written / + const unsigned residency_size = + inst->has_sampler_residency() ? REG_SIZE : 0; + const unsigned dst_size = + (inst->size_written - residency_size) / inst->dst.component_size(inst->exec_size); assert(!inst->writes_accumulator && !inst->mlen); @@ -5626,7 +5674,8 @@ fs_visitor::lower_simd_width() split_inst.dst = emit_zip(lbld.at(block, inst), lbld.at(block, after_inst), inst); split_inst.size_written = - split_inst.dst.component_size(lower_width) * dst_size; + split_inst.dst.component_size(lower_width) * dst_size + + residency_size; lbld.at(block, inst->next).emit(split_inst); } diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 243a13a..5ee7fe3 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3252,6 +3252,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0); fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); inst->size_written = 4 * inst->dst.component_size(inst->exec_size); @@ -4500,6 +4501,7 @@ 
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); /* Since the image size is always uniform, we can just emit a SIMD8 * query instruction and splat the result out. @@ -6446,6 +6448,19 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) { fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; + /* SKL PRMs: Volume 7: 3D-Media-GPGPU: + * + * "The Pixel Null Mask field, when enabled via the Pixel Null Mask + * Enable will be incorrect for sample_c when applied to a surface with + * 64-bit per texel format such as R16G16B16A16_UNORM. Pixel Null mask + * Enable may incorrectly report pixels as referencing a Null surface." + * + * We'll take care of this in NIR. + */ + assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE); + + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse); + int lod_components = 0; /* The hardware requires a LOD for buffer textures */ @@ -6700,7 +6715,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) } } - fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); + fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse); fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); inst->offset = header_bits; @@ -6710,10 +6725,17 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) assert(instr->dest.is_ssa); unsigned write_mask = nir_ssa_def_components_read(&instr->dest.ssa); assert(write_mask != 0); /* dead code should have been eliminated */ - inst->size_written = util_last_bit(write_mask) * - inst->dst.component_size(inst->exec_size); + if (instr->is_sparse) { + inst->size_written = (util_last_bit(write_mask) - 1) * + inst->dst.component_size(inst->exec_size) + +
REG_SIZE; + } else { + inst->size_written = util_last_bit(write_mask) * + inst->dst.component_size(inst->exec_size); + } } else { - inst->size_written = 4 * inst->dst.component_size(inst->exec_size); + inst->size_written = 4 * inst->dst.component_size(inst->exec_size) + + (instr->is_sparse ? REG_SIZE : 0); } if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) @@ -6748,6 +6770,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); } + /* The residency bits are only in the first component. */ + if (instr->is_sparse) + nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0); + bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); } diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index f713e9f..1e20f11 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -49,6 +49,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle; srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components); srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); + srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0); fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, ARRAY_SIZE(srcs)); diff --git a/src/intel/compiler/brw_ir.h b/src/intel/compiler/brw_ir.h index cfe30f6..d792d6a 100644 --- a/src/intel/compiler/brw_ir.h +++ b/src/intel/compiler/brw_ir.h @@ -30,7 +30,12 @@ #include "compiler/glsl/list.h" #define MAX_SAMPLER_MESSAGE_SIZE 11 -#define MAX_VGRF_SIZE 16 + +/* The sampler can return a vec5 when sampling with sparse residency. In + * SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20 + * VGRFs to hold the result. 
+ */ +#define MAX_VGRF_SIZE 20 #ifdef __cplusplus struct backend_reg : private brw_reg diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 06a2346..c7215ca 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -413,6 +413,12 @@ public: */ unsigned flags_written(const intel_device_info *devinfo) const; + /** + * Return true if this instruction is a sampler message gathering residency + * data. + */ + bool has_sampler_residency() const; + fs_reg dst; fs_reg *src; diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index d86a902..e247ce8 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -806,7 +806,8 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &tg4_offset, unsigned payload_type_bit_size, unsigned coord_components, - unsigned grad_components) + unsigned grad_components, + bool residency) { const brw_compiler *compiler = bld.shader->compiler; const intel_device_info *devinfo = bld.shader->devinfo; @@ -830,7 +831,8 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, inst->offset != 0 || inst->eot || op == SHADER_OPCODE_SAMPLEINFO || sampler_handle.file != BAD_FILE || - is_high_sampler(devinfo, sampler)) { + is_high_sampler(devinfo, sampler) || + residency) { /* For general texture offsets (no txf workaround), we need a header to * put them in. * @@ -847,12 +849,16 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op, * and we have an explicit header, we need to set up the sampler * writemask. It's reversed from normal: 1 means "don't write". 
*/ - if (!inst->eot && regs_written(inst) != 4 * reg_width) { - assert(regs_written(inst) % reg_width == 0); - unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf; + unsigned reg_count = regs_written(inst) - residency; + if (!inst->eot && reg_count < 4 * reg_width) { + assert(reg_count % reg_width == 0); + unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf; inst->offset |= mask << 12; } + if (residency) + inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */ + /* Build the actual header */ const fs_builder ubld = bld.exec_all().group(8, 0); const fs_builder ubld1 = ubld.group(1, 0); @@ -1301,6 +1307,10 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; + assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); + const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; + /* residency is only supported on Gfx8+ */ + assert(!residency || devinfo->ver >= 8); if (devinfo->ver >= 7) { const unsigned msg_payload_type_bit_size = @@ -1316,7 +1326,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) surface_handle, sampler_handle, tg4_offset, msg_payload_type_bit_size, - coord_components, grad_components); + coord_components, grad_components, + residency); } else if (devinfo->ver >= 5) { lower_sampler_logical_send_gfx5(bld, inst, op, coordinate, shadow_c, lod, lod2, sample_index,