intel/fs: add support for sparse accesses
author Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Tue, 23 May 2023 10:11:02 +0000 (13:11 +0300)
committer Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Wed, 26 Jul 2023 23:02:30 +0000 (02:02 +0300)
From the backend's point of view, sparse accesses are just an additional
parameter to sampler messages: a new required residency source that asks the
sampler to return the Pixel Null Mask alongside the texel data.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23882>

src/intel/compiler/brw_eu_defines.h
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs_nir.cpp
src/intel/compiler/brw_fs_visitor.cpp
src/intel/compiler/brw_ir.h
src/intel/compiler/brw_ir_fs.h
src/intel/compiler/brw_lower_logical_sends.cpp

src/intel/compiler/brw_eu_defines.h
index 4b13f48..6e8f955 100644 (file)
@@ -883,6 +883,8 @@ enum tex_logical_srcs {
    TEX_LOGICAL_SRC_COORD_COMPONENTS,
    /** REQUIRED: Number of derivative components (as UD immediate) */
    TEX_LOGICAL_SRC_GRAD_COMPONENTS,
+   /** REQUIRED: request residency (as UD immediate) */
+   TEX_LOGICAL_SRC_RESIDENCY,
 
    TEX_LOGICAL_NUM_SRCS,
 };
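Every texture logical send now carries this extra REQUIRED source: emitters
that never need residency set it to 0, and nir_emit_texture() sets it from the
instruction's is_sparse flag, as the hunks below show. The following is a
standalone sketch of that convention only; the enum, srcs array and helper are
local stand-ins, not Mesa's fs_reg/brw_imm_ud API.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Local mirror of the source layout; only the last entry is new. */
    enum tex_srcs_sketch {
       SRC_COORD_COMPONENTS,
       SRC_GRAD_COMPONENTS,
       SRC_RESIDENCY,   /* REQUIRED: 0 = normal access, 1 = report residency */
       NUM_SRCS,
    };

    static void emit_tex_sketch(bool is_sparse)
    {
       unsigned srcs[NUM_SRCS];

       srcs[SRC_COORD_COMPONENTS] = 2;
       srcs[SRC_GRAD_COMPONENTS] = 0;
       /* Callers that never request residency (MCS fetch, image size, FB
        * read) still initialize the source explicitly, just to 0. */
       srcs[SRC_RESIDENCY] = is_sparse ? 1 : 0;

       assert(srcs[SRC_RESIDENCY] <= 1);
       printf("residency requested: %u\n", srcs[SRC_RESIDENCY]);
    }

    int main(void)
    {
       emit_tex_sketch(false);
       emit_tex_sketch(true);
       return 0;
    }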
src/intel/compiler/brw_fs.cpp
index 6a8bd03..839e5d1 100644 (file)
@@ -720,7 +720,8 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
-             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
+             src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
       /* Texture coordinates. */
       if (i == TEX_LOGICAL_SRC_COORDINATE)
          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
@@ -1085,6 +1086,28 @@ fs_inst::implied_mrf_writes() const
    }
 }
 
+bool
+fs_inst::has_sampler_residency() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+      assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
+      return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
+   default:
+      return false;
+   }
+}
+
 fs_reg
 fs_visitor::vgrf(const glsl_type *const type)
 {
@@ -5488,46 +5511,68 @@ emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
 
    /* Specified channel group from the destination region. */
    const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
-   const unsigned dst_size = inst->size_written /
-      inst->dst.component_size(inst->exec_size);
 
-   if (needs_dst_copy(lbld_after, inst)) {
-      const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
+   if (!needs_dst_copy(lbld_after, inst)) {
+      /* No need to allocate a temporary for the lowered instruction, just
+       * take the right group of channels from the original region.
+       */
+      return dst;
+   }
+
+   /* Deal with the residency data part later */
+   const unsigned residency_size = inst->has_sampler_residency() ? REG_SIZE : 0;
+   const unsigned dst_size = (inst->size_written - residency_size) /
+      inst->dst.component_size(inst->exec_size);
 
-      if (inst->predicate) {
-         /* Handle predication by copying the original contents of
-          * the destination into the temporary before emitting the
-          * lowered instruction.
-          */
-         const fs_builder gbld_before =
-            lbld_before.group(MIN2(lbld_before.dispatch_width(),
-                                   inst->exec_size), 0);
-         for (unsigned k = 0; k < dst_size; ++k) {
-            gbld_before.MOV(offset(tmp, lbld_before, k),
-                            offset(dst, inst->exec_size, k));
-         }
-      }
+   const fs_reg tmp = lbld_after.vgrf(inst->dst.type,
+                                      dst_size + inst->has_sampler_residency());
 
-      const fs_builder gbld_after =
-         lbld_after.group(MIN2(lbld_after.dispatch_width(),
-                               inst->exec_size), 0);
+   if (inst->predicate) {
+      /* Handle predication by copying the original contents of the
+       * destination into the temporary before emitting the lowered
+       * instruction.
+       */
+      const fs_builder gbld_before =
+         lbld_before.group(MIN2(lbld_before.dispatch_width(),
+                                inst->exec_size), 0);
       for (unsigned k = 0; k < dst_size; ++k) {
-         /* Use a builder of the right width to perform the copy avoiding
-          * uninitialized data if the lowered execution size is greater than
-          * the original execution size of the instruction.
-          */
-         gbld_after.MOV(offset(dst, inst->exec_size, k),
-                        offset(tmp, lbld_after, k));
+         gbld_before.MOV(offset(tmp, lbld_before, k),
+                         offset(dst, inst->exec_size, k));
       }
+   }
 
-      return tmp;
+   const fs_builder gbld_after =
+      lbld_after.group(MIN2(lbld_after.dispatch_width(),
+                            inst->exec_size), 0);
+   for (unsigned k = 0; k < dst_size; ++k) {
+      /* Use a builder of the right width to perform the copy avoiding
+       * uninitialized data if the lowered execution size is greater than the
+       * original execution size of the instruction.
+       */
+      gbld_after.MOV(offset(dst, inst->exec_size, k),
+                     offset(tmp, lbld_after, k));
+   }
 
-   } else {
-      /* No need to allocate a temporary for the lowered instruction, just
-       * take the right group of channels from the original region.
+   if (inst->has_sampler_residency()) {
+      /* Sampler messages with residency need special attention. The first
+       * lane of the last component holds the Pixel Null Mask (bits 0:15)
+       * plus some upper bits we need to discard (bits 16:31). We have to
+       * build a single 32-bit value for the SIMD32 message out of two
+       * SIMD16 16-bit values.
        */
-      return dst;
+      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
+      fs_reg local_res_reg = component(
+         retype(offset(tmp, lbld_before, dst_size),
+                BRW_REGISTER_TYPE_UW), 0);
+      fs_reg final_res_reg =
+         retype(byte_offset(inst->dst,
+                            inst->size_written - residency_size +
+                            gbld_after.group() / 8),
+                BRW_REGISTER_TYPE_UW);
+      rbld.MOV(final_res_reg, local_res_reg);
    }
+
+   return tmp;
 }
 
 bool
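To make the residency path in emit_zip() above more concrete: each SIMD16 half
of a lowered SIMD32 message returns its 16-bit Pixel Null Mask in lane 0 of
its trailing register, and the UW copy at byte offset group/8 packs the two
halves into one 32-bit mask at the start of the final register. The standalone
C sketch below reproduces only that packing; REG_SIZE and the offsets come
from the diff, the mask values are made up, and a little-endian host is
assumed.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define REG_SIZE 32 /* bytes per GRF */

    int main(void)
    {
       /* Trailing residency register of the SIMD32 destination. */
       uint8_t residency_grf[REG_SIZE] = {0};

       /* Lane 0 of each SIMD16 half's trailing register: Pixel Null Mask in
        * bits 0:15, garbage in bits 16:31 that the UW copy discards. */
       const uint32_t half_ret[2] = { 0xdead5a5a, 0xbeefa5a5 };

       for (unsigned group = 0; group < 32; group += 16) {
          const uint16_t mask16 = (uint16_t)half_ret[group / 16];
          /* byte_offset(..., group / 8): group 0 -> byte 0, group 16 -> byte 2 */
          memcpy(&residency_grf[group / 8], &mask16, sizeof(mask16));
       }

       uint32_t mask32;
       memcpy(&mask32, residency_grf, sizeof(mask32));
       assert(mask32 == ((0xa5a5u << 16) | 0x5a5au));
       printf("SIMD32 Pixel Null Mask dword: 0x%08x\n", mask32);
       return 0;
    }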
@@ -5553,7 +5598,10 @@ fs_visitor::lower_simd_width()
           * original or the lowered instruction, whichever is lower.
           */
          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
-         const unsigned dst_size = inst->size_written /
+         const unsigned residency_size =
+            inst->has_sampler_residency() ? REG_SIZE : 0;
+         const unsigned dst_size =
+            (inst->size_written - residency_size) /
             inst->dst.component_size(inst->exec_size);
 
          assert(!inst->writes_accumulator && !inst->mlen);
@@ -5626,7 +5674,8 @@ fs_visitor::lower_simd_width()
             split_inst.dst = emit_zip(lbld.at(block, inst),
                                       lbld.at(block, after_inst), inst);
             split_inst.size_written =
-               split_inst.dst.component_size(lower_width) * dst_size;
+               split_inst.dst.component_size(lower_width) * dst_size +
+               residency_size;
 
             lbld.at(block, inst->next).emit(split_inst);
          }
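A worked example of the size accounting added to lower_simd_width() above:
because size_written includes the trailing residency register, the component
count can only be recovered by subtracting that register first, and each split
instruction then adds one full residency GRF of its own back. This is a
standalone sketch assuming 32-bit texel components and a 32-byte GRF, not Mesa
code.

    #include <assert.h>
    #include <stdio.h>

    #define REG_SIZE 32 /* bytes per GRF */

    /* component_size(exec_size) for a 32-bit typed destination. */
    static unsigned component_size(unsigned exec_size) { return exec_size * 4; }

    int main(void)
    {
       const unsigned exec_size = 32, lower_width = 16;
       const unsigned components = 4;            /* vec4 texel data */
       const unsigned residency_size = REG_SIZE; /* sparse: one extra GRF */

       /* Original SIMD32 instruction: 4 components plus the residency GRF. */
       const unsigned size_written =
          components * component_size(exec_size) + residency_size;

       /* Strip the residency register before dividing, otherwise dst_size
        * would come out as a fractional 4.25 components. */
       const unsigned dst_size =
          (size_written - residency_size) / component_size(exec_size);
       assert(dst_size == components);

       /* Each SIMD16 split writes its own copy of the residency register. */
       const unsigned split_size_written =
          dst_size * component_size(lower_width) + residency_size;

       printf("dst_size=%u, split size_written=%u bytes (%u GRFs)\n",
              dst_size, split_size_written, split_size_written / REG_SIZE);
       return 0;
    }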
src/intel/compiler/brw_fs_nir.cpp
index 243a13a..5ee7fe3 100644 (file)
@@ -3252,6 +3252,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
+   srcs[TEX_LOGICAL_SRC_RESIDENCY]        = brw_imm_ud(0);
 
    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
@@ -4500,6 +4501,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
+      srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
 
       /* Since the image size is always uniform, we can just emit a SIMD8
        * query instruction and splat the result out.
@@ -6446,6 +6448,19 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 {
    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
 
+   /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
+    *
+    *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
+    *     Enable will be incorrect for sample_c when applied to a surface with
+    *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null Mask
+    *     Enable may incorrectly report pixels as referencing a Null surface."
+    *
+    * We'll take care of this in NIR.
+    */
+   assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
+
+   srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse);
+
    int lod_components = 0;
 
    /* The hardware requires a LOD for buffer textures */
@@ -6700,7 +6715,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       }
    }
 
-   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
+   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
    inst->offset = header_bits;
 
@@ -6710,10 +6725,17 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       assert(instr->dest.is_ssa);
       unsigned write_mask = nir_ssa_def_components_read(&instr->dest.ssa);
       assert(write_mask != 0); /* dead code should have been eliminated */
-      inst->size_written = util_last_bit(write_mask) *
-                           inst->dst.component_size(inst->exec_size);
+      if (instr->is_sparse) {
+         inst->size_written = (util_last_bit(write_mask) - 1) *
+                              inst->dst.component_size(inst->exec_size) +
+                              REG_SIZE;
+      } else {
+         inst->size_written = util_last_bit(write_mask) *
+                              inst->dst.component_size(inst->exec_size);
+      }
    } else {
-      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+      inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
+                           (instr->is_sparse ? REG_SIZE : 0);
    }
 
    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
@@ -6748,6 +6770,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
    }
 
+   /* The residency bits are only in the first component. */
+   if (instr->is_sparse)
+      nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
+
    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
 }
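The size_written arithmetic in nir_emit_texture() above treats the residency
code specially: however many texel components NIR actually reads, the
residency part always adds exactly one GRF, and only component 0 of that
register carries meaningful bits, which is why the last nir_dest entry is
taken with component(..., 0). Below is a standalone sketch of the arithmetic
only, assuming SIMD16 and 32-bit texel data, with util_last_bit re-implemented
locally.

    #include <stdio.h>

    #define REG_SIZE 32 /* bytes per GRF */

    /* 1-based index of the highest set bit (0 if mask == 0). */
    static unsigned util_last_bit(unsigned mask)
    {
       unsigned last = 0;
       while (mask) {
          last++;
          mask >>= 1;
       }
       return last;
    }

    int main(void)
    {
       const unsigned exec_size = 16;
       const unsigned component_size = exec_size * 4; /* 32-bit texel data */

       /* Sparse vec5 result where NIR reads .xy plus the residency code
        * (components 0, 1 and 4) -> write_mask = 0b10011. */
       const unsigned write_mask = 0x13;
       const unsigned last = util_last_bit(write_mask);

       /* The residency register is one GRF regardless of SIMD width, so it
        * is counted separately from the texel components. */
       const unsigned size_written = (last - 1) * component_size + REG_SIZE;

       printf("last component = %u, size_written = %u bytes (%u GRFs)\n",
              last, size_written, size_written / REG_SIZE);
       return 0;
    }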
 
src/intel/compiler/brw_fs_visitor.cpp
index f713e9f..1e20f11 100644 (file)
@@ -49,6 +49,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
    srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
+   srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
 
    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                             ARRAY_SIZE(srcs));
src/intel/compiler/brw_ir.h
index cfe30f6..d792d6a 100644 (file)
 #include "compiler/glsl/list.h"
 
 #define MAX_SAMPLER_MESSAGE_SIZE 11
-#define MAX_VGRF_SIZE 16
+
+/* The sampler can return a vec5 when sampling with sparse residency. In
+ * SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
+ * VGRFs to hold the result.
+ */
+#define MAX_VGRF_SIZE 20
 
 #ifdef __cplusplus
 struct backend_reg : private brw_reg
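The MAX_VGRF_SIZE bump follows directly from the arithmetic in the new
comment. A standalone check, assuming 32-bit components and the 32-byte GRF
size used throughout the backend:

    #include <assert.h>
    #include <stdio.h>

    #define REG_SIZE 32 /* bytes per GRF */

    int main(void)
    {
       const unsigned exec_size = 32;     /* SIMD32 */
       const unsigned type_size = 4;      /* 32-bit texel components */
       const unsigned components = 4 + 1; /* vec4 texel data + residency */

       /* One SIMD32 32-bit component spans 4 GRFs, so the vec5 destination
        * allocated for a sparse sample needs 5 * 4 = 20 GRFs. */
       const unsigned grfs_per_component = exec_size * type_size / REG_SIZE;
       const unsigned vgrf_size = components * grfs_per_component;

       assert(grfs_per_component == 4);
       assert(vgrf_size == 20); /* hence MAX_VGRF_SIZE goes from 16 to 20 */
       printf("sparse SIMD32 sample destination: %u GRFs\n", vgrf_size);
       return 0;
    }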
src/intel/compiler/brw_ir_fs.h
index 06a2346..c7215ca 100644 (file)
@@ -413,6 +413,12 @@ public:
     */
    unsigned flags_written(const intel_device_info *devinfo) const;
 
+   /**
+    * Return true if this instruction is a sampler message gathering residency
+    * data.
+    */
+   bool has_sampler_residency() const;
+
    fs_reg dst;
    fs_reg *src;
 
src/intel/compiler/brw_lower_logical_sends.cpp
index d86a902..e247ce8 100644 (file)
@@ -806,7 +806,8 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &tg4_offset,
                                 unsigned payload_type_bit_size,
                                 unsigned coord_components,
-                                unsigned grad_components)
+                                unsigned grad_components,
+                                bool residency)
 {
    const brw_compiler *compiler = bld.shader->compiler;
    const intel_device_info *devinfo = bld.shader->devinfo;
@@ -830,7 +831,8 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
        inst->offset != 0 || inst->eot ||
        op == SHADER_OPCODE_SAMPLEINFO ||
        sampler_handle.file != BAD_FILE ||
-       is_high_sampler(devinfo, sampler)) {
+       is_high_sampler(devinfo, sampler) ||
+       residency) {
       /* For general texture offsets (no txf workaround), we need a header to
        * put them in.
        *
@@ -847,12 +849,16 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
        * and we have an explicit header, we need to set up the sampler
        * writemask.  It's reversed from normal: 1 means "don't write".
        */
-      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
-         assert(regs_written(inst) % reg_width == 0);
-         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
+      unsigned reg_count = regs_written(inst) - residency;
+      if (!inst->eot && reg_count < 4 * reg_width) {
+         assert(reg_count % reg_width == 0);
+         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
          inst->offset |= mask << 12;
       }
 
+      if (residency)
+         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
+
       /* Build the actual header */
       const fs_builder ubld = bld.exec_all().group(8, 0);
       const fs_builder ubld1 = ubld.group(1, 0);
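The header changes above do two things: the response-length writemask is now
derived from the texel registers only, so a full vec4-plus-residency reply is
not mistaken for a partial write, and bit 23 of g0.2 enables the Pixel Null
Mask. A standalone sketch of just the bit arithmetic, using a SIMD16 vec4
sample with residency as the example; "offset" stands in for inst->offset.

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
       const unsigned reg_width = 2;                    /* SIMD16 */
       const unsigned regs_written = 4 * reg_width + 1; /* vec4 + residency */
       const unsigned residency = 1;

       unsigned offset = 0; /* stand-in for inst->offset */

       /* Subtract the residency register before deriving the writemask; the
        * mask is reversed from normal, 1 means "don't write" that channel. */
       const unsigned reg_count = regs_written - residency;
       if (reg_count < 4 * reg_width) {
          assert(reg_count % reg_width == 0);
          const unsigned mask = ~((1u << (reg_count / reg_width)) - 1) & 0xf;
          offset |= mask << 12;
       }

       /* g0.2 bit 23: Pixel Null Mask Enable. */
       if (residency)
          offset |= 1u << 23;

       /* Full vec4 reply: no channels masked off, only residency enabled. */
       assert(offset == (1u << 23));
       printf("header offset bits: 0x%08x\n", offset);
       return 0;
    }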
@@ -1301,6 +1307,10 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
+   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
+   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
+   /* residency is only supported on Gfx8+ */
+   assert(!residency || devinfo->ver >= 8);
 
    if (devinfo->ver >= 7) {
       const unsigned msg_payload_type_bit_size =
@@ -1316,7 +1326,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
                                       surface_handle, sampler_handle,
                                       tg4_offset,
                                       msg_payload_type_bit_size,
-                                      coord_components, grad_components);
+                                      coord_components, grad_components,
+                                      residency);
    } else if (devinfo->ver >= 5) {
       lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,