intel/fs: Make per-sample and coarse dispatch tri-state
authorJason Ekstrand <jason@jlekstrand.net>
Fri, 19 Nov 2021 22:32:24 +0000 (16:32 -0600)
committerMarge Bot <emma+marge@anholt.net>
Mon, 6 Feb 2023 09:12:18 +0000 (09:12 +0000)
Whenever one of them is BRW_SOMETIMES, we depend on dynamic flag pushed
in as a push constant.  In this case, we have to often have to do the
calculation both ways and SEL the result.  It's a bit more code but
decouples MSAA from the shader key.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21094>

12 files changed:
src/gallium/drivers/crocus/crocus_state.c
src/gallium/drivers/iris/iris_state.c
src/intel/blorp/blorp_genX_exec.h
src/intel/common/intel_genX_state.h
src/intel/compiler/brw_compiler.h
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs.h
src/intel/compiler/brw_fs_visitor.cpp
src/intel/compiler/brw_lower_logical_sends.cpp
src/intel/vulkan/genX_pipeline.c
src/intel/vulkan/gfx8_cmd_buffer.c
src/intel/vulkan_hasvk/genX_pipeline.c

index c270a2b..54b9b8c 100644 (file)
@@ -6449,7 +6449,8 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
 
          intel_set_ps_dispatch_state(&ps, &batch->screen->devinfo,
                                      wm_prog_data,
-                                     ice->state.framebuffer.samples);
+                                     ice->state.framebuffer.samples,
+                                     0 /* msaa_flags */);
 
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
             brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
@@ -6517,7 +6518,8 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
          psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
          psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
          psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
-         psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
+         psx.PixelShaderIsPerSample =
+            brw_wm_prog_data_is_persample(wm_prog_data, 0);
 
          /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
          if (wm_prog_data->uses_sample_mask)
@@ -7291,7 +7293,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
             else
                wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
 
-            if (wm_prog_data->persample_dispatch)
+            if (brw_wm_prog_data_is_persample(wm_prog_data, 0))
                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
             else
                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
index c73c6c7..666763f 100644 (file)
@@ -4820,7 +4820,8 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
       psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
       psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
       psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
-      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
+      psx.PixelShaderIsPerSample =
+         brw_wm_prog_data_is_persample(wm_prog_data, 0);
       psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
 
 #if GFX_VER >= 9
@@ -6274,7 +6275,8 @@ iris_upload_dirty_render_state(struct iris_context *ice,
             uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
             _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
                intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
-                                           wm_prog_data, cso_fb->samples);
+                                           wm_prog_data, cso_fb->samples,
+                                           0 /* msaa_flags */);
 
                ps.DispatchGRFStartRegisterForConstantSetupData0 =
                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
index 8e791b6..1ccecd7 100644 (file)
@@ -899,7 +899,8 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 
       if (prog_data) {
          intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
-                                     params->num_samples);
+                                     params->num_samples,
+                                     0 /* msaa_flags */);
 
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
             brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
@@ -981,7 +982,8 @@ blorp_emit_ps_config(struct blorp_batch *batch,
 
       if (prog_data) {
          intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
-                                     params->num_samples);
+                                     params->num_samples,
+                                     0 /* msaa_flags */);
 
          ps.DispatchGRFStartRegisterForConstantSetupData0 =
             brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
index dd6fbe9..69ccbd6 100644 (file)
@@ -43,7 +43,8 @@ static inline void
 intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
                             const struct intel_device_info *devinfo,
                             const struct brw_wm_prog_data *prog_data,
-                            unsigned rasterization_samples)
+                            unsigned rasterization_samples,
+                            enum brw_wm_msaa_flags msaa_flags)
 {
    assert(rasterization_samples != 0);
 
@@ -72,7 +73,10 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
       enable_8 = false;
 #endif
 
-   if (prog_data->persample_dispatch) {
+   const bool is_persample_dispatch =
+      brw_wm_prog_data_is_persample(prog_data, msaa_flags);
+
+   if (is_persample_dispatch) {
       /* TGL PRMs, Volume 2d: Command Reference: Structures:
        *    3DSTATE_PS_BODY::32 Pixel Dispatch Enable:
        *
@@ -108,8 +112,7 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
     *
     * 16x MSAA only exists on Gfx9+, so we can skip this on Gfx8.
     */
-   if (GFX_VER >= 9 && rasterization_samples == 16 &&
-       !prog_data->persample_dispatch) {
+   if (GFX_VER >= 9 && rasterization_samples == 16 && !is_persample_dispatch) {
       assert(enable_8 || enable_16);
       enable_32 = false;
    }
index e5604f0..8bcbe42 100644 (file)
@@ -28,6 +28,7 @@
 #include "c11/threads.h"
 #include "dev/intel_device_info.h"
 #include "util/macros.h"
+#include "util/enum_operators.h"
 #include "util/ralloc.h"
 #include "util/u_math.h"
 #include "brw_isa_info.h"
@@ -464,6 +465,12 @@ enum brw_sometimes {
    BRW_ALWAYS
 };
 
+static inline enum brw_sometimes
+brw_sometimes_invert(enum brw_sometimes x)
+{
+   return (enum brw_sometimes)((int)BRW_ALWAYS - (int)x);
+}
+
 /** The program key for Fragment/Pixel Shaders. */
 struct brw_wm_prog_key {
    struct brw_base_prog_key base;
@@ -833,6 +840,29 @@ enum brw_pixel_shader_computed_depth_mode {
    BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
 };
 
+enum brw_wm_msaa_flags {
+   /** Must be set whenever any dynamic MSAA is used
+    *
+    * This flag mostly exists to let us assert that the driver understands
+    * dynamic MSAA so we don't run into trouble with drivers that don't.
+    */
+   BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC = (1 << 0),
+
+   /** True if the framebuffer is multisampled */
+   BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO = (1 << 1),
+
+   /** True if this shader has been dispatched per-sample */
+   BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2),
+
+   /** True if this shader has been dispatched coarse
+    *
+    * This is intentionally chose to be bit 18 to correspond to the coarse
+    * write bit in the FB write message descriptor.
+    */
+   BRW_WM_MSAA_FLAG_COARSE_DISPATCH = (1 << 18),
+};
+MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(enum brw_wm_msaa_flags)
+
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
  * corresponding to a different brw_wm_prog_key struct, with different
@@ -872,7 +902,6 @@ struct brw_wm_prog_data {
    bool dispatch_16;
    bool dispatch_32;
    bool dual_src_blend;
-   bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
@@ -888,10 +917,23 @@ struct brw_wm_prog_data {
    bool contains_flat_varying;
    bool contains_noperspective_varying;
 
+   /** True if the shader wants sample shading
+    *
+    * This corresponds to whether or not a gl_SampleId, gl_SamplePosition, or
+    * a sample-qualified input are used in the shader.  It is independent of
+    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
+    */
+   bool sample_shading;
+
+   /** Should this shader be dispatched per-sample */
+   enum brw_sometimes persample_dispatch;
+
    /**
     * Shader is ran at the coarse pixel shading dispatch rate (3DSTATE_CPS).
     */
-   bool per_coarse_pixel_dispatch;
+   enum brw_sometimes coarse_pixel_dispatch;
+
+   unsigned msaa_flags_param;
 
    /**
     * Mask of which interpolation modes are required by the fragment shader.
@@ -1023,6 +1065,50 @@ _brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data,
    _brw_wm_prog_data_reg_blocks(prog_data, \
       brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
 
+static inline bool
+brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data,
+                              enum brw_wm_msaa_flags pushed_msaa_flags)
+{
+   if (pushed_msaa_flags & BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC) {
+      if (!(pushed_msaa_flags & BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO))
+         return false;
+
+      if (prog_data->sample_shading)
+         assert(pushed_msaa_flags & BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
+
+      if (pushed_msaa_flags & BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH)
+         assert(prog_data->persample_dispatch != BRW_NEVER);
+      else
+         assert(prog_data->persample_dispatch != BRW_ALWAYS);
+
+      return (pushed_msaa_flags & BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
+   }
+
+   assert(prog_data->persample_dispatch == BRW_ALWAYS ||
+          prog_data->persample_dispatch == BRW_NEVER);
+
+   return prog_data->persample_dispatch;
+}
+
+static inline bool
+brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data,
+                           enum brw_wm_msaa_flags pushed_msaa_flags)
+{
+   if (pushed_msaa_flags & BRW_WM_MSAA_FLAG_ENABLE_DYNAMIC) {
+      if (pushed_msaa_flags & BRW_WM_MSAA_FLAG_COARSE_DISPATCH)
+         assert(prog_data->coarse_pixel_dispatch != BRW_NEVER);
+      else
+         assert(prog_data->coarse_pixel_dispatch != BRW_ALWAYS);
+
+      return pushed_msaa_flags & BRW_WM_MSAA_FLAG_COARSE_DISPATCH;
+   }
+
+   assert(prog_data->coarse_pixel_dispatch == BRW_ALWAYS ||
+          prog_data->coarse_pixel_dispatch == BRW_NEVER);
+
+   return prog_data->coarse_pixel_dispatch;
+}
+
 struct brw_push_const_block {
    unsigned dwords;     /* Dword count, not reg aligned */
    unsigned regs;
index e1bacb0..98aab82 100644 (file)
@@ -1220,7 +1220,7 @@ fs_visitor::emit_samplepos_setup()
    const fs_builder abld = bld.annotate("compute sample position");
    fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
-   if (!wm_prog_data->persample_dispatch) {
+   if (wm_prog_data->persample_dispatch == BRW_NEVER) {
       /* From ARB_sample_shading specification:
        * "When rendering to a non-multisample buffer, or if multisample
        *  rasterization is disabled, gl_SamplePosition will always be
@@ -1255,6 +1255,16 @@ fs_visitor::emit_samplepos_setup()
       abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
    }
 
+   if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
+      check_dynamic_msaa_flag(abld, wm_prog_data,
+                              BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
+      for (unsigned i = 0; i < 2; i++) {
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
+                               brw_imm_f(0.5f)));
+      }
+   }
+
    return pos;
 }
 
@@ -1369,12 +1379,12 @@ fs_visitor::emit_samplemaskin_setup()
    assert(devinfo->ver >= 6);
 
    /* The HW doesn't provide us with expected values. */
-   assert(!wm_prog_data->per_coarse_pixel_dispatch);
+   assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);
 
    fs_reg coverage_mask =
       fetch_payload_reg(bld, fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);
 
-   if (!wm_prog_data->persample_dispatch)
+   if (wm_prog_data->persample_dispatch == BRW_NEVER)
       return coverage_mask;
 
    /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
@@ -1399,6 +1409,13 @@ fs_visitor::emit_samplemaskin_setup()
    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
    abld.AND(mask, enabled_mask, coverage_mask);
 
+   if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
+      return mask;
+
+   check_dynamic_msaa_flag(abld, wm_prog_data,
+                           BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH);
+   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
+
    return mask;
 }
 
@@ -1413,7 +1430,7 @@ fs_visitor::emit_shading_rate_setup()
    /* Coarse pixel shading size fields overlap with other fields of not in
     * coarse pixel dispatch mode, so report 0 when that's not the case.
     */
-   if (!wm_prog_data->per_coarse_pixel_dispatch)
+   if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
       return brw_imm_ud(0);
 
    const fs_builder abld = bld.annotate("compute fragment shading rate");
@@ -1439,6 +1456,13 @@ fs_visitor::emit_shading_rate_setup()
    fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
    abld.OR(rate, int_rate_x, int_rate_y);
 
+   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
+      return rate;
+
+   check_dynamic_msaa_flag(abld, wm_prog_data,
+                           BRW_WM_MSAA_FLAG_COARSE_DISPATCH);
+   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
+
    return rate;
 }
 
@@ -7256,10 +7280,14 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
    prog_data->computed_stencil =
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
-   prog_data->persample_dispatch =
-      key->multisample_fbo &&
-      (key->persample_interp ||
-       shader->info.fs.uses_sample_shading);
+   prog_data->sample_shading =
+      shader->info.fs.uses_sample_shading ||
+      shader->info.outputs_read;
+
+   prog_data->persample_dispatch = BRW_NEVER;
+   if (key->multisample_fbo &&
+       (key->persample_interp || prog_data->sample_shading))
+      prog_data->persample_dispatch = BRW_ALWAYS;
 
    if (devinfo->ver >= 6) {
       prog_data->uses_sample_mask =
@@ -7274,7 +7302,8 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
        * persample dispatch, we hard-code it to 0.5.
        */
-      prog_data->uses_pos_offset = prog_data->persample_dispatch &&
+      prog_data->uses_pos_offset =
+         prog_data->persample_dispatch != BRW_NEVER &&
          (BITSET_TEST(shader->info.system_values_read,
                       SYSTEM_VALUE_SAMPLE_POS) ||
           BITSET_TEST(shader->info.system_values_read,
@@ -7295,13 +7324,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
 
    /* You can't be coarse and per-sample */
    assert(!key->coarse_pixel || !key->persample_interp);
-   prog_data->per_coarse_pixel_dispatch =
-      key->coarse_pixel &&
-      !shader->info.fs.uses_sample_shading &&
-      !prog_data->uses_omask &&
-      !prog_data->uses_sample_mask &&
-      (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
-      !prog_data->computed_stencil;
+   prog_data->coarse_pixel_dispatch =
+      brw_sometimes_invert(prog_data->persample_dispatch);
+   if (!key->coarse_pixel ||
+       prog_data->uses_omask ||
+       prog_data->sample_shading ||
+       prog_data->uses_sample_mask ||
+       (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+       prog_data->computed_stencil) {
+      prog_data->coarse_pixel_dispatch = BRW_NEVER;
+   }
 
    /* We choose to always enable VMask prior to XeHP, as it would cause
     * us to lose out on the eliminate_find_live_channel() optimization.
@@ -7309,16 +7341,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
    prog_data->uses_vmask = devinfo->verx10 < 125 ||
                            shader->info.fs.needs_quad_helper_invocations ||
                            shader->info.fs.needs_all_helper_invocations ||
-                           prog_data->per_coarse_pixel_dispatch;
+                           prog_data->coarse_pixel_dispatch != BRW_NEVER;
 
    prog_data->uses_src_w =
       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
    prog_data->uses_src_depth =
       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
-      !prog_data->per_coarse_pixel_dispatch;
+      prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
    prog_data->uses_depth_w_coefficients =
       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
-      prog_data->per_coarse_pixel_dispatch;
+      prog_data->coarse_pixel_dispatch != BRW_NEVER;
 
    calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
    brw_compute_flat_inputs(prog_data, shader);
index 44b9cfa..b779601 100644 (file)
@@ -721,6 +721,24 @@ namespace brw {
       return tmp;
    }
 
+   inline fs_reg
+   dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
+   {
+      return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
+                    BRW_REGISTER_TYPE_UD);
+   }
+
+   inline void
+   check_dynamic_msaa_flag(const fs_builder &bld,
+                           const struct brw_wm_prog_data *wm_prog_data,
+                           enum brw_wm_msaa_flags flag)
+   {
+      fs_inst *inst = bld.AND(bld.null_reg_ud(),
+                              dynamic_msaa_flags(wm_prog_data),
+                              brw_imm_ud(flag));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   }
+
    bool
    lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
 }
index eb0e56c..c9ab1c4 100644 (file)
@@ -291,10 +291,10 @@ fs_visitor::emit_interpolation_setup_gfx6()
 
    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
 
-   fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
-   fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
-   fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
-   if (!wm_prog_data->per_coarse_pixel_dispatch) {
+   fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
+   fs_reg int_sample_offset_xy; /* Used on Gen8+ */
+   fs_reg half_int_sample_offset_x, half_int_sample_offset_y;
+   if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
       /* The thread payload only delivers subspan locations (ss0, ss1,
        * ss2, ...). Since subspans covers 2x2 pixels blocks, we need to
        * generate 4 pixel coordinates out of each subspan location. We do this
@@ -324,9 +324,9 @@ fs_visitor::emit_interpolation_setup_gfx6()
        * coordinates out of 2 subspans coordinates in a single ADD instruction
        * (twice the operation above).
        */
-      int_pixel_offset_xy = fs_reg(brw_imm_v(0x11001010));
-      half_int_pixel_offset_x = fs_reg(brw_imm_uw(0));
-      half_int_pixel_offset_y = fs_reg(brw_imm_uw(0));
+      int_sample_offset_xy = fs_reg(brw_imm_v(0x11001010));
+      half_int_sample_offset_x = fs_reg(brw_imm_uw(0));
+      half_int_sample_offset_y = fs_reg(brw_imm_uw(0));
       /* On Gfx12.5, because of regioning restrictions, the interpolation code
        * is slightly different and works off X & Y only inputs. The ordering
        * of the half bytes here is a bit odd, with each subspan replicated
@@ -336,9 +336,14 @@ fs_visitor::emit_interpolation_setup_gfx6()
        *  X offset:    0      0      1      0      0      0      1      0
        *  Y offset:    0      0      0      0      1      0      1      0
        */
-      int_pixel_offset_x = fs_reg(brw_imm_v(0x01000100));
-      int_pixel_offset_y = fs_reg(brw_imm_v(0x01010000));
-   } else {
+      int_sample_offset_x = fs_reg(brw_imm_v(0x01000100));
+      int_sample_offset_y = fs_reg(brw_imm_v(0x01010000));
+   }
+
+   fs_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
+   fs_reg int_coarse_offset_xy; /* Used on Gen8+ */
+   fs_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
+   if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
       /* In coarse pixel dispatch we have to do the same ADD instruction that
        * we do in normal per pixel dispatch, except this time we're not adding
        * 1 in each direction, but instead the coarse pixel size.
@@ -354,34 +359,84 @@ fs_visitor::emit_interpolation_setup_gfx6()
          /* To build the array of half bytes we do and AND operation with the
           * right mask in X.
           */
-         int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
-         dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
+         int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+         dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
 
          /* And the right mask in Y. */
-         int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
-         dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
+         int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+         dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
       } else {
          /* To build the array of half bytes we do and AND operation with the
           * right mask in X.
           */
-         int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
-         dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
+         int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+         dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
 
          /* And the right mask in Y. */
-         int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
-         dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
+         int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+         dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
 
          /* Finally OR the 2 registers. */
-         int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
-         dbld.OR(int_pixel_offset_xy, int_pixel_offset_x, int_pixel_offset_y);
+         int_coarse_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+         dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
       }
 
-      /* Also compute the half pixel size used to center pixels. */
-      half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
-      half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      /* Also compute the half coarse size used to center coarses. */
+      half_int_coarse_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      half_int_coarse_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
+
+      bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
+      bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
+   }
+
+   fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
+   fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
+   fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
+   switch (wm_prog_data->coarse_pixel_dispatch) {
+   case BRW_NEVER:
+#define COPY_OFFSET_REG(prefix, suffix) \
+  prefix##_pixel_##suffix = prefix##_sample_##suffix;
+
+      COPY_OFFSET_REG(int, offset_x)
+      COPY_OFFSET_REG(int, offset_y)
+      COPY_OFFSET_REG(int, offset_xy)
+      COPY_OFFSET_REG(half_int, offset_x)
+      COPY_OFFSET_REG(half_int, offset_y)
+
+#undef COPY_OFFSET_REG
+      break;
+
+   case BRW_SOMETIMES:
+      check_dynamic_msaa_flag(bld, wm_prog_data,
+                              BRW_WM_MSAA_FLAG_COARSE_DISPATCH);
+
+#define COPY_OFFSET_REG(prefix, suffix) \
+   prefix##_pixel_##suffix = bld.vgrf(BRW_REGISTER_TYPE_UW); \
+   bld.SEL(prefix##_pixel_##suffix, \
+           prefix##_coarse_##suffix, \
+           prefix##_pixel_##suffix); \
+
+      COPY_OFFSET_REG(int, offset_x)
+      COPY_OFFSET_REG(int, offset_y)
+      COPY_OFFSET_REG(int, offset_xy)
+      COPY_OFFSET_REG(half_int, offset_x)
+      COPY_OFFSET_REG(half_int, offset_y)
 
-      bld.SHR(half_int_pixel_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
-      bld.SHR(half_int_pixel_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
+#undef COPY_OFFSET_REG
+      break;
+
+   case BRW_ALWAYS:
+#define COPY_OFFSET_REG(prefix, suffix) \
+  prefix##_pixel_##suffix = prefix##_coarse_##suffix;
+
+      COPY_OFFSET_REG(int, offset_x)
+      COPY_OFFSET_REG(int, offset_y)
+      COPY_OFFSET_REG(int, offset_xy)
+      COPY_OFFSET_REG(half_int, offset_x)
+      COPY_OFFSET_REG(half_int, offset_y)
+
+#undef COPY_OFFSET_REG
+      break;
    }
 
    for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
@@ -401,11 +456,15 @@ fs_visitor::emit_interpolation_setup_gfx6()
                   fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
                   int_pixel_offset_y);
 
-         if (wm_prog_data->per_coarse_pixel_dispatch) {
-            dbld.ADD(int_pixel_x, int_pixel_x,
-                     horiz_stride(half_int_pixel_offset_x, 0));
-            dbld.ADD(int_pixel_y, int_pixel_y,
-                     horiz_stride(half_int_pixel_offset_y, 0));
+         if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
+            fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
+                                     horiz_stride(half_int_pixel_offset_x, 0));
+            fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
+                                     horiz_stride(half_int_pixel_offset_y, 0));
+            if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
+               addx->predicate = BRW_PREDICATE_NORMAL;
+               addy->predicate = BRW_PREDICATE_NORMAL;
+            }
          }
 
          hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
@@ -463,8 +522,8 @@ fs_visitor::emit_interpolation_setup_gfx6()
    }
 
    abld = bld.annotate("compute pos.z");
+   fs_reg coarse_z;
    if (wm_prog_data->uses_depth_w_coefficients) {
-      assert(!wm_prog_data->uses_src_depth);
       /* In coarse pixel mode, the HW doesn't interpolate Z coordinate
        * properly. In the same way we have to add the coarse pixel size to
        * pixels locations, here we recompute the Z value with 2 coefficients
@@ -501,9 +560,9 @@ fs_visitor::emit_interpolation_setup_gfx6()
       abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
       abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
 
-      this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
-      abld.MAD(this->pixel_z, z_c0, z_cx, float_pixel_x);
-      abld.MAD(this->pixel_z, this->pixel_z, z_cy, float_pixel_y);
+      coarse_z = abld.vgrf(BRW_REGISTER_TYPE_F);
+      abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
+      abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
    }
 
    if (wm_prog_data->uses_src_depth) {
@@ -511,6 +570,34 @@ fs_visitor::emit_interpolation_setup_gfx6()
       this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);
    }
 
+   if (wm_prog_data->uses_depth_w_coefficients ||
+       wm_prog_data->uses_src_depth) {
+      fs_reg sample_z = this->pixel_z;
+
+      switch (wm_prog_data->coarse_pixel_dispatch) {
+      case BRW_NEVER:
+         assert(wm_prog_data->uses_src_depth);
+         assert(!wm_prog_data->uses_depth_w_coefficients);
+         this->pixel_z = sample_z;
+         break;
+
+      case BRW_SOMETIMES:
+         assert(wm_prog_data->uses_src_depth);
+         assert(wm_prog_data->uses_depth_w_coefficients);
+         this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
+
+         /* We re-use the check_dynamic_msaa_flag() call from above */
+         abld.SEL(this->pixel_z, coarse_z, sample_z);
+         break;
+
+      case BRW_ALWAYS:
+         assert(!wm_prog_data->uses_src_depth);
+         assert(wm_prog_data->uses_depth_w_coefficients);
+         this->pixel_z = coarse_z;
+         break;
+      }
+   }
+
    if (wm_prog_data->uses_src_w) {
       abld = bld.annotate("compute pos.w");
       this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg);
index 9a2b2f6..1faae68 100644 (file)
@@ -359,7 +359,18 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
       inst->desc =
          (inst->group / 16) << 11 | /* rt slot group */
          brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
-                           prog_data->per_coarse_pixel_dispatch);
+                           0 /* coarse_write */);
+
+      fs_reg desc = brw_imm_ud(0);
+      if (prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
+         inst->desc |= (1 << 18);
+      } else if (prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
+         STATIC_ASSERT(BRW_WM_MSAA_FLAG_COARSE_DISPATCH == (1 << 18));
+         const fs_builder &ubld = bld.exec_all().group(8, 0);
+         desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         ubld.AND(desc, dynamic_msaa_flags(prog_data),
+                  brw_imm_ud(BRW_WM_MSAA_FLAG_COARSE_DISPATCH));
+      }
 
       uint32_t ex_desc = 0;
       if (devinfo->ver >= 11) {
@@ -376,7 +387,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
       inst->opcode = SHADER_OPCODE_SEND;
       inst->resize_sources(3);
       inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
-      inst->src[0] = brw_imm_ud(0);
+      inst->src[0] = desc;
       inst->src[1] = brw_imm_ud(0);
       inst->src[2] = payload;
       inst->mlen = regs_written(load);
@@ -2456,8 +2467,6 @@ lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
    fs_reg payload = brw_vec8_grf(0, 0);
    unsigned mlen = 1;
 
-   const fs_reg desc = inst->src[1];
-
    unsigned mode;
    switch (inst->opcode) {
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -2480,11 +2489,33 @@ lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
       unreachable("Invalid interpolator instruction");
    }
 
+   fs_reg desc = inst->src[1];
    uint32_t desc_imm =
       brw_pixel_interp_desc(devinfo, mode, inst->pi_noperspective,
-                            wm_prog_data->per_coarse_pixel_dispatch,
+                            false /* coarse_pixel_rate */,
                             inst->exec_size, inst->group);
 
+   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
+      desc_imm |= (1 << 15);
+   } else if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
+      fs_reg orig_desc = desc;
+      const fs_builder &ubld = bld.exec_all().group(8, 0);
+      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+      ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
+               brw_imm_ud(BRW_WM_MSAA_FLAG_COARSE_DISPATCH));
+
+      /* The uniform is in bit 18 but we need it in bit 15 */
+      STATIC_ASSERT(BRW_WM_MSAA_FLAG_COARSE_DISPATCH == (1 << 18));
+      ubld.SHR(desc, desc, brw_imm_ud(3));
+
+   /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
+      if (orig_desc.file == IMM) {
+         desc_imm |= orig_desc.ud;
+      } else {
+         ubld.OR(desc, desc, orig_desc);
+      }
+   }
+
    assert(bld.shader->devinfo->ver >= 7);
    inst->opcode = SHADER_OPCODE_SEND;
    inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
index 132dd9f..983d4f5 100644 (file)
@@ -1526,7 +1526,8 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
 
    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
-                                  ms != NULL ? ms->rasterization_samples : 1);
+                                  ms != NULL ? ms->rasterization_samples : 1,
+                                  0 /* msaa_flags */);
 
       ps.KernelStartPointer0 = fs_bin->kernel.offset +
                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
@@ -1581,7 +1582,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
       ps.PixelShaderValid              = true;
       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
-      ps.PixelShaderIsPerSample        = wm_prog_data->persample_dispatch;
+      ps.PixelShaderIsPerSample        =
+         brw_wm_prog_data_is_persample(wm_prog_data, 0);
       ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
       ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
       ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
@@ -1603,7 +1605,7 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
       assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
       if (!wm_prog_data->uses_sample_mask)
          ps.InputCoverageMaskState = ICMS_NONE;
-      else if (wm_prog_data->per_coarse_pixel_dispatch)
+      else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
          ps.InputCoverageMaskState  = ICMS_NORMAL;
       else if (wm_prog_data->post_depth_coverage)
          ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
@@ -1613,13 +1615,14 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
 #if GFX_VER >= 11
       ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
          wm_prog_data->uses_depth_w_coefficients;
-      ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
+      ps.PixelShaderIsPerCoarsePixel = wm_prog_data->coarse_pixel_dispatch;
 #endif
 #if GFX_VERx10 >= 125
       /* TODO: We should only require this when the last geometry shader uses
        *       a fragment shading rate that is not constant.
        */
-      ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch;
+      ps.EnablePSDependencyOnCPsizeChange =
+         brw_wm_prog_data_is_coarse(wm_prog_data, 0);
 #endif
    }
 }
index b41491b..c7b5af4 100644 (file)
@@ -315,7 +315,8 @@ genX(emit_shading_rate)(struct anv_batch *batch,
                         const struct vk_fragment_shading_rate_state *fsr)
 {
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
-   const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch;
+   const bool cps_enable = wm_prog_data &&
+      brw_wm_prog_data_is_coarse(wm_prog_data, 0);
 
 #if GFX_VER == 11
    anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) {
index f1a1141..a22df5b 100644 (file)
@@ -1605,7 +1605,7 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
          wm.PixelShaderKillsPixel;
 
       if (ms != NULL && ms->rasterization_samples > 1) {
-         if (wm_prog_data->persample_dispatch) {
+         if (brw_wm_prog_data_is_persample(wm_prog_data, 0)) {
             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
          } else {
             wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
@@ -1671,7 +1671,8 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
 
    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
-                                  ms != NULL ? ms->rasterization_samples : 1);
+                                  ms != NULL ? ms->rasterization_samples : 1,
+                                  0 /* msaa_flags */);
 
       ps.KernelStartPointer0 = fs_bin->kernel.offset +
                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
@@ -1739,7 +1740,8 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
       ps.PixelShaderValid              = true;
       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
-      ps.PixelShaderIsPerSample        = wm_prog_data->persample_dispatch;
+      ps.PixelShaderIsPerSample        =
+         brw_wm_prog_data_is_persample(wm_prog_data, 0);
       ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
       ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
       ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;