From ac303c5d5b39a7b1fd907bfd58bd649055a7a02f Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Tue, 16 Aug 2022 08:08:43 +0000
Subject: [PATCH] intel/fs: improve Wa_22013689345 workaround

The initial implementation is a pretty big hammer. Implement the HW
recommendation to minimize cases in which we need a fence.

This improves performance by 10 FPS on some of the Sascha Willems RT
demos.

Signed-off-by: Lionel Landwerlin
Fixes: 6031ad4bf690 ("intel/fs: Add Wa_22013689345")
Reviewed-by: Francisco Jerez
Part-of:

(cherry picked from commit 945637514e6e970fcc37745f509eec11ff3b5129)
---
 .pick_status.json             |  2 +-
 src/intel/compiler/brw_eu.h   | 37 +++++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.cpp | 32 ++++++++++++++++++++++++--------
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 9267b55..2b7ad4b 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -139,7 +139,7 @@
         "description": "intel/fs: improve Wa_22013689345 workaround",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "6031ad4bf690fe250d90063dec7e0269da5b3016"
     },
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index f078504..8b7bcae 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1203,6 +1203,43 @@ lsc_opcode_has_transpose(enum lsc_opcode opcode)
    return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE;
 }
 
+static inline bool
+lsc_opcode_is_store(enum lsc_opcode opcode)
+{
+   return opcode == LSC_OP_STORE ||
+          opcode == LSC_OP_STORE_CMASK;
+}
+
+static inline bool
+lsc_opcode_is_atomic(enum lsc_opcode opcode)
+{
+   switch (opcode) {
+   case LSC_OP_ATOMIC_INC:
+   case LSC_OP_ATOMIC_DEC:
+   case LSC_OP_ATOMIC_LOAD:
+   case LSC_OP_ATOMIC_STORE:
+   case LSC_OP_ATOMIC_ADD:
+   case LSC_OP_ATOMIC_SUB:
+   case LSC_OP_ATOMIC_MIN:
+   case LSC_OP_ATOMIC_MAX:
+   case LSC_OP_ATOMIC_UMIN:
+   case LSC_OP_ATOMIC_UMAX:
+   case LSC_OP_ATOMIC_CMPXCHG:
+   case LSC_OP_ATOMIC_FADD:
+   case LSC_OP_ATOMIC_FSUB:
+   case LSC_OP_ATOMIC_FMIN:
+   case LSC_OP_ATOMIC_FMAX:
+   case LSC_OP_ATOMIC_FCMPXCHG:
+   case LSC_OP_ATOMIC_AND:
+   case LSC_OP_ATOMIC_OR:
+   case LSC_OP_ATOMIC_XOR:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
 static inline uint32_t
 lsc_data_size_bytes(enum lsc_data_size data_size)
 {
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a979e59..6244546 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6167,18 +6167,34 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
 {
    /* This workaround is about making sure that any instruction writing
     * through UGM has completed before we hit EOT.
-    *
-    * The workaround talks about UGM writes or atomic message but what is
-    * important is anything that hasn't completed. Usually any SEND
-    * instruction that has a destination register will be read by something
-    * else so we don't need to care about those as they will be synchronized
-    * by other parts of the shader or optimized away. What is left are
-    * instructions that don't have a destination register.
     */
    if (inst->sfid != GFX12_SFID_UGM)
       return false;
 
-   return inst->dst.file == BAD_FILE;
+   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
+    * where the L1-cache override is NOT among {WB, WS, WT}
+    */
+   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
+   if (lsc_opcode_is_store(opcode)) {
+      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
+      case LSC_CACHE_STORE_L1STATE_L3MOCS:
+      case LSC_CACHE_STORE_L1WB_L3WB:
+      case LSC_CACHE_STORE_L1S_L3UC:
+      case LSC_CACHE_STORE_L1S_L3WB:
+      case LSC_CACHE_STORE_L1WT_L3UC:
+      case LSC_CACHE_STORE_L1WT_L3WB:
+         return false;
+
+      default:
+         return true;
+      }
+   }
+
+   /* Any UGM Atomic message WITHOUT return value */
+   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
+      return true;
+
+   return false;
 }
 
 /* Wa_22013689345
-- 
2.7.4