From ac303c5d5b39a7b1fd907bfd58bd649055a7a02f Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Tue, 16 Aug 2022 08:08:43 +0000
Subject: [PATCH] intel/fs: improve Wa_22013689345 workaround

The initial implementation is a pretty big hammer. Implement the HW
recommendation to minimize cases in which we need a fence.

This improves performance by 10 FPS on some of the Sascha Willems RT
demos.

Signed-off-by: Lionel Landwerlin
Fixes: 6031ad4bf690 ("intel/fs: Add Wa_22013689345")
Reviewed-by: Francisco Jerez
Part-of:

(cherry picked from commit 945637514e6e970fcc37745f509eec11ff3b5129)
---
 .pick_status.json             |  2 +-
 src/intel/compiler/brw_eu.h   | 37 +++++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.cpp | 32 ++++++++++++++++++++++++--------
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 9267b55..2b7ad4b 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -139,7 +139,7 @@
         "description": "intel/fs: improve Wa_22013689345 workaround",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "6031ad4bf690fe250d90063dec7e0269da5b3016"
     },
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index f078504..8b7bcae 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1203,6 +1203,43 @@ lsc_opcode_has_transpose(enum lsc_opcode opcode)
    return opcode == LSC_OP_LOAD || opcode == LSC_OP_STORE;
 }
 
+static inline bool
+lsc_opcode_is_store(enum lsc_opcode opcode)
+{
+   return opcode == LSC_OP_STORE ||
+          opcode == LSC_OP_STORE_CMASK;
+}
+
+static inline bool
+lsc_opcode_is_atomic(enum lsc_opcode opcode)
+{
+   switch (opcode) {
+   case LSC_OP_ATOMIC_INC:
+   case LSC_OP_ATOMIC_DEC:
+   case LSC_OP_ATOMIC_LOAD:
+   case LSC_OP_ATOMIC_STORE:
+   case LSC_OP_ATOMIC_ADD:
+   case LSC_OP_ATOMIC_SUB:
+   case LSC_OP_ATOMIC_MIN:
+   case LSC_OP_ATOMIC_MAX:
+   case LSC_OP_ATOMIC_UMIN:
+   case LSC_OP_ATOMIC_UMAX:
+   case LSC_OP_ATOMIC_CMPXCHG:
+   case LSC_OP_ATOMIC_FADD:
+   case LSC_OP_ATOMIC_FSUB:
+   case LSC_OP_ATOMIC_FMIN:
+   case LSC_OP_ATOMIC_FMAX:
+   case LSC_OP_ATOMIC_FCMPXCHG:
+   case LSC_OP_ATOMIC_AND:
+   case LSC_OP_ATOMIC_OR:
+   case LSC_OP_ATOMIC_XOR:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
 static inline uint32_t
 lsc_data_size_bytes(enum lsc_data_size data_size)
 {
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a979e59..6244546 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6167,18 +6167,34 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
 {
    /* This workaround is about making sure that any instruction writing
     * through UGM has completed before we hit EOT.
-    *
-    * The workaround talks about UGM writes or atomic message but what is
-    * important is anything that hasn't completed. Usually any SEND
-    * instruction that has a destination register will be read by something
-    * else so we don't need to care about those as they will be synchronized
-    * by other parts of the shader or optimized away. What is left are
-    * instructions that don't have a destination register.
     */
    if (inst->sfid != GFX12_SFID_UGM)
       return false;
 
-   return inst->dst.file == BAD_FILE;
+   /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
+    * where the L1-cache override is NOT among {WB, WS, WT}
+    */
+   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
+   if (lsc_opcode_is_store(opcode)) {
+      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
+      case LSC_CACHE_STORE_L1STATE_L3MOCS:
+      case LSC_CACHE_STORE_L1WB_L3WB:
+      case LSC_CACHE_STORE_L1S_L3UC:
+      case LSC_CACHE_STORE_L1S_L3WB:
+      case LSC_CACHE_STORE_L1WT_L3UC:
+      case LSC_CACHE_STORE_L1WT_L3WB:
+         return false;
+
+      default:
+         return true;
+      }
+   }
+
+   /* Any UGM Atomic message WITHOUT return value */
+   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
+      return true;
+
+   return false;
 }
 
 /* Wa_22013689345
-- 
2.7.4