From 7fe1d202d52c478094a04ec61a8f0b47da59db66 Mon Sep 17 00:00:00 2001
From: Francisco Jerez
Date: Mon, 9 Jan 2023 15:31:33 -0800
Subject: [PATCH] intel/fs/gfx12: Ensure that prior reads have executed before
 barrier with acquire semantics.

This avoids a violation of the Vulkan memory model that was leading
to intermittent failures of at least 8k test-cases of the Vulkan CTS
(within the group dEQP-VK.memory_model.*) on TGL and DG2 platforms.

In theory the issue may be reproducible on earlier platforms like IVB
and ICL, but the SYNC.ALLWR instruction is not available on those
platforms so a different (likely costlier) fix will be needed.

The issue occurs within the sequence we emit for a NIR memory barrier
with acquire semantics requiring the synchronization of multiple
caches, e.g. in pseudocode for a barrier involving the TGM and UGM
caches on DG2:

   x <- load.ugm // Atomic read sequenced-before the barrier
   y <- fence.ugm
   z <- fence.tgm
   wait(y, z)
   w <- load.tgm // Read sequenced-after the barrier

In the example we must provide the guarantee that the memory load for
x is completed before the one for w, however this ordering can be
reversed with the intervention of a concurrent thread, since the UGM
fence will block on the prior UGM load and potentially take a long
time, while the TGM fence may complete and invalidate the TGM cache
immediately, so a concurrent thread could pollute the TGM cache with
stale contents for the w location *before* the UGM load has
completed, leading to an inversion of the expected memory ordering.

v2: Apply the workaround regardless of whether the NIR barrier
    intrinsic specifies multiple storage classes or a single one,
    since an acquire barrier is required to order subsequent requests
    relative to previous atomic requests of unknown storage class not
    necessarily specified by the memory scope information of the
    intrinsic.

Cc: mesa-stable
Reviewed-by: Ivan Briano
Reviewed-by: Lionel Landwerlin
Part-of:
(cherry picked from commit 4a2e7306dd007a9564f9194c52d181ef24271c4e)
---
 .pick_status.json                 |  2 +-
 src/intel/compiler/brw_fs_nir.cpp | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/.pick_status.json b/.pick_status.json
index fa89f1b..ee16722 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -3109,7 +3109,7 @@
         "description": "intel/fs/gfx12: Ensure that prior reads have executed before barrier with acquire semantics.",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null
     },
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8fb9020..ca5d2bc 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4487,6 +4487,50 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       const fs_builder ubld = bld.group(8, 0);
 
+      /* A memory barrier with acquire semantics requires us to
+       * guarantee that memory operations of the specified storage
+       * class sequenced-after the barrier aren't reordered before the
+       * barrier, nor before any previous atomic operation
+       * sequenced-before the barrier which may be synchronizing this
+       * acquire barrier with a prior release sequence.
+       *
+       * In order to guarantee the latter we must make sure that any
+       * such previous operation has completed execution before
+       * invalidating the relevant caches, since otherwise some cache
+       * could be polluted by a concurrent thread after its
+       * invalidation but before the previous atomic completes, which
+       * could lead to a violation of the expected memory ordering if
+       * a subsequent memory read hits the polluted cacheline, which
+       * would return a stale value read from memory before the
+       * completion of the atomic sequenced-before the barrier.
+       *
+       * This ordering inversion can be avoided trivially if the
+       * operations we need to order are all handled by a single
+       * in-order cache, since the flush implied by the memory fence
+       * occurs after any pending operations have completed, however
+       * that doesn't help us when dealing with multiple caches
+       * processing requests out of order, in which case we need to
+       * explicitly stall the EU until any pending memory operations
+       * have executed.
+       *
+       * Note that that might be somewhat heavy handed in some cases.
+       * In particular when this memory fence was inserted by
+       * spirv_to_nir() lowering an atomic with acquire semantics into
+       * an atomic+barrier sequence we could do a better job by
+       * synchronizing with respect to that one atomic *only*, but
+       * that would require additional information not currently
+       * available to the backend.
+       *
+       * XXX - Use an alternative workaround on IVB and ICL, since
+       *       SYNC.ALLWR is only available on Gfx12+.
+       */
+      if (devinfo->ver >= 12 &&
+          (!nir_intrinsic_has_memory_scope(instr) ||
+           (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
+         ubld.exec_all().group(1, 0).emit(
+            BRW_OPCODE_SYNC, ubld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
+      }
+
       if (devinfo->has_lsc) {
          assert(devinfo->verx10 >= 125);
          uint32_t desc =
-- 
2.7.4
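Editor's note: for readers less familiar with acquire/release synchronization, the ordering requirement the commit message describes maps onto a well-known CPU-side pattern. The sketch below is an analogy only, written with std::atomic rather than the Intel EU fences and dataport caches touched by the patch; the names flag and payload are illustrative and do not appear in Mesa. The relaxed load of flag plays the role of the atomic read sequenced-before the barrier, the acquire fence plays the role of the NIR barrier, and the read of payload plays the role of the load sequenced-after it.

   // Analogy only: the release store publishes payload, and because the
   // acquire fence is sequenced after a relaxed load that observed that
   // store, the subsequent read of payload cannot see a stale value.
   // This is the guarantee the SYNC.ALLWR stall restores on Gfx12 when
   // multiple out-of-order caches are involved.
   #include <atomic>
   #include <cassert>
   #include <thread>

   std::atomic<int> flag{0};  // stands in for the atomic accessed before the barrier
   int payload = 0;           // stands in for the location read after the barrier

   void producer()
   {
      payload = 42;
      flag.store(1, std::memory_order_release);   // release sequence
   }

   void consumer()
   {
      while (flag.load(std::memory_order_relaxed) == 0)
         ;                                        // atomic read sequenced-before the barrier
      std::atomic_thread_fence(std::memory_order_acquire);  // the acquire barrier
      assert(payload == 42);                      // read sequenced-after the barrier
   }

   int main()
   {
      std::thread t0(producer), t1(consumer);
      t0.join();
      t1.join();
      return 0;
   }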