From 47bac63d3f6b9e64fdf997aff1f145bc948f02d9 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 8 Mar 2022 11:14:33 -0800 Subject: [PATCH] [AMDGPU] gfx940 memory model Differential Revision: https://reviews.llvm.org/D121242 --- llvm/docs/AMDGPUUsage.rst | 2417 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 354 +++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 31 +- llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll | 428 ++++ .../CodeGen/AMDGPU/memory-legalizer-flat-agent.ll | 2488 ++++++++++++++++++++ .../AMDGPU/memory-legalizer-flat-nontemporal.ll | 114 + .../AMDGPU/memory-legalizer-flat-singlethread.ll | 2034 ++++++++++++++++ .../CodeGen/AMDGPU/memory-legalizer-flat-system.ll | 2488 ++++++++++++++++++++ .../AMDGPU/memory-legalizer-flat-wavefront.ll | 2010 ++++++++++++++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 2142 +++++++++++++++++ .../AMDGPU/memory-legalizer-global-agent.ll | 2390 +++++++++++++++++++ .../AMDGPU/memory-legalizer-global-nontemporal.ll | 90 + .../AMDGPU/memory-legalizer-global-singlethread.ll | 1986 ++++++++++++++++ .../AMDGPU/memory-legalizer-global-system.ll | 2278 ++++++++++++++++++ .../AMDGPU/memory-legalizer-global-wavefront.ll | 1986 ++++++++++++++++ .../AMDGPU/memory-legalizer-global-workgroup.ll | 2173 +++++++++++++++++ .../CodeGen/AMDGPU/memory-legalizer-local-agent.ll | 2064 ++++++++++++++++ .../AMDGPU/memory-legalizer-local-nontemporal.ll | 98 + .../AMDGPU/memory-legalizer-local-singlethread.ll | 1954 +++++++++++++++ .../AMDGPU/memory-legalizer-local-system.ll | 2064 ++++++++++++++++ .../AMDGPU/memory-legalizer-local-wavefront.ll | 1954 +++++++++++++++ .../AMDGPU/memory-legalizer-local-workgroup.ll | 2064 ++++++++++++++++ .../AMDGPU/memory-legalizer-private-nontemporal.ll | 94 + 23 files changed, 35683 insertions(+), 18 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index b3ad9c9..539cda8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst 
@@ -5018,6 +5018,7 @@ following sections: * :ref:`amdgpu-amdhsa-memory-model-gfx6-gfx9` * :ref:`amdgpu-amdhsa-memory-model-gfx90a` +* :ref:`amdgpu-amdhsa-memory-model-gfx940` * :ref:`amdgpu-amdhsa-memory-model-gfx10` .. _amdgpu-amdhsa-memory-model-gfx6-gfx9: @@ -8669,6 +8670,2422 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. - system for OpenCL.* ============ ============ ============== ========== ================================ +.. _amdgpu-amdhsa-memory-model-gfx940: + +Memory Model GFX940 ++++++++++++++++++++ + +For GFX940: + +* Each agent has multiple shader arrays (SA). +* Each SA has multiple compute units (CU). +* Each CU has multiple SIMDs that execute wavefronts. +* The wavefronts for a single work-group are executed in the same CU but may be + executed by different SIMDs. The exception is when in tgsplit execution mode + when the wavefronts may be executed by different SIMDs in different CUs. +* Each CU has a single LDS memory shared by the wavefronts of the work-groups + executing on it. The exception is when in tgsplit execution mode when no LDS + is allocated as wavefronts of the same work-group can be in different CUs. +* All LDS operations of a CU are performed as wavefront wide operations in a + global order and involve no caching. Completion is reported to a wavefront in + execution order. +* The LDS memory has multiple request queues shared by the SIMDs of a + CU. Therefore, the LDS operations performed by different wavefronts of a + work-group can be reordered relative to each other, which can result in + reordering the visibility of vector memory operations with respect to LDS + operations of other wavefronts in the same work-group. A ``s_waitcnt + lgkmcnt(0)`` is required to ensure synchronization between LDS operations and + vector memory operations between wavefronts of a work-group, but not between + operations performed by the same wavefront. 
+* The vector memory operations are performed as wavefront wide operations and + completion is reported to a wavefront in execution order. The exception is + that ``flat_load/store/atomic`` instructions can report out of vector memory + order if they access LDS memory, and out of LDS operation order if they access + global memory. +* The vector memory operations access a single vector L1 cache shared by all + SIMDs of a CU. Therefore: + + * No special action is required for coherence between the lanes of a single + wavefront. + + * No special action is required for coherence between wavefronts in the same + work-group since they execute on the same CU. The exception is when in + tgsplit execution mode as wavefronts of the same work-group can be in + different CUs and so a ``buffer_inv sc0`` is required which will invalidate + the L1 cache when in tgsplit mode. + + * A ``buffer_inv sc1`` is required to invalidate the L1 cache for coherence + between wavefronts executing in different work-groups as they may be + executing on different CUs. + +* The scalar memory operations access a scalar L1 cache shared by all wavefronts + on a group of CUs. The scalar and vector L1 caches are not coherent. However, + scalar operations are used in a restricted way so do not impact the memory + model. See :ref:`amdgpu-amdhsa-memory-spaces`. +* The vector and scalar memory operations use an L2 cache. + + * The gfx940 can be configured as a number of smaller agents with each having + a single L2 shared by all CUs on the same agent, or as fewer (possibly one) + larger agents with groups of CUs on each agent each sharing separate L2 + caches. + * The L2 cache has independent channels to service disjoint ranges of virtual + addresses. + * Each CU has a separate request queue per channel for its associated L2. + Therefore, the vector and scalar memory operations performed by wavefronts + executing with different L1 caches and the same L2 cache can be reordered + relative to each other. 
+ * A ``s_waitcnt vmcnt(0)`` is required to ensure synchronization between + vector memory operations of different CUs. It ensures a previous vector + memory operation has completed before executing a subsequent vector memory + or LDS operation and so can be used to meet the requirements of acquire and + release. + * An L2 cache can be kept coherent with other L2 caches by using the MTYPE RW + (read-write) for memory local to the L2, and MTYPE NC (non-coherent) with + the PTE C-bit set for memory not local to the L2. + + * Any local memory cache lines will be automatically invalidated by writes + from CUs associated with other L2 caches, or writes from the CPU, due to + the cache probe caused by the PTE C-bit. + * XGMI accesses from the CPU to local memory may be cached on the CPU. + Subsequent access from the GPU will automatically invalidate or writeback + the CPU cache due to the L2 probe filter. + * To ensure coherence of local memory writes of CUs with different L1 caches + in the same agent a ``buffer_wbl2`` is required. It does nothing if the + agent is configured to have a single L2, or will writeback dirty L2 cache + lines if configured to have multiple L2 caches. + * To ensure coherence of local memory writes of CUs in different agents a + ``buffer_wbl2 sc1`` is required. It will writeback dirty L2 cache lines. + * To ensure coherence of local memory reads of CUs with different L1 caches + in the same agent a ``buffer_inv sc1`` is required. It does nothing if the + agent is configured to have a single L2, or will invalidate non-local L2 + cache lines if configured to have multiple L2 caches. + * To ensure coherence of local memory reads of CUs in different agents a + ``buffer_inv sc0 sc1`` is required. It will invalidate non-local L2 cache + lines if configured to have multiple L2 caches. + + * PCIe access from the GPU to the CPU can be kept coherent by using the MTYPE + UC (uncached) which bypasses the L2. 
+ +Scalar memory operations are only used to access memory that is proven to not +change during the execution of the kernel dispatch. This includes constant +address space and global address space for program scope ``const`` variables. +Therefore, the kernel machine code does not have to maintain the scalar cache to +ensure it is coherent with the vector caches. The scalar and vector caches are +invalidated between kernel dispatches by CP since constant address space data +may change between kernel dispatch executions. See +:ref:`amdgpu-amdhsa-memory-spaces`. + +The one exception is if scalar writes are used to spill SGPR registers. In this +case the AMDGPU backend ensures the memory location used to spill is never +accessed by vector memory operations at the same time. If scalar writes are used +then a ``s_dcache_wb`` is inserted before the ``s_endpgm`` and before a function +return since the locations may be used for vector memory instructions by a +future wavefront that uses the same scratch area, or a function call that +creates a frame at the same address, respectively. There is no need for a +``s_dcache_inv`` as all scalar writes are write-before-read in the same thread. + +For kernarg backing memory: + +* CP invalidates the L1 cache at the start of each kernel dispatch. +* On dGPU over XGMI or PCIe the kernarg backing memory is allocated in host + memory accessed as MTYPE UC (uncached) to avoid needing to invalidate the L2 + cache. This also causes it to be treated as non-volatile and so is not + invalidated by ``*_vol``. +* On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and + so the L2 cache will be coherent with the CPU and other agents. + +Scratch backing memory (which is used for the private address space) is accessed +with MTYPE NC_NV (non-coherent non-volatile). 
Since the private address space is +only accessed by a single thread, and is always write-before-read, there is +never a need to invalidate these entries from the L1 cache. Hence all cache +invalidates are done as ``*_vol`` to only invalidate the volatile cache lines. + +The code sequences used to implement the memory model for GFX940 are defined +in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-table`. + + .. table:: AMDHSA Memory Model Code Sequences GFX940 + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx940-table + + ============ ============ ============== ========== ================================ + LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code + Ordering Sync Scope Address GFX940 + Space + ============ ============ ============== ========== ================================ + **Non-Atomic** + ------------------------------------------------------------------------------------ + load *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_load + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_load + nt=1 + + - volatile + + 1. buffer/global/flat_load + sc0=1 sc1=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + load *none* *none* - local 1. ds_load + store *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_store + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_store + nt=1 + + - volatile + + 1. buffer/global/flat_store + sc0=1 sc1=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + store *none* *none* - local 1. 
ds_store + **Unordered Atomic** + ------------------------------------------------------------------------------------ + load atomic unordered *any* *any* *Same as non-atomic*. + store atomic unordered *any* *any* *Same as non-atomic*. + atomicrmw unordered *any* *any* *Same as monotonic atomic*. + **Monotonic Atomic** + ------------------------------------------------------------------------------------ + load atomic monotonic - singlethread - global 1. buffer/global/flat_load + - wavefront - generic + load atomic monotonic - workgroup - global 1. buffer/global/flat_load + - generic sc0=1 + load atomic monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. ds_load + load atomic monotonic - agent - global 1. buffer/global/flat_load + - generic sc1=1 + load atomic monotonic - system - global 1. buffer/global/flat_load + - generic sc0=1 sc1=1 + store atomic monotonic - singlethread - global 1. buffer/global/flat_store + - wavefront - generic + store atomic monotonic - singlethread - global 1. buffer/global/flat_store + - wavefront - generic + store atomic monotonic - workgroup - global 1. buffer/global/flat_store + - generic sc0=1 + store atomic monotonic - agent - global 1. buffer/global/flat_store + - generic sc1=1 + store atomic monotonic - system - global 1. buffer/global/flat_store + - generic sc0=1 sc1=1 + store atomic monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. ds_store + atomicrmw monotonic - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + - workgroup + - agent + atomicrmw monotonic - system - global 1. buffer/global/flat_atomic + - generic sc1=1 + atomicrmw monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. 
ds_atomic + **Acquire Atomic** + ------------------------------------------------------------------------------------ + load atomic acquire - singlethread - global 1. buffer/global/ds/flat_load + - wavefront - local + - generic + load atomic acquire - workgroup - global 1. buffer/global_load sc0=1 + 2. s_waitcnt vmcnt(0) + + - If not TgSplit execution + mode, omit. + - Must happen before the + following buffer_inv. + + 3. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale data. + + load atomic acquire - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_load + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + load atomic acquire - workgroup - generic 1. flat_load sc0=1 + 2. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). + - Must happen before + the following + buffer_inv and any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local load + atomic value being + acquired. + + 3. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + load atomic acquire - agent - global 1. buffer/global_load + sc1=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the load + has completed + before invalidating + the cache. + + 3. 
buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale global data. + + load atomic acquire - system - global 1. buffer/global/flat_load + sc0=1 sc1=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the load + has completed + before invalidating + the cache. + + 3. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + load atomic acquire - agent - generic 1. flat_load sc1=1 + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL omit + lgkmcnt(0). + - Must happen before + following + buffer_inv. + - Ensures the flat_load + has completed + before invalidating + the cache. + + 3. buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + load atomic acquire - system - generic 1. flat_load sc0=1 sc1=1 + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL omit + lgkmcnt(0). + - Must happen before + the following + buffer_inv. + - Ensures the flat_load + has completed + before invalidating + the caches. + + 3. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + atomicrmw acquire - singlethread - global 1. 
buffer/global/flat_atomic + - wavefront - generic + atomicrmw acquire - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw acquire - workgroup - global 1. buffer/global_atomic + 2. s_waitcnt vmcnt(0) + + - If not TgSplit execution + mode, omit. + - Must happen before the + following buffer_inv. + - Ensures the atomicrmw + has completed + before invalidating + the cache. + + 3. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_atomic + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local + atomicrmw value + being acquired. + + atomicrmw acquire - workgroup - generic 1. flat_atomic + 2. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). + - Must happen before + the following + buffer_inv and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local + atomicrmw value + being acquired. + + 3. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acquire - agent - global 1. buffer/global_atomic + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 3. 
buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - system - global 1. buffer/global_atomic + sc1=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 3. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + atomicrmw acquire - agent - generic 1. flat_atomic + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 3. buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - system - generic 1. flat_atomic sc1=1 + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 3. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + fence acquire - singlethread *none* *none* + - wavefront + fence acquire - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. 
+ - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/ + atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + buffer_inv and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the + value read by the + fence-paired-atomic. + + 2. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + fence acquire - agent *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. 
+ - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + buffer_inv. + - Ensures that the + fence-paired atomic + has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + 2. buffer_inv sc1=1 + + - Must happen before any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + fence acquire - system *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + buffer_inv. 
+ - Ensures that the + fence-paired atomic + has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + 2. buffer_inv sc0=1 sc1=1 + + - Must happen before any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + **Release Atomic** + ------------------------------------------------------------------------------------ + store atomic release - singlethread - global 1. buffer/global/flat_store + - wavefront - generic + store atomic release - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_store + store atomic release - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + have + completed before + performing the + store that is being + released. + + 2. buffer/global/flat_store sc0=1 + store atomic release - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_store + store atomic release - agent - global 1. buffer_wbl2 sc1=1 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). 
+ - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + to memory have + completed before + performing the + store that is being + released. + + 3. buffer/global/flat_store sc1=1 + store atomic release - system - global 1. buffer_wbl2 sc0=1 sc1=1 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after any + preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after any + preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + to memory and the L2 + writeback have + completed before + performing the + store that is being + released. + + 3. buffer/global/flat_store + sc0=1 sc1=1 + atomicrmw release - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + atomicrmw release - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw release - workgroup - global 1. 
s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. buffer/global/flat_atomic sc0=1 + atomicrmw release - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_atomic + atomicrmw release - agent - global 1. buffer_wbl2 sc1=1 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global and local + have completed + before performing + the atomicrmw that + is being released. + + 3. buffer/global/flat_atomic sc1=1 + atomicrmw release - system - global 1. buffer_wbl2 sc0=1 sc1=1 + - generic + - Must happen before + following s_waitcnt. 
+ - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to memory and the L2 + writeback have + completed before + performing the + atomicrmw that is + being released. + + 3. buffer/global/flat_atomic + sc0=1 sc1=1 + fence release - singlethread *none* *none* + - wavefront + fence release - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/load + atomic/store/store + atomic/atomicrmw. 
+ - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + fence release - agent *none* 1. buffer_wbl2 sc1=1 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + fence release - system *none* 1. buffer_wbl2 sc0=1 sc1=1 + + - Must happen before + following s_waitcnt. 
+ - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + **Acquire-Release Atomic** + ------------------------------------------------------------------------------------ + atomicrmw acq_rel - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + atomicrmw acq_rel - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw acq_rel - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). 
+ - Must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. buffer/global_atomic + 3. s_waitcnt vmcnt(0) + + - If not TgSplit execution + mode, omit. + - Must happen before + the following + buffer_inv. + - Ensures any + following global + data read is no + older than the + atomicrmw value + being acquired. + + 4. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acq_rel - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_atomic + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + atomicrmw acq_rel - workgroup - generic 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. flat_atomic + 3. 
s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If not TgSplit execution + mode, omit vmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + the following + buffer_inv and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local load + atomic value being + acquired. + + 4. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acq_rel - agent - global 1. buffer_wbl2 sc1=1 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global have + completed before + performing the + atomicrmw that is + being released. + + 3. buffer/global_atomic + 4. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 5. buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acq_rel - system - global 1. buffer_wbl2 sc0=1 sc1=1 + + - Must happen before + following s_waitcnt. 
+ - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global and L2 writeback + have completed before + performing the + atomicrmw that is + being released. + + 3. buffer/global_atomic + sc1=1 + 4. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + atomicrmw acq_rel - agent - generic 1. buffer_wbl2 sc1=1 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. 
+ - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global have + completed before + performing the + atomicrmw that is + being released. + + 3. flat_atomic + 4. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_inv. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 5. buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acq_rel - system - generic 1. buffer_wbl2 sc0=1 sc1=1 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global and L2 writeback + have completed before + performing the + atomicrmw that is + being released. + + 3. flat_atomic sc1=1 + 4. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_inv. 
+ - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + fence acq_rel - singlethread *none* *none* + - wavefront + fence acq_rel - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, + since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that all + memory operations + have + completed before + performing any + following global + memory operations. + - Ensures that the + preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before following + global memory + operations. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + local/generic store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). 
+ This satisfies the + requirements of + release. + - Must happen before + the following + buffer_inv. + - Ensures that the + acquire-fence-paired-atomic + has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + acquire-fence-paired-atomic. + + 2. buffer_inv sc0=1 + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + fence acq_rel - agent *none* 1. buffer_wbl2 sc1=1 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at agent scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + buffer_inv. + - Ensures that the + preceding + global/local/generic + load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before invalidating + the cache. This + satisfies the + requirements of + acquire. 
+ - Ensures that all + previous memory + operations have + completed before a + following + global/local/generic + store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + + 3. buffer_inv sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. This + satisfies the + requirements of + acquire. + + fence acq_rel - system *none* 1. buffer_wbl2 sc0=1 sc1=1 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + buffer_inv. + - Ensures that the + preceding + global/local/generic + load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before invalidating + the cache. This + satisfies the + requirements of + acquire. 
+ - Ensures that all + previous memory + operations have + completed before a + following + global/local/generic + store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + + 2. buffer_inv sc0=1 sc1=1 + + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + **Sequential Consistent Atomic** + ------------------------------------------------------------------------------------ + load atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local load atomic acquire, + - generic except must generated + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - s_waitcnt lgkmcnt(0) must + happen after + preceding + local/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + lgkmcnt(0) and so do + not need to be + considered.) + - s_waitcnt vmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + vmcnt(0) and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global/local + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. 
(Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + s_waitcnt of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The s_waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the s_waitcnt be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generated + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + *Same as corresponding + load atomic acquire, + except must generated + all instructions even + for OpenCL.* + + load atomic seq_cst - agent - global 1. s_waitcnt lgkmcnt(0) & + - system - generic vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) + and s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt lgkmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + lgkmcnt(0) and so do + not need to be + considered.) + - s_waitcnt vmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + vmcnt(0) and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global + memory instructions + have completed + before executing + this sequentially + consistent + instruction. 
This + prevents reordering + a seq_cst store + followed by a + seq_cst load. (Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + s_waitcnt of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The s_waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the s_waitcnt be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + store atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local store atomic release, + - workgroup - generic except must generate + - agent all instructions even + - system for OpenCL.* + atomicrmw seq_cst - singlethread - global *Same as corresponding + - wavefront - local atomicrmw acq_rel, + - workgroup - generic except must generate + - agent all instructions even + - system for OpenCL.* + fence seq_cst - singlethread *none* *Same as corresponding + - wavefront fence acq_rel, + - workgroup except must generate + - agent all instructions even + - system for OpenCL.* + ============ ============ ============== ========== ================================ + .. _amdgpu-amdhsa-memory-model-gfx10: Memory Model GFX10 diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 8c8609c..728021a 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -459,6 +459,56 @@ public: Position Pos) const override; }; +class SIGfx940CacheControl : public SIGfx90ACacheControl { +protected: + + /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
+ bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC0); + } + + /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC1); + } + + /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableNTBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::NT); + } + +public: + + SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; + + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIGfx10CacheControl : public SIGfx7CacheControl { protected: @@ -775,6 +825,8 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); + if (ST.hasGFX940Insts()) + return std::make_unique<SIGfx940CacheControl>(ST); if (ST.hasGFX90AInsts()) return std::make_unique<SIGfx90ACacheControl>(ST); if (Generation <= 
AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -1388,6 +1440,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx940CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. Setting SC + // bits to indicate work-group scope will do this automatically. + Changed |= enableSC0Bit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Leave SC bits unset to indicate wavefront scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. 
+ + return Changed; +} + +bool SIGfx940CacheControl::enableStoreCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { + assert(!MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::WORKGROUP: + // Set SC bits to indicate workgroup scope. + Changed |= enableSC0Bit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Leave SC bits unset to indicate wavefront scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx940CacheControl::enableRMWCacheBypass( + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC1 bit to indicate system scope. + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // RMW atomic operations implicitly bypass the L1 cache and only use SC1 + // to indicate system or agent scope. The SC0 bit is used to indicate if + // they are return or no-return. 
Leave SC1 bit unset to indicate agent + // scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + return Changed; +} + +bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + // Only handle load and store, not atomic read-modify-write instructions. The + // latter use SC0 to indicate if the atomic returns a result and so must not + // be used for cache control. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + // Set SC bits to indicate system scope. + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. 
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + + return Changed; + } + + if (IsNonTemporal) { + Changed |= enableNTBit(MI); + return Changed; + } + + return Changed; +} + +bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + if (!InsertCacheInv) + return false; + + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and + // CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + case SIAtomicScope::AGENT: + // Ensures that following loads will not see stale remote data or local + // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale + // due to the memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceding buffer + // invalidate. 
The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later reads will refetch the cache lines. + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to invalidate the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be invalidated. + if (ST.isTgSplitEnabled()) { + // Ensures L1 is invalidated if in threadgroup split mode. In + // non-threadgroup split mode it is a NOP, but no point generating it in + // that case if know not in that mode. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate work-group scope. + .addImm(AMDGPU::CPol::SC0); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceding buffer + // invalidate. The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later reads will refetch the cache lines. + Changed = true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Could generate "BUFFER_INV" but it would do nothing as there are no + // caches to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. 
+ + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + +bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by the + // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::SYSTEM, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". + Changed = true; + break; + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::AGENT, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)". + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Do not generate "BUFFER_WBL2" as there are no caches it would + // writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (Pos == Position::AFTER) + --MI; + + // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other + // S_WAITCNT needed. + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos); + + return Changed; +} + bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 06dc89f..b914089 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -31,12 +31,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_invl2 -; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret void @@ -49,12 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_invl2 -; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 
4.0 seq_cst ret void @@ -76,12 +74,11 @@ define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_invl2 -; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret float %ret @@ -195,12 +192,11 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3) ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 ; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_invl2 -; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) ret void @@ -210,12 +206,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_invl2 -; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) ret <2 x i16> %ret 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 8e20ea4..26201f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -35,6 +37,14 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire ret void @@ -68,6 +78,14 @@ define amdgpu_kernel void @singlethread_release_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_release_fence: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread") release ret void @@ -101,6 +119,14 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel ret void @@ -134,6 +160,14 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst ret void @@ -167,6 +201,14 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire ret void @@ -200,6 +242,14 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_one_as_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release ret void @@ -233,6 +283,14 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -266,6 +324,14 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -299,6 +365,14 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire ret void @@ -332,6 +406,14 @@ define amdgpu_kernel void @wavefront_release_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront") release ret void @@ -365,6 +447,14 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel ret void @@ -398,6 +488,14 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst ret void @@ -431,6 +529,14 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire ret void @@ -464,6 +570,14 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX90A-TGSPLIT-LABEL: 
wavefront_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_one_as_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release ret void @@ -497,6 +611,14 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -530,6 +652,14 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -573,6 +703,17 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -614,6 +755,16 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -657,6 +808,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -700,6 +862,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -738,6 +911,16 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -774,6 +957,15 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_one_as_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -812,6 +1004,16 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -850,6 +1052,16 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -900,6 +1112,20 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent") acquire ret void @@ -942,6 +1168,18 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -992,6 +1230,20 @@ define amdgpu_kernel void 
@agent_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -1042,6 +1294,20 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -1092,6 +1358,20 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
agent_one_as_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire ret void @@ -1134,6 +1414,18 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_one_as_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_one_as_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -1184,6 +1476,20 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -1234,6 +1540,20 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -1288,6 +1608,20 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_acquire_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence acquire ret void @@ -1332,6 +1666,18 @@ define amdgpu_kernel void @system_release_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence release ret void @@ -1386,6 +1732,20 @@ define amdgpu_kernel void 
@system_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence acq_rel ret void @@ -1440,6 +1800,20 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence seq_cst ret void @@ -1494,6 +1868,20 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_one_as_acquire_fence: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("one-as") acquire ret void @@ -1538,6 +1926,18 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_one_as_release_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_one_as_release_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -1592,6 +1992,20 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_one_as_acq_rel_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -1646,6 +2060,20 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: system_one_as_seq_cst_fence: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 4ef2355..56f1ec9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 @@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 @@ -261,6 +315,34 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic 
i32, i32* %in syncscope("agent") acquire, align 4 @@ -361,6 +443,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 @@ -434,6 +546,26 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 @@ -506,6 +638,26 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 @@ -586,6 +738,30 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") release, align 4 @@ -666,6 +842,30 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 @@ -738,6 +938,26 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic @@ -825,6 +1045,30 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire @@ -905,6 +1149,30 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release @@ -1000,6 +1268,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel @@ -1095,6 +1391,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst @@ -1186,6 +1510,32 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire @@ -1286,6 +1636,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel @@ -1386,6 +1766,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst @@ -1471,6 +1881,26 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1571,6 +2001,30 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1664,6 +2118,30 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1772,6 +2250,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1880,6 +2386,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1980,6 +2514,30 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2080,6 +2638,30 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2188,6 +2770,34 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2296,6 +2906,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2404,6 +3042,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2512,6 +3178,34 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2620,6 +3314,34 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2728,6 +3450,34 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2836,6 +3586,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2944,6 +3722,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3049,6 +3855,30 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3163,6 +3993,32 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3278,6 +4134,34 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3400,6 +4284,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3522,6 +4436,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3636,6 +4580,32 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3750,6 +4720,32 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; 
GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3872,6 +4868,36 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3994,6 +5020,36 @@ define amdgpu_kernel void 
@flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4116,6 +5172,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4238,6 +5324,36 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4360,6 +5476,36 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4482,6 +5628,36 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4604,6 +5780,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4726,6 +5932,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4813,6 +6049,32 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 @@ -4898,6 +6160,32 @@ define 
amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 @@ -4995,6 +6283,35 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 @@ -5100,6 +6417,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 @@ -5173,6 +6521,26 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") 
unordered, align 4 @@ -5245,6 +6613,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 @@ -5325,6 +6713,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 @@ -5405,6 +6817,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 
%in, i32* %out syncscope("agent-one-as") seq_cst, align 4 @@ -5477,6 +6913,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic @@ -5562,6 +7018,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire @@ -5642,6 +7122,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release @@ -5735,6 +7239,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -5828,6 +7360,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -5923,6 +7483,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire @@ -6027,6 +7614,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -6131,6 +7749,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -6216,6 +7865,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6314,6 +7983,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6407,6 +8100,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6513,6 +8230,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6619,6 +8364,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6717,6 +8490,30 @@ define amdgpu_kernel void 
@flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6815,6 +8612,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6921,6 +8742,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7027,6 +8876,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7133,6 +9010,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7239,6 +9144,34 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7345,6 +9278,34 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7451,6 +9412,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: 
; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7557,6 +9546,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7663,6 +9680,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7768,6 +9813,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7887,6 +9956,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8002,6 +10098,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8129,6 +10253,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8256,6 +10411,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8375,6 +10561,33 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8494,6 +10707,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8621,6 +10861,37 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8748,6 +11019,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8875,6 +11177,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9002,6 +11335,37 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9129,6 +11493,37 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], 
v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9256,6 +11651,37 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9383,6 +11809,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9510,6 +11967,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index ab79a4c..89129f7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load i32, i32* %in, align 4, !nontemporal !0 @@ -179,6 +207,36 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -266,6 +324,32 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load i32, i32* %in, align 4 @@ -361,6 +445,36 @@ define amdgpu_kernel 
void @flat_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 49f2733..b2b1484 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -5,6 +5,8 @@ ; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 @@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 @@ -254,6 +308,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 @@ -339,6 +419,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 @@ -412,6 +518,26 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 @@ -484,6 +610,26 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 @@ -556,6 +702,26 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 @@ -628,6 +794,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 @@ -700,6 +886,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic @@ -772,6 +978,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 
%in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -844,6 +1070,26 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release @@ -916,6 +1162,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -988,6 +1254,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -1072,6 +1358,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -1157,6 +1467,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -1242,6 +1576,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 
sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -1327,6 +1685,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1412,6 +1790,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1497,6 +1895,26 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1582,6 +2000,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1667,6 +2105,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1752,6 +2210,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1837,6 +2315,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1922,6 +2420,26 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2007,6 +2525,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2092,6 +2630,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2177,6 +2735,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2262,6 +2840,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2347,6 +2945,26 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2432,6 +3050,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2517,6 +3155,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2622,6 +3280,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2729,6 +3411,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2836,6 +3542,30 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2943,6 +3673,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3050,6 +3804,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3157,6 
+3935,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3264,6 +4066,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3371,6 +4197,30 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: 
; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3478,6 +4328,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3585,6 +4459,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3692,6 +4590,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3799,6 +4721,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3906,6 +4852,30 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4013,6 +4983,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4120,6 +5114,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4207,6 +5225,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 @@ -4292,6 +5336,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -4377,6 +5447,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 @@ -4462,6 +5558,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 
; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -4535,6 +5657,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 @@ -4607,6 +5749,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -4679,6 +5841,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 @@ -4751,6 +5933,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword 
s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -4823,6 +6025,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -4895,6 +6117,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -4967,6 +6209,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release @@ -5039,6 +6301,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -5111,6 +6393,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5195,6 +6497,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -5280,6 +6606,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -5365,6 +6715,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5450,6 +6824,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5535,6 +6929,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5620,6 +7034,26 @@ 
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5705,6 +7139,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5790,6 +7244,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5875,6 +7349,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5960,6 +7454,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6045,6 +7559,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6130,6 +7664,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6215,6 +7769,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6300,6 +7874,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6385,6 +7979,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6470,6 +8084,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6555,6 +8189,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6640,6 +8294,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6745,6 +8419,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6852,6 +8550,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6959,6 +8681,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] 
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7066,6 +8812,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7173,6 +8943,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7280,6 +9074,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7387,6 +9205,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7494,6 +9336,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32* %out, i32 4 @@ -7601,6 +9467,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7708,6 +9598,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7815,6 +9729,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7922,6 +9860,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 
sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8029,6 +9991,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8136,6 +10122,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8243,6 +10253,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index b9a3070..f9318d3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -84,6 +86,32 @@ define amdgpu_kernel void 
@flat_system_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in unordered, align 4 @@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in monotonic, align 4 @@ -263,6 +317,34 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in acquire, align 4 @@ -365,6 +447,36 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) 
{ entry: %val = load atomic i32, i32* %in seq_cst, align 4 @@ -438,6 +550,26 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out unordered, align 4 @@ -510,6 +642,26 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; 
%entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out monotonic, align 4 @@ -592,6 +744,30 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out release, align 4 @@ -674,6 +850,30 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out seq_cst, align 4 @@ -746,6 +946,26 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic @@ -835,6 +1055,30 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acquire @@ -917,6 +1161,30 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in release @@ -1016,6 +1284,34 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel @@ -1115,6 +1411,34 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst @@ -1208,6 +1532,32 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acquire @@ -1312,6 +1662,36 @@ define 
amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel @@ -1416,6 +1796,36 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst @@ -1501,6 +1911,26 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1603,6 +2033,30 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1698,6 +2152,30 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1810,6 +2288,34 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1922,6 +2428,34 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2024,6 +2558,30 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2126,6 +2684,30 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2238,6 +2820,34 @@ define 
amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2350,6 +2960,34 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2462,6 +3100,34 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2574,6 +3240,34 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2686,6 +3380,34 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2798,6 +3520,34 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2910,6 +3660,34 @@ define amdgpu_kernel void 
@flat_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3022,6 +3800,34 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3127,6 +3933,30 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3243,6 +4073,32 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3360,6 +4216,34 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3486,6 +4370,36 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3612,6 +4526,36 @@ define amdgpu_kernel void 
@flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3728,6 +4672,32 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3844,6 +4814,32 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3970,6 +4966,36 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4096,6 +5122,36 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4222,6 +5278,36 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4348,6 +5434,36 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4474,6 +5590,36 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, i32* %out, i32 4 @@ -4600,6 +5746,36 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4726,6 +5902,36 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4852,6 +6058,36 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4939,6 +6175,32 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, 
v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 @@ -5024,6 +6286,32 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 @@ -5123,6 +6411,35 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, 
i32* %in syncscope("one-as") acquire, align 4 @@ -5230,6 +6547,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 @@ -5303,6 +6651,26 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_system_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 @@ -5375,6 +6743,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 @@ -5457,6 +6845,30 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 @@ -5539,6 +6951,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 @@ -5611,6 +7047,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic @@ -5698,6 +7154,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire @@ -5780,6 +7260,30 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release @@ -5877,6 +7381,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel @@ -5974,6 +7506,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst @@ -6071,6 +7631,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire @@ -6179,6 +7766,37 @@ define amdgpu_kernel void 
@flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel @@ -6287,6 +7905,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst @@ -6372,6 +8021,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6472,6 +8141,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6567,6 +8260,30 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6677,6 +8394,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6787,6 +8532,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6887,6 +8660,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6987,6 +8784,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7097,6 +8918,34 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7207,6 +9056,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7317,6 +9194,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7427,6 +9332,34 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7537,6 +9470,34 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7647,6 +9608,34 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ 
-7757,6 +9746,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7867,6 +9884,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7972,6 +10017,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8093,6 +10162,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8210,6 +10306,34 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8341,6 +10465,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) 
{ entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8472,6 +10627,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8593,6 +10779,33 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8714,6 +10927,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8845,6 +11085,37 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8976,6 +11247,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9107,6 +11409,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9238,6 +11571,37 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9369,6 +11733,37 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9500,6 +11895,37 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9631,6 +12057,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -9762,6 +12219,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 
4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 1569af9..062f981 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_load: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 @@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load 
atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 @@ -254,6 +308,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 @@ -339,6 +419,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 @@ -412,6 +518,26 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 @@ -484,6 +610,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 @@ -556,6 +702,26 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 @@ -628,6 +794,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 @@ -700,6 +886,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic @@ -772,6 +978,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -844,6 +1070,26 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release @@ -916,6 +1162,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -988,6 +1254,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in 
syncscope("wavefront") seq_cst @@ -1072,6 +1358,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -1157,6 +1467,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -1242,6 +1576,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst @@ -1327,6 +1685,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1412,6 +1790,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1497,6 +1895,26 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1582,6 +2000,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1667,6 +2105,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1752,6 +2210,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1837,6 +2315,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1922,6 +2420,26 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2007,6 +2525,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2092,6 +2630,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2177,6 +2735,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2262,6 +2840,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2347,6 +2945,26 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2432,6 +3050,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2517,6 +3155,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2622,6 +3280,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2729,6 +3411,30 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2836,6 +3542,30 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2943,6 +3673,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3050,6 +3804,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3157,6 +3935,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3264,6 +4066,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3371,6 +4197,30 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_wavefront_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3478,6 +4328,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3585,6 +4459,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3692,6 +4590,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3799,6 +4721,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3906,6 +4852,30 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4013,6 +4983,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4120,6 +5114,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4207,6 +5225,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 @@ -4292,6 +5336,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -4377,6 +5447,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 @@ -4462,6 +5558,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -4535,6 +5657,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_wavefront_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 @@ -4607,6 +5749,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -4679,6 +5841,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 @@ -4751,6 +5933,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -4823,6 +6025,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -4895,6 +6117,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -4967,6 +6209,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in 
syncscope("wavefront-one-as") release @@ -5039,6 +6301,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -5111,6 +6393,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5195,6 +6497,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, 
i32 %in syncscope("wavefront-one-as") acquire @@ -5280,6 +6606,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -5365,6 +6715,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5450,6 +6824,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5535,6 +6929,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5620,6 +7034,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5705,6 +7139,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; 
%entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5790,6 +7244,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5875,6 +7349,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5960,6 +7454,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6045,6 +7559,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6130,6 +7664,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6215,6 +7769,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6300,6 +7874,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6385,6 +7979,26 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6470,6 +8084,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6555,6 +8189,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ 
-6640,6 +8294,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6745,6 +8419,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6852,6 +8550,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6959,6 +8681,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7066,6 +8812,30 
@@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7173,6 +8943,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7280,6 +9074,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: 
+; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7387,6 +9205,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7494,6 +9336,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7601,6 +9467,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7708,6 +9598,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7815,6 +9729,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7922,6 +9860,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8029,6 +9991,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8136,6 +10122,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index ead08ed..cc06b9b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: 
flat_workgroup_unordered_load: @@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 @@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 @@ -260,6 +314,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 @@ -358,6 +440,36 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 @@ -431,6 +543,26 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 @@ -503,6 +635,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 @@ -582,6 +734,28 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 @@ -661,6 +835,28 @@ define 
amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 @@ -733,6 +929,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic @@ -814,6 +1030,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire @@ -893,6 +1132,28 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release @@ -981,6 +1242,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel @@ -1069,6 +1355,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst @@ -1155,6 +1466,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire @@ -1249,6 +1585,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel @@ -1343,6 +1706,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst @@ -1428,6 +1818,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1522,6 +1932,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1614,6 +2047,28 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1715,6 +2170,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1816,6 +2296,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32* %out, i32 4 @@ -1910,6 +2415,29 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2004,6 +2532,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2105,6 +2656,31 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2206,6 +2782,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2307,6 +2908,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2408,6 +3034,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2513,6 +3164,30 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2625,6 +3300,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2739,6 +3439,32 @@ define 
amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2858,6 +3584,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2977,6 +3730,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3089,6 +3869,31 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3201,6 +4006,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32* %out, i32 4 @@ -3320,6 +4150,33 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3439,6 +4296,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3558,6 +4442,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3677,6 +4588,33 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3796,6 +4734,33 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3915,6 +4880,33 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4034,6 +5026,33 @@ 
define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4153,6 +5172,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4240,6 +5286,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 @@ -4325,6 +5397,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -4413,6 +5511,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 @@ -4504,6 +5629,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s3 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -4577,6 +5730,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 @@ -4649,6 +5822,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -4724,6 +5917,27 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 @@ -4799,6 +6013,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
flat_workgroup_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -4871,6 +6106,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -4947,6 +6202,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5022,6 +6299,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release @@ -5101,6 +6399,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -5180,6 +6501,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -5267,6 +6611,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5358,6 +6727,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -5449,6 +6844,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -5534,6 +6955,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5623,6 +7064,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5711,6 +7174,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, 
i32 4 @@ -5803,6 +7287,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5895,6 +7402,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5984,6 +7514,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6073,6 +7625,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6165,6 +7739,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6257,6 +7854,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6349,6 +7969,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6441,6 +8084,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6533,6 +8199,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6625,6 +8314,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6717,6 +8429,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6809,6 +8544,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -6914,6 +8672,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7024,6 +8806,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7134,6 +8941,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7247,6 +9079,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7360,6 +9218,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7470,6 +9354,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7580,6 +9489,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7693,6 +9627,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7806,6 +9766,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -7919,6 +9905,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8032,6 +10044,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8145,6 +10183,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 
sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8258,6 +10322,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8371,6 +10461,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -8484,6 +10600,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX940-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 43120f5..8537aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -90,6 +92,26 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4 @@ -180,6 +202,26 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4 @@ -278,6 +320,28 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* 
%in syncscope("agent") acquire, align 4 @@ -381,6 +445,28 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4 @@ -463,6 +549,26 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword 
v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4 @@ -544,6 +650,26 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4 @@ -634,6 +760,30 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4 @@ -724,6 +874,30 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4 @@ -805,6 +979,26 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic @@ -901,6 +1095,30 @@ 
define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire @@ -991,6 +1209,30 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 
sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release @@ -1096,6 +1338,34 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel @@ -1201,6 +1471,34 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in 
syncscope("agent") seq_cst @@ -1304,6 +1602,32 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire @@ -1417,6 +1741,36 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel @@ -1530,6 +1884,36 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst @@ -1619,6 +2003,26 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1723,6 +2127,30 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1821,6 +2249,30 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( 
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1934,6 +2386,34 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2047,6 +2527,34 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2151,6 +2659,30 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2255,6 +2787,30 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2368,6 +2924,34 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2481,6 +3065,34 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2594,6 +3206,34 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2707,6 +3347,34 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2820,6 +3488,34 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2933,6 +3629,34 @@ define amdgpu_kernel void 
@global_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3046,6 +3770,34 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3159,6 +3911,34 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3264,6 +4044,30 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3379,6 +4183,32 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3495,6 +4325,34 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3619,6 +4477,36 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3743,6 +4631,36 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3858,6 +4776,32 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3973,6 +4917,32 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_agent_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4097,6 +5067,36 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4221,6 +5221,36 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4345,6 +5375,36 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4469,6 +5529,36 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4593,6 +5683,36 @@ define amdgpu_kernel void 
@global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4717,6 +5837,36 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4841,6 +5991,36 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4965,6 +6145,36 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5057,6 +6267,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 @@ -5147,6 +6377,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 @@ -5245,6 +6495,28 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; 
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 @@ -5348,6 +6620,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 @@ -5430,6 +6724,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 @@ -5511,6 +6825,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4 @@ -5601,6 +6935,30 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 @@ -5691,6 +7049,30 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 @@ -5772,6 +7154,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic @@ -5868,6 +7270,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_agent_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire @@ -5958,6 +7384,30 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release @@ -6063,6 +7513,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -6168,6 +7646,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -6271,6 +7777,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire @@ -6384,6 +7916,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -6497,6 +8059,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -6586,6 +8178,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6690,6 +8302,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6788,6 +8424,30 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6901,6 +8561,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7014,6 +8702,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, 
i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7118,6 +8834,30 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7222,6 +8962,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7335,6 +9099,34 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_agent_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7448,6 +9240,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7561,6 +9381,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -7674,6 +9522,34 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7787,6 +9663,34 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7900,6 +9804,34 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8013,6 +9945,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8126,6 +10086,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8231,6 +10219,30 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8346,6 +10358,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8470,6 +10508,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8594,6 +10662,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 
sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8709,6 +10807,32 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, 
v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8824,6 +10948,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8948,6 +11098,36 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9072,6 +11252,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9196,6 +11406,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9320,6 +11560,36 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9444,6 +11714,36 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9568,6 +11868,36 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9692,6 +12022,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9816,6 +12176,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 0261671..38062a8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define 
amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -91,6 +93,28 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 @@ -191,6 +215,28 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -284,6 +330,28 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* 
%in, align 4 @@ -379,6 +447,28 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 949b5a5..d09b067 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -90,6 +92,26 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4 @@ -180,6 +202,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_singlethread_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4 @@ -270,6 +312,26 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, 
s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4 @@ -360,6 +422,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4 @@ -442,6 +524,26 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4 @@ -523,6 +625,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 
addrspace(1)* %out syncscope("singlethread") monotonic, align 4 @@ -604,6 +726,26 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4 @@ -685,6 +827,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4 @@ -766,6 +928,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic @@ -847,6 +1029,26 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire @@ -928,6 +1130,26 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release @@ -1009,6 +1231,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1090,6 +1332,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1185,6 +1447,30 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire @@ -1281,6 +1567,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1377,6 +1687,30 @@ define amdgpu_kernel 
void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1466,6 +1800,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1555,6 +1909,26 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1644,6 +2018,26 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1733,6 +2127,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1822,6 +2236,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1911,6 +2345,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2000,6 +2454,26 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm 
+; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2089,6 +2563,26 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2178,6 +2672,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2267,6 +2781,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2356,6 +2890,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2445,6 +2999,26 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2534,6 +3108,26 @@ define amdgpu_kernel void 
@global_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2623,6 +3217,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2712,6 +3326,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2817,6 +3451,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2924,6 +3582,30 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3031,6 +3713,30 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3138,6 +3844,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3245,6 +3975,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3352,6 +4106,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3459,6 +4237,30 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3566,6 +4368,30 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3673,6 +4499,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3780,6 +4630,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3887,6 +4761,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3994,6 +4892,30 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] 
offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4101,6 +5023,30 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4208,6 +5154,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4315,6 +5285,30 @@ 
define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4407,6 +5401,26 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 @@ -4497,6 +5511,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 
addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -4587,6 +5621,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 @@ -4677,6 +5731,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -4759,6 +5833,26 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out 
syncscope("singlethread-one-as") unordered, align 4 @@ -4840,6 +5934,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -4921,6 +6035,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 @@ -5002,6 +6136,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -5083,6 +6237,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -5164,6 +6338,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -5245,6 +6439,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release @@ -5326,6 +6540,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap 
v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -5407,6 +6641,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5502,6 +6756,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -5598,6 +6876,30 @@ define 
amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -5694,6 +6996,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5783,6 +7109,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5872,6 +7218,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5961,6 +7327,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6050,6 +7436,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6139,6 +7545,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -6228,6 +7654,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6317,6 +7763,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6406,6 +7872,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6495,6 +7981,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6584,6 +8090,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6673,6 +8199,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6762,6 +8308,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6851,6 +8417,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6940,6 +8526,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7029,6 +8635,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7134,6 +8760,30 @@ define amdgpu_kernel void 
@global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7241,6 +8891,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7348,6 +9022,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7455,6 +9153,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7562,6 +9284,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7669,6 +9415,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7776,6 +9546,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7883,6 +9677,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry 
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7990,6 +9808,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8097,6 +9939,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8204,6 +10070,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8311,6 +10201,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8418,6 +10332,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8525,6 +10463,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8632,6 +10594,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 93a58c1..54407c1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 
-amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -90,6 +92,26 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4 @@ -180,6 +202,26 @@ define 
amdgpu_kernel void @global_system_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4 @@ -280,6 +322,28 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_load: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4 @@ -385,6 +449,28 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 @@ -467,6 +553,26 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4 @@ -548,6 +654,26 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4 @@ -640,6 +766,30 @@ define amdgpu_kernel void @global_system_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out release, align 4 @@ -732,6 +882,30 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 @@ -813,6 +987,26 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic @@ -911,6 +1105,30 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire @@ -1003,6 +1221,30 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release @@ -1112,6 +1354,34 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, 
v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel @@ -1221,6 +1491,34 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -1326,6 +1624,32 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 
addrspace(1)* %out, i32 %in acquire @@ -1443,6 +1767,36 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel @@ -1560,6 +1914,36 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -1649,6 +2033,26 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1755,6 +2159,30 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1855,6 +2283,30 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1972,6 +2424,34 @@ define amdgpu_kernel 
void @global_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2089,6 +2569,34 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2195,6 +2703,30 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2301,6 +2833,30 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] 
offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2418,6 +2974,34 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2535,6 +3119,34 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2652,6 +3264,34 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2769,6 +3409,34 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2874,6 +3542,30 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2991,6 +3683,32 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3119,6 +3837,36 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3247,6 +3995,36 @@ define 
amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3364,6 +4142,32 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, 
s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3481,6 +4285,32 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3609,6 +4439,36 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3737,6 +4597,36 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: 
+; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3865,6 +4755,36 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3993,6 +4913,36 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4121,6 +5071,36 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] 
offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4249,6 +5229,36 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4377,6 +5387,36 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4505,6 +5545,36 @@ 
define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4597,6 +5667,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 @@ -4687,6 +5777,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 @@ -4787,6 +5897,28 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 @@ -4892,6 +6024,28 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_system_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 @@ -4974,6 +6128,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 @@ -5055,6 +6229,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 @@ -5147,6 +6341,30 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 @@ -5239,6 +6457,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; 
%entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 @@ -5320,6 +6562,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic @@ -5418,6 +6680,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire @@ -5510,6 +6796,30 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release @@ -5619,6 +6929,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel @@ -5728,6 +7066,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst @@ -5833,6 +7199,32 @@ define 
amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire @@ -5950,6 +7342,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel @@ -6067,6 +7489,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 
sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst @@ -6156,6 +7608,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6262,6 +7734,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6362,6 +7858,30 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6479,6 +7999,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6596,6 +8144,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6702,6 +8278,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6808,6 +8408,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6925,6 +8549,34 @@ define amdgpu_kernel 
void @global_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7042,6 +8694,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7159,6 +8839,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7276,6 +8984,34 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7393,6 +9129,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7510,6 +9274,34 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7627,6 +9419,34 
@@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7744,6 +9564,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7849,6 +9697,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7966,6 +9838,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8084,6 +9982,34 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8212,6 +10138,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8340,6 +10296,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -8457,6 +10443,32 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8574,6 +10586,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: 
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8702,6 +10740,36 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8830,6 +10898,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8958,6 +11056,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9086,6 +11214,36 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9214,6 +11372,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9342,6 +11530,36 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9470,6 +11688,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9598,6 +11846,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 3fe2c7b..32e62b3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -90,6 +92,26 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4 @@ -180,6 +202,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4 @@ -270,6 +312,26 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4 @@ -360,6 +422,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4 @@ -442,6 +524,26 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4 @@ -523,6 +625,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4 @@ -604,6 +726,26 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4 @@ -685,6 +827,26 @@ define amdgpu_kernel 
void @global_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4 @@ -766,6 +928,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic @@ -847,6 +1029,26 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire @@ -928,6 +1130,26 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_wavefront_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release @@ -1009,6 +1231,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1090,6 +1332,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1185,6 +1447,30 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire @@ -1281,6 +1567,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 
0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1377,6 +1687,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in 
syncscope("wavefront") seq_cst @@ -1466,6 +1800,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1555,6 +1909,26 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1644,6 +2018,26 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1733,6 +2127,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1822,6 +2236,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1911,6 +2345,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2000,6 +2454,26 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2089,6 +2563,26 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2178,6 +2672,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2267,6 +2781,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2356,6 +2890,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2445,6 +2999,26 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2534,6 +3108,26 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2623,6 +3217,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] 
offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2712,6 +3326,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; 
%entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2817,6 +3451,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2924,6 +3582,30 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3031,6 +3713,30 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3138,6 +3844,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3245,6 +3975,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3352,6 +4106,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3459,6 +4237,30 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, 
v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3566,6 +4368,30 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3673,6 +4499,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3780,6 +4630,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* 
%out, i32 4 @@ -3887,6 +4761,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3994,6 +4892,30 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4101,6 +5023,30 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4208,6 +5154,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4315,6 +5285,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4407,6 +5401,26 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 @@ -4497,6 +5511,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -4587,6 +5621,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 @@ -4677,6 +5731,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -4759,6 +5833,26 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 @@ -4840,6 +5934,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -4921,6 +6035,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4 @@ -5002,6 +6136,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, 
align 4 @@ -5083,6 +6237,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -5164,6 +6338,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -5245,6 +6439,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release @@ -5326,6 +6540,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -5407,6 +6641,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5502,6 +6756,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -5598,6 +6876,30 @@ 
define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -5694,6 +6996,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5783,6 +7109,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5872,6 +7218,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5961,6 +7327,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6050,6 +7436,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6139,6 +7545,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6228,6 +7654,26 @@ define 
amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6317,6 +7763,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6406,6 +7872,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6495,6 +7981,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6584,6 +8090,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6673,6 +8199,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6762,6 +8308,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6851,6 +8417,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: 
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6940,6 +8526,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7029,6 +8635,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7134,6 +8760,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7241,6 +8891,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7348,6 +9022,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7455,6 +9153,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 
addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7562,6 +9284,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7669,6 +9415,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7776,6 +9546,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7883,6 +9677,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7990,6 +9808,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8097,6 +9939,30 @@ define amdgpu_kernel 
void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8204,6 +10070,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8311,6 +10201,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8418,6 +10332,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8525,6 +10463,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8632,6 +10594,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, 
v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 339cb98..7fa4873 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -90,6 +92,26 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4 @@ -180,6 +202,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4 @@ -272,6 +314,27 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 @@ -368,6 +431,27 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4 @@ -450,6 +534,26 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4 @@ -531,6 +635,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, 
s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4 @@ -620,6 +744,28 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 @@ -709,6 +855,28 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4 @@ -790,6 +958,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic @@ -875,6 +1063,28 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire @@ -964,6 +1174,28 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release @@ -1057,6 +1289,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1150,6 +1406,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1247,6 +1527,31 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
global_workgroup_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire @@ -1353,6 +1658,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1459,6 +1791,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1548,6 +1907,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1641,6 +2020,28 @@ define amdgpu_kernel void 
@global_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1738,6 +2139,28 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1839,6 +2262,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1940,6 +2387,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2033,6 +2504,28 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2126,6 +2619,28 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2227,6 +2742,30 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, i32 addrspace(1)* %out, i32 4 @@ -2328,6 +2867,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2429,6 +2992,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2530,6 +3117,30 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2631,6 +3242,30 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2732,6 +3367,30 @@ define 
amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2833,6 +3492,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2934,6 +3617,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3039,6 +3746,30 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3148,6 +3879,31 @@ 
define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3263,6 +4019,32 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3380,6 +4162,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3497,6 +4306,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3606,6 +4442,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3715,6 +4576,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3832,6 +4718,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3949,6 +4862,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4066,6 +5006,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4183,6 +5150,33 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4300,6 +5294,33 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4417,6 +5438,33 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4534,6 +5582,33 @@ define 
amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4651,6 +5726,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4743,6 +5845,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 @@ -4833,6 +5955,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -4925,6 +6067,27 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 @@ -5018,6 +6181,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
global_workgroup_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -5100,6 +6284,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 @@ -5181,6 +6385,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -5265,6 +6489,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 @@ -5349,6 +6594,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -5430,6 +6696,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -5515,6 +6801,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5599,6 +6907,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release @@ -5687,6 +7016,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -5775,6 +7127,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; 
%entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -5872,6 +7247,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5973,6 +7373,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -6074,6 +7500,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -6163,6 +7615,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6256,6 +7728,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6348,6 +7842,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6444,6 +7959,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6540,6 +8078,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6633,6 +8194,28 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6726,6 +8309,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6822,6 +8427,29 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -6918,6 +8546,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7014,6 +8665,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7110,6 +8784,29 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7206,6 +8903,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7302,6 +9022,29 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7398,6 +9141,29 @@ define 
amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7494,6 +9260,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7599,6 +9388,30 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7708,6 +9521,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7818,6 +9656,31 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -7930,6 +9793,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8042,6 +9931,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8151,6 +10066,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8260,6 +10200,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: 
buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8372,6 +10337,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8484,6 +10475,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8596,6 +10613,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8708,6 +10751,32 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] 
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8820,6 +10889,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -8932,6 +11027,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9044,6 +11165,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -9156,6 +11303,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index a328fd3..c0dcb96 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -88,6 +90,28 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4 @@ -176,6 +200,28 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4 @@ -266,6 +312,29 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4 @@ -364,6 +433,31 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4 @@ -438,6 +532,24 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4 @@ -511,6 +623,24 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4 @@ -592,6 +722,26 @@ define amdgpu_kernel void 
@local_agent_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4 @@ -673,6 +823,26 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4 @@ -746,6 +916,24 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic @@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire @@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release @@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel @@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry 
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst @@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire @@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel @@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst @@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1541,6 
+1906,28 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* 
%out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_agent_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2778,6 +3447,30 @@ 
define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, 
v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3390,6 +4212,31 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: 
; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 @@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 @@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 @@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 @@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 @@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 @@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 @@ -4992,6 +6222,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 
@@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic @@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire @@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release @@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_agent_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire @@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5863,6 +7307,26 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* 
%out, i32 4 @@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_agent_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6754,6 +8418,26 @@ 
define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7027,6 +8755,30 @@ define amdgpu_kernel void 
@local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7706,6 +9602,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index df3e522..75a6572 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -99,6 +101,30 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v1, v0, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 @@ -201,6 +227,30 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 
addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -299,6 +349,30 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -398,6 +472,30 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dword s0, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index 67ce190..8e8080e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -88,6 +90,28 @@ define 
amdgpu_kernel void @local_singlethread_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4 @@ -176,6 +200,28 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_singlethread_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4 @@ -264,6 +310,28 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4 @@ -352,6 +420,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4 @@ -426,6 +516,24 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4 @@ -499,6 +607,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4 @@ -572,6 +698,24 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: local_singlethread_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4 @@ -645,6 +789,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4 @@ -718,6 +880,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic @@ -791,6 +971,24 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -864,6 +1062,24 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release @@ -937,6 +1153,24 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1010,6 +1244,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1097,6 +1349,28 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -1185,6 +1459,28 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in 
syncscope("singlethread") acq_rel @@ -1273,6 +1569,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1354,6 +1672,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1435,6 +1773,26 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1516,6 +1874,26 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1597,6 +1975,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1678,6 +2076,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1759,6 +2177,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1840,6 +2278,26 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1921,6 +2379,26 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2002,6 +2480,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, 
v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2083,6 +2581,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2164,6 +2682,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2245,6 +2783,26 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2326,6 +2884,26 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2407,6 +2985,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2488,6 +3086,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2583,6 +3201,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2680,6 +3322,30 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2777,6 +3443,30 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2874,6 +3564,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2971,6 +3685,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3068,6 +3806,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3165,6 +3927,30 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3262,6 +4048,30 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3359,6 +4169,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3456,6 +4290,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3553,6 +4411,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ 
-3650,6 +4532,30 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3747,6 +4653,30 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3844,6 +4774,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3941,6 +4895,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4031,6 +5009,28 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 @@ -4119,6 +5119,28 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -4207,6 +5229,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 @@ -4295,6 +5339,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -4369,6 +5435,24 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 @@ -4442,6 +5526,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -4515,6 +5617,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 @@ -4588,6 +5708,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -4661,6 +5799,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -4734,6 +5890,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") 
acquire @@ -4807,6 +5981,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release @@ -4880,6 +6072,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -4953,6 +6163,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5040,6 +6268,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -5128,6 +6378,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, 
v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -5216,6 +6488,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -5297,6 +6591,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5378,6 +6692,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5459,6 +6793,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5540,6 +6894,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5621,6 +6995,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5702,6 
+7096,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5783,6 +7197,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5864,6 +7298,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5945,6 +7399,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6026,6 +7500,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6107,6 +7601,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6188,6 +7702,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6269,6 +7803,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6350,6 +7904,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6431,6 +8005,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6526,6 +8120,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6623,6 +8241,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6720,6 +8362,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6817,6 +8483,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6914,6 +8604,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ 
-7011,6 +8725,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7108,6 +8846,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7205,6 +8967,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7302,6 +9088,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7399,6 +9209,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7496,6 +9330,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7593,6 +9451,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, 
v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7690,6 +9572,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7787,6 +9693,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7884,6 +9814,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 4efd46d..8e0770e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -88,6 +90,28 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_system_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4 @@ -176,6 +200,28 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4 @@ -266,6 +312,29 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4 @@ -364,6 +433,31 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4 @@ -438,6 +532,24 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out 
unordered, align 4 @@ -511,6 +623,24 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4 @@ -592,6 +722,26 @@ define amdgpu_kernel void @local_system_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out release, align 4 @@ -673,6 +823,26 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4 @@ -746,6 +916,24 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic @@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire @@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release @@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel @@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst @@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire @@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel @@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst @@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: 
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1541,6 +1906,28 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, 
v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: 
buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2778,6 +3447,30 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3390,6 +4212,31 @@ 
define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 @@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in 
syncscope("one-as") monotonic, align 4 @@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 @@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 @@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 @@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_system_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 @@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 @@ -4992,6 +6222,24 @@ define 
amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 @@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic @@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire @@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release @@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel @@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst @@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire @@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel @@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst @@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5863,6 +7307,26 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6754,6 +8418,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7027,6 +8755,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7706,6 +9602,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 046325f..ba58a05 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -88,6 +90,28 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4 @@ -176,6 +200,28 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4 @@ -264,6 +310,28 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4 @@ -352,6 +420,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4 @@ -426,6 +516,24 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4 @@ -499,6 +607,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4 @@ -572,6 +698,24 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_store: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4 @@ -645,6 +789,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4 @@ -718,6 +880,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic @@ -791,6 +971,24 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -864,6 +1062,24 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release @@ -937,6 +1153,24 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_wavefront_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1010,6 +1244,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1097,6 +1349,28 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -1185,6 +1459,28 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, 
v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1273,6 +1569,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1354,6 +1672,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1435,6 +1773,26 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1516,6 +1874,26 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1597,6 +1975,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1678,6 +2076,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1759,6 +2177,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1840,6 +2278,26 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1921,6 +2379,26 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2002,6 +2480,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2083,6 +2581,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2164,6 +2682,26 @@ define 
amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2245,6 +2783,26 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry 
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2326,6 +2884,26 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2407,6 +2985,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2488,6 +3086,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2583,6 +3201,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2680,6 +3322,30 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2777,6 +3443,30 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2874,6 +3564,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2971,6 +3685,30 @@ define amdgpu_kernel void 
@local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3068,6 +3806,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3165,6 +3927,30 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3262,6 +4048,30 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3359,6 +4169,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3456,6 +4290,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3553,6 +4411,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3650,6 +4532,30 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3747,6 +4653,30 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3844,6 +4774,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3941,6 +4895,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4031,6 +5009,28 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 @@ -4119,6 +5119,28 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -4207,6 +5229,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 @@ -4295,6 +5339,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: 
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -4369,6 +5435,24 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 @@ -4442,6 +5526,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -4515,6 +5617,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 @@ -4588,6 +5708,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -4661,6 +5799,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -4734,6 +5890,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -4807,6 +5981,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release @@ -4880,6 +6072,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 
v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -4953,6 +6163,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5040,6 +6268,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -5128,6 +6378,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* 
%out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -5216,6 +6488,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -5297,6 +6591,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5378,6 +6692,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5459,6 +6793,26 @@ define amdgpu_kernel void 
@local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5540,6 +6894,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5621,6 +6995,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5702,6 +7096,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5783,6 +7197,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5864,6 +7298,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5945,6 +7399,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6026,6 +7500,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, 
i32 4 @@ -6107,6 +7601,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6188,6 +7702,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6269,6 +7803,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6350,6 +7904,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6431,6 +8005,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6526,6 +8120,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6623,6 +8241,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6720,6 +8362,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6817,6 +8483,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6914,6 +8604,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7011,6 +8725,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7108,6 +8846,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7205,6 +8967,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7302,6 +9088,30 @@ define 
amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7399,6 +9209,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7496,6 +9330,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7593,6 +9451,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7690,6 +9572,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7787,6 +9693,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7884,6 +9814,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 580d7a8..0cfa962 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: @@ -88,6 +90,28 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_load: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4 @@ -176,6 +200,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4 @@ -266,6 +312,29 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4 @@ -364,6 +433,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4 @@ -438,6 +532,24 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4 @@ -511,6 +623,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4 @@ -592,6 +722,26 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4 @@ -673,6 +823,26 @@ define amdgpu_kernel 
void @local_workgroup_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4 @@ -746,6 +916,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic @@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire @@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release @@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, 
v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire @@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1541,6 +1906,28 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_workgroup_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
i32 addrspace(3)* %out, i32 4 @@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 
s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2778,6 +3447,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep 
= getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; 
%bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3390,6 +4212,31 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( 
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: buffer_inv sc0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 @@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 @@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: 
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 @@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 @@ -4992,6 +6222,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release @@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 
v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* 
%out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5863,6 +7307,26 @@ define amdgpu_kernel void 
@local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; 
GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, 
i32 4 @@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: 
local_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6754,6 +8418,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7027,6 +8755,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: 
local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7706,6 +9602,30 @@ define 
amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 0fd1e7a..b511b98 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: @@ -125,6 +127,28 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 @@ -253,6 +277,30 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* 
%out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -378,6 +426,28 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -504,6 +574,30 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1: +; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt +; GFX940-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1: +; GFX940-TGSPLIT: ; %bb.0: ; %entry +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt +; GFX940-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() -- 2.7.4