virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
const = 0;
- /// Inserts any necessary instructions at position \p Pos relative
- /// to instruction \p MI to ensure any caches associated with
- /// address spaces \p AddrSpace for memory scopes up to memory scope
- /// \p Scope are invalidated. Returns true iff any instructions
- /// inserted.
- virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const = 0;
+ /// Inserts any necessary instructions at position \p Pos relative to
+ /// instruction \p MI to ensure any subsequent memory instructions of this
+ /// thread with address spaces \p AddrSpace will observe the previous memory
+ /// operations by any thread for memory scopes up to memory scope \p Scope.
+ /// Returns true iff any instructions inserted.
+ virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative
- /// to instruction \p MI to ensure memory instructions of kind \p Op
- /// associated with address spaces \p AddrSpace have completed as
- /// observed by other memory instructions executing in memory scope
- /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
- /// ordering is between address spaces. Returns true iff any
- /// instructions inserted.
+ /// to instruction \p MI to ensure memory instructions before \p Pos of kind
+ /// \p Op associated with address spaces \p AddrSpace have completed. Used
+ /// between memory instructions to enforce the order they become visible as
+ /// observed by other memory instructions executing in memory scope \p Scope.
+ /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
+ /// address spaces. Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
+ /// Inserts any necessary instructions at position \p Pos relative to
+ /// instruction \p MI to ensure previous memory instructions by this thread
+ /// with address spaces \p AddrSpace have completed and can be observed by
+ /// subsequent memory instructions by any thread executing in memory scope
+ /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
+ /// between address spaces. Returns true iff any instructions inserted.
+ virtual bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const = 0;
+
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
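The renamed hooks are consumed by the expand* routines further down in this patch. As a minimal caller-side sketch of the intended pairing for a read-modify-write atomic (Scope, AddrSpace and IsCrossAS stand in for the values pulled from the instruction's SIMemOpInfo):

    Changed |= CC->insertRelease(MI, Scope, AddrSpace, IsCrossAS,
                                 Position::BEFORE);  // prior writes become visible
    // ... the atomic machine instruction itself stays in place ...
    Changed |= CC->insertWait(MI, Scope, AddrSpace,
                              SIMemOp::LOAD | SIMemOp::STORE, IsCrossAS,
                              Position::AFTER);      // the atomic has completed
    Changed |= CC->insertAcquire(MI, Scope, AddrSpace,
                                 Position::AFTER);   // drop stale cache lines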
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
};
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
- /// TODO: Do not set glc for rmw atomic operations as they
- /// implicitly bypass the L1 cache.
-
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
return Changed;
}
-bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
if (!InsertCacheInv)
return false;
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
- // If no cross address space ordering then an LDS waitcnt is not
- // needed as LDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/GDS memory as LDS operations
- // could be reordered with respect to later global/GDS memory
- // operations of the same wave.
+ // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+ // not needed as LDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/GDS memory as LDS operations could be
+ // reordered with respect to later global/GDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- // If no cross address space ordering then an GDS waitcnt is not
- // needed as GDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/LDS memory as GDS operations
- // could be reordered with respect to later global/LDS memory
- // operations of the same wave.
+ // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
+ // is not needed as GDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/LDS memory as GDS operations could be
+ // reordered with respect to later global/LDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
return Changed;
}
-bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+}
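On GFX6/GFX7 the vector L1 cache is write-through, so a release needs no explicit write-back instruction and delegating to insertWait is sufficient. For a system-scope release of the global address space the wait it builds amounts to the following (a sketch assuming the TII/IV members and the MBB/MI/DL locals this file already uses; lgkmcnt(0) is requested as well when IsCrossAddrSpaceOrdering also orders LDS/GDS):

    // Emit "s_waitcnt vmcnt(0)" in front of the releasing operation.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT))
        .addImm(AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0,
                                      AMDGPU::getExpcntBitMask(IV),
                                      AMDGPU::getLgkmcntBitMask(IV)));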
+
+bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
if (!InsertCacheInv)
return false;
const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
- const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
+ const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
+ ? AMDGPU::BUFFER_WBINVL1
+ : AMDGPU::BUFFER_WBINVL1_VOL;
if (Pos == Position::AFTER)
++MI;
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(Flush));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
/// sequentially consistent, and no other thread can access scratch
/// memory.
- /// Other address spaces do not hava a cache.
+ /// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode and all waves of a work-group are on the same CU, and so the
- // L0 does not need to be bypassed.
+ // CU mode all waves of a work-group are on the same CU, and so the L0
+ // does not need to be bypassed.
if (!CuMode) Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
return Changed;
}
-bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
if (!InsertCacheInv)
return false;
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
- // If no cross address space ordering then an LDS waitcnt is not
- // needed as LDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/GDS memory as LDS operations
- // could be reordered with respect to later global/GDS memory
- // operations of the same wave.
+ // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+ // not needed as LDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/GDS memory as LDS operations could be
+ // reordered with respect to later global/GDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- // If no cross address space ordering then an GDS waitcnt is not
- // needed as GDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/LDS memory as GDS operations
- // could be reordered with respect to later global/LDS memory
- // operations of the same wave.
+ // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
+ // is not needed as GDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/LDS memory as GDS operations could be
+ // reordered with respect to later global/LDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
return Changed;
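For illustration, the two AFTER insertions above give an agent-scope acquire load from global memory on GFX7 roughly this shape (a sketch; the invalidate opcode is the one selected in insertAcquire above, and the exact counters depend on the address spaces being ordered):

    // buffer_load_dword ... glc    <- load bypasses L1 (enableLoadCacheBypass)
    // s_waitcnt vmcnt(0)           <- insertWait: the load has completed
    // buffer_wbinvl1_vol           <- insertAcquire: invalidate stale L1 lines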
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
return Changed;
}
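The store path is the mirror image: insertRelease places the wait in front of the releasing store, which on GFX6/GFX7 reduces to the following (sketch; lgkmcnt(0) is added when IsCrossAddrSpaceOrdering also orders LDS/GDS):

    // s_waitcnt vmcnt(0)           <- insertRelease, via insertWait
    // buffer_store_dword ...       <- the releasing store itself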
/// ordering and memory scope, then the library does not need to
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
- /// adding waitcnt before a S_BARRIER.
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ /// adding an S_WAITCNT before an S_BARRIER.
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ // TODO: If both release and invalidate are happening, they could be combined
+ // to use a single "BUFFER_WBL2" instruction. This could be done by
+ // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass to
+ // track cache invalidate and write back instructions.
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::BEFORE);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::BEFORE);
return Changed;
}
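An acquire-release fence combines both halves at the position of the fence; for agent scope on GFX7 the inserted sequence is roughly (sketch):

    // s_waitcnt vmcnt(0)           <- insertRelease: prior writes become visible
    // buffer_wbinvl1_vol           <- insertAcquire: discard stale L1 lines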
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
return Changed;
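And an acquire-release read-modify-write is bracketed on both sides; for agent scope on GFX7 this comes out roughly as (sketch):

    // s_waitcnt vmcnt(0)           <- insertRelease before the atomic
    // buffer_atomic_<op> ...       <- the atomic operation itself
    // s_waitcnt vmcnt(0)           <- insertWait after the atomic completes
    // buffer_wbinvl1_vol           <- insertAcquire: invalidate stale L1 lines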