#include <asm/qspinlock_types.h>
#include <asm/paravirt.h>
+#ifdef CONFIG_PPC64
+/*
+ * Use the EH=1 hint for accesses that result in the lock being acquired.
+ * The hardware is supposed to optimise this pattern by holding the lock
+ * cacheline longer, and releasing when a store to the same memory (the
+ * unlock) is performed.
+ */
+#define _Q_SPIN_EH_HINT 1
+#else
+#define _Q_SPIN_EH_HINT 0
+#endif
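+
+/*
+ * For illustration, a rough sketch of how the hint is consumed: it is
+ * passed to the trylock asm as an immediate ("i" constraint) and becomes
+ * the EH field of the larx, approximately:
+ *
+ *	"1:	lwarx	%0,0,%1,%3	\n"	# load-reserve lock word, EH hint
+ *	"	cmpwi	0,%0,0		\n"	# proceed only if unlocked
+ *	"	bne-	2f		\n"
+ *	"	stwcx.	%2,0,%1		\n"	# try to install the locked value
+ *	"	bne-	1b		\n"
+ *	...
+ *	: "=&r" (prev)
+ *	: "r" (&lock->val), "r" (new), "i" (_Q_SPIN_EH_HINT)
+ *	: "cr0", "memory");
+ */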
+
/*
* The trylock itself may steal. This makes trylocks slightly stronger, and
- * might make spin locks slightly more efficient when stealing.
+ * makes locks slightly more efficient when stealing.
*
* This is compile-time, so if true then there may always be stealers, so the
* nosteal paths become unused.
*/
#define _Q_SPIN_TRY_LOCK_STEAL 1
+/*
+ * Put a speculation barrier after testing the lock/node and finding it
+ * busy, to try to prevent pointless speculation in slow paths.
+ *
+ * Slows down the lockstorm microbenchmark with no stealing, where locking
+ * is purely FIFO through the queue. It may have more benefit in real workloads,
+ * where speculating into the wrong place could have a greater cost.
+ */
+#define _Q_SPIN_SPEC_BARRIER 0
+
+#ifdef CONFIG_PPC64
+/*
+ * Execute a miso instruction after passing the MCS lock ownership to the
+ * queue head. Miso is intended to make stores visible to other CPUs sooner.
+ *
+ * This seems to make the lockstorm microbenchmark nospin test go slightly
+ * faster on POWER10, but it is disabled for now.
+ */
+#define _Q_SPIN_MISO 0
+#else
+#define _Q_SPIN_MISO 0
+#endif
+
+#ifdef CONFIG_PPC64
+/*
+ * This executes miso after the unlock store to the lock word, so that
+ * ownership passes to the next CPU sooner. This will slow the uncontended
+ * path to some degree. No evidence it helps yet.
+ */
+#define _Q_SPIN_MISO_UNLOCK 0
+#else
+#define _Q_SPIN_MISO_UNLOCK 0
+#endif
+
+/*
+ * This seems to slow down the lockstorm microbenchmark; the suspicion is
+ * that the queue node just has to become shared again right afterwards,
+ * when its waiter spins on the locked field.
+ */
+#define _Q_SPIN_PREFETCH_NEXT 0
+
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
return READ_ONCE(lock->val);
"2: \n"
: "=&r" (prev)
: "r" (&lock->val), "r" (new),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(prev == 0);
"2: \n"
: "=&r" (prev), "=&r" (tmp)
: "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(!(prev & ~_Q_TAIL_CPU_MASK));
static inline void queued_spin_unlock(struct qspinlock *lock)
{
smp_store_release(&lock->locked, 0);
+ if (_Q_SPIN_MISO_UNLOCK)
+ asm volatile("miso" ::: "memory");
}
#define arch_spin_is_locked(l) queued_spin_is_locked(l)
static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
+#if _Q_SPIN_SPEC_BARRIER == 1
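+/*
+ * "ori 31,31,0" is the no-op encoding powerpc uses as a speculation
+ * barrier hint.
+ */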
+#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0)
+#else
+#define spec_barrier() do { } while (0)
+#endif
+
static __always_inline bool recently_sleepy(void)
{
/* pv_sleepy_lock is true when this is called */
: "r" (&lock->val), "r"(tail), "r" (newval),
"i" (_Q_LOCKED_VAL),
"r" (_Q_TAIL_CPU_MASK),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return prev;
val = READ_ONCE(lock->val);
if (val & _Q_MUST_Q_VAL)
break;
+ spec_barrier();
if (unlikely(!(val & _Q_LOCKED_VAL))) {
spin_end();
qnodesp = this_cpu_ptr(&qnodes);
if (unlikely(qnodesp->count >= MAX_NODES)) {
+ spec_barrier();
while (!queued_spin_trylock(lock))
cpu_relax();
return;
/* Wait for mcs node lock to be released */
spin_begin();
while (!node->locked) {
+ spec_barrier();
+
if (yield_to_prev(lock, node, old, paravirt))
seen_preempted = true;
}
+ spec_barrier();
spin_end();
/* Clear out stale propagated yield_cpu */
node->yield_cpu = -1;
smp_rmb(); /* acquire barrier for the mcs lock */
+
+ /*
+ * Generic qspinlocks have this prefetch here, but it seems
+ * like it could cause additional line transitions because
+ * the waiter will keep loading from it.
+ */
+ if (_Q_SPIN_PREFETCH_NEXT) {
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
}
/* We're at the head of the waitqueue, wait for the lock. */
val = READ_ONCE(lock->val);
if (!(val & _Q_LOCKED_VAL))
break;
+ spec_barrier();
if (paravirt && pv_sleepy_lock && maybe_stealers) {
if (!sleepy) {
val |= _Q_MUST_Q_VAL;
}
}
+ spec_barrier();
spin_end();
/* If we're the last queued, must clean up the tail. */
cpu_relax();
spin_end();
}
+ spec_barrier();
/*
* Unlock the next mcs waiter node. Release barrier is not required
if (paravirt && pv_prod_head) {
int next_cpu = next->cpu;
WRITE_ONCE(next->locked, 1);
+ if (_Q_SPIN_MISO)
+ asm volatile("miso" ::: "memory");
if (vcpu_is_preempted(next_cpu))
prod_cpu(next_cpu);
} else {
WRITE_ONCE(next->locked, 1);
+ if (_Q_SPIN_MISO)
+ asm volatile("miso" ::: "memory");
}
release:
* is passed as the paravirt argument to the functions.
*/
if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
- if (try_to_steal_lock(lock, true))
+ if (try_to_steal_lock(lock, true)) {
+ spec_barrier();
return;
+ }
queued_spin_lock_mcs_queue(lock, true);
} else {
- if (try_to_steal_lock(lock, false))
+ if (try_to_steal_lock(lock, false)) {
+ spec_barrier();
return;
+ }
queued_spin_lock_mcs_queue(lock, false);
}
}