#include <asm/qspinlock_types.h>
#include <asm/paravirt.h>
+#ifdef CONFIG_PPC64
+/*
+ * Use the EH=1 hint for accesses that result in the lock being acquired.
+ * The hardware is supposed to optimise this pattern by holding the lock
+ * cacheline longer, and releasing when a store to the same memory (the
+ * unlock) is performed.
+ */
+#define _Q_SPIN_EH_HINT 1
+#else
+#define _Q_SPIN_EH_HINT 0
+#endif
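+
+/*
+ * For illustration, a rough sketch of how the hint is consumed: it is
+ * passed to the trylock asm as an immediate ("i" constraint) and becomes
+ * the EH field of the larx, approximately:
+ *
+ *	"1:	lwarx	%0,0,%1,%3	\n"	# load-reserve lock word, EH hint
+ *	"	cmpwi	0,%0,0		\n"	# proceed only if unlocked
+ *	"	bne-	2f		\n"
+ *	"	stwcx.	%2,0,%1		\n"	# try to install the locked value
+ *	"	bne-	1b		\n"
+ *	...
+ *	: "=&r" (prev)
+ *	: "r" (&lock->val), "r" (new), "i" (_Q_SPIN_EH_HINT)
+ *	: "cr0", "memory");
+ */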
+
/*
* The trylock itself may steal. This makes trylocks slightly stronger, and
- * might make spin locks slightly more efficient when stealing.
+ * makes locks slightly more efficient when stealing.
*
* This is compile-time, so if true then there may always be stealers, so the
* nosteal paths become unused.
*/
#define _Q_SPIN_TRY_LOCK_STEAL 1
+/*
+ * Put a speculation barrier after testing the lock/node and finding it
+ * busy, to try to prevent pointless speculation in slow paths.
+ *
+ * Slows down the lockstorm microbenchmark with no stealing, where locking
+ * is purely FIFO through the queue. It may have more benefit in real workloads,
+ * where speculating into the wrong place could have a greater cost.
+ */
+#define _Q_SPIN_SPEC_BARRIER 0
+
+#ifdef CONFIG_PPC64
+/*
+ * Execute a miso instruction after passing the MCS lock ownership to the
+ * queue head. Miso is intended to make stores visible to other CPUs sooner.
+ *
+ * This seems to make the lockstorm microbenchmark nospin test go slightly
+ * faster on POWER10, but it is disabled for now.
+ */
+#define _Q_SPIN_MISO 0
+#else
+#define _Q_SPIN_MISO 0
+#endif
+
+#ifdef CONFIG_PPC64
+/*
+ * This executes miso after the unlock store to the lock word, so that
+ * ownership passes to the next CPU sooner. This will slow the uncontended
+ * path to some degree. No evidence it helps yet.
+ */
+#define _Q_SPIN_MISO_UNLOCK 0
+#else
+#define _Q_SPIN_MISO_UNLOCK 0
+#endif
+
+/*
+ * This seems to slow down the lockstorm microbenchmark; the suspicion is
+ * that the queue node just has to become shared again right afterwards,
+ * when its waiter spins on the locked field.
+ */
+#define _Q_SPIN_PREFETCH_NEXT 0
+
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
return READ_ONCE(lock->val);
"2: \n"
: "=&r" (prev)
: "r" (&lock->val), "r" (new),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(prev == 0);
"2: \n"
: "=&r" (prev), "=&r" (tmp)
: "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(!(prev & ~_Q_TAIL_CPU_MASK));
static inline void queued_spin_unlock(struct qspinlock *lock)
{
smp_store_release(&lock->locked, 0);
+ if (_Q_SPIN_MISO_UNLOCK)
+ asm volatile("miso" ::: "memory");
}
#define arch_spin_is_locked(l) queued_spin_is_locked(l)
static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
+#if _Q_SPIN_SPEC_BARRIER == 1
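+/*
+ * "ori 31,31,0" is the no-op encoding powerpc uses as a speculation
+ * barrier hint.
+ */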
+#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0)
+#else
+#define spec_barrier() do { } while (0)
+#endif
+
static __always_inline bool recently_sleepy(void)
{
/* pv_sleepy_lock is true when this is called */
: "r" (&lock->val), "r"(tail), "r" (newval),
"i" (_Q_LOCKED_VAL),
"r" (_Q_TAIL_CPU_MASK),
- "i" (IS_ENABLED(CONFIG_PPC64))
+ "i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return prev;
val = READ_ONCE(lock->val);
if (val & _Q_MUST_Q_VAL)
break;
+ spec_barrier();
if (unlikely(!(val & _Q_LOCKED_VAL))) {
spin_end();
qnodesp = this_cpu_ptr(&qnodes);
if (unlikely(qnodesp->count >= MAX_NODES)) {
+ spec_barrier();
while (!queued_spin_trylock(lock))
cpu_relax();
return;
/* Wait for mcs node lock to be released */
spin_begin();
while (!node->locked) {
+ spec_barrier();
+
if (yield_to_prev(lock, node, old, paravirt))
seen_preempted = true;
}
+ spec_barrier();
spin_end();
/* Clear out stale propagated yield_cpu */
node->yield_cpu = -1;
smp_rmb(); /* acquire barrier for the mcs lock */
+
+ /*
+ * Generic qspinlocks have this prefetch here, but it seems
+ * like it could cause additional line transitions because
+ * the waiter will keep loading from it.
+ */
+ if (_Q_SPIN_PREFETCH_NEXT) {
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
}
/* We're at the head of the waitqueue, wait for the lock. */
val = READ_ONCE(lock->val);
if (!(val & _Q_LOCKED_VAL))
break;
+ spec_barrier();
if (paravirt && pv_sleepy_lock && maybe_stealers) {
if (!sleepy) {
val |= _Q_MUST_Q_VAL;
}
}
+ spec_barrier();
spin_end();
/* If we're the last queued, must clean up the tail. */
cpu_relax();
spin_end();
}
+ spec_barrier();
/*
* Unlock the next mcs waiter node. Release barrier is not required
if (paravirt && pv_prod_head) {
int next_cpu = next->cpu;
WRITE_ONCE(next->locked, 1);
+ if (_Q_SPIN_MISO)
+ asm volatile("miso" ::: "memory");
if (vcpu_is_preempted(next_cpu))
prod_cpu(next_cpu);
} else {
WRITE_ONCE(next->locked, 1);
+ if (_Q_SPIN_MISO)
+ asm volatile("miso" ::: "memory");
}
release:
* is passed as the paravirt argument to the functions.
*/
if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
- if (try_to_steal_lock(lock, true))
+ if (try_to_steal_lock(lock, true)) {
+ spec_barrier();
return;
+ }
queued_spin_lock_mcs_queue(lock, true);
} else {
- if (try_to_steal_lock(lock, false))
+ if (try_to_steal_lock(lock, false)) {
+ spec_barrier();
return;
+ }
queued_spin_lock_mcs_queue(lock, false);
}
}