powerpc/qspinlock: allow propagation of yield CPU down the queue
author Nicholas Piggin <npiggin@gmail.com>
Sat, 26 Nov 2022 09:59:25 +0000 (19:59 +1000)
committer Michael Ellerman <mpe@ellerman.id.au>
Fri, 2 Dec 2022 06:48:50 +0000 (17:48 +1100)
Having all CPUs poll the lock word for the owner CPU that should be
yielded to defeats most of the purpose of using MCS queueing for
scalability. Yet it may be desirable for queued waiters to yield to a
preempted owner.

With this change, queue waiters never sample the owner CPU directly from
the lock word. The queue head (which is spinning on the lock) propagates
the owner CPU back to the next waiter if it finds the owner has been
preempted. That waiter then propagates the owner CPU back to the next
waiter, and so on.

s390 addresses this problem differently, by having queued waiters sample
the lock word to find the owner at a low frequency. That has the
advantage of being simpler; the advantage of propagation is that the
lock word never has to be accessed by queued waiters, and the transfer of
cache lines to transmit the owner data is only required when lock holder
vCPU preemption occurs.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20221126095932.1234527-11-npiggin@gmail.com
arch/powerpc/lib/qspinlock.c

index c1f3b69..c45f30c 100644 (file)
@@ -12,6 +12,7 @@
 struct qnode {
        struct qnode    *next;
        struct qspinlock *lock;
+       int             yield_cpu; /* CPU this waiter should yield to, written by the node ahead in the queue; -1 if none */
        u8              locked; /* 1 if lock acquired */
 };
 
@@ -28,6 +29,7 @@ static int head_spins __read_mostly = (1 << 8);
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
+static bool pv_yield_propagate_owner __read_mostly = true;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -232,14 +234,67 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u
        __yield_to_locked_owner(lock, val, paravirt, mustq);
 }
 
+static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
+{ /* Pass the lock owner's CPU (decoded from val) to the next queued node's yield_cpu; *set_yield_cpu remembers the last value passed. */
+       struct qnode *next;
+       int owner;
+
+       if (!paravirt) /* nothing to propagate when the paravirt flag is false */
+               return;
+       if (!pv_yield_propagate_owner) /* propagation disabled by the tunable */
+               return;
+
+       owner = get_owner_cpu(val);
+       if (*set_yield_cpu == owner) /* this owner was already passed to the next node */
+               return;
+
+       next = READ_ONCE(node->next);
+       if (!next) /* no successor linked yet; *set_yield_cpu is left unchanged */
+               return;
+
+       if (vcpu_is_preempted(owner)) { /* owner vCPU is preempted: tell the successor whom to yield to */
+               next->yield_cpu = owner;
+               *set_yield_cpu = owner;
+       } else if (*set_yield_cpu != -1) { /* owner not preempted, but a different CPU was passed earlier: replace it */
+               next->yield_cpu = owner;
+               *set_yield_cpu = owner;
+       }
+}
+
 static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
 {
        int prev_cpu = decode_tail_cpu(val);
        u32 yield_count;
+       int yield_cpu;
 
        if (!paravirt)
                goto relax;
 
+       if (!pv_yield_propagate_owner)
+               goto yield_prev;
+
+       yield_cpu = READ_ONCE(node->yield_cpu);
+       if (yield_cpu == -1) {
+               /* Propagate back the -1 CPU */
+               if (node->next && node->next->yield_cpu != -1)
+                       node->next->yield_cpu = yield_cpu;
+               goto yield_prev;
+       }
+
+       yield_count = yield_count_of(yield_cpu);
+       if ((yield_count & 1) == 0)
+               goto yield_prev; /* owner vcpu is running */
+
+       smp_rmb();
+
+       if (yield_cpu == node->yield_cpu) {
+               if (node->next && node->next->yield_cpu != yield_cpu)
+                       node->next->yield_cpu = yield_cpu;
+               yield_to_preempted(yield_cpu, yield_count);
+               return;
+       }
+
+yield_prev:
        if (!pv_yield_prev)
                goto relax;
 
@@ -293,6 +348,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
        u32 val, old, tail;
        bool mustq = false;
        int idx;
+       int set_yield_cpu = -1;
        int iters = 0;
 
        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
@@ -314,6 +370,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
        node = &qnodesp->nodes[idx];
        node->next = NULL;
        node->lock = lock;
+       node->yield_cpu = -1;
        node->locked = 0;
 
        tail = encode_tail_cpu(smp_processor_id());
@@ -334,6 +391,10 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
                while (!node->locked)
                        yield_to_prev(lock, node, old, paravirt);
 
+               /* Clear out stale propagated yield_cpu */
+               if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
+                       node->yield_cpu = -1;
+
                smp_rmb(); /* acquire barrier for the mcs lock */
        }
 
@@ -344,6 +405,7 @@ again:
                if (!(val & _Q_LOCKED_VAL))
                        break;
 
+               propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
                yield_head_to_locked_owner(lock, val, paravirt);
                if (!maybe_stealers)
                        continue;
@@ -512,6 +574,22 @@ static int pv_yield_prev_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
 
+static int pv_yield_propagate_owner_set(void *data, u64 val)
+{ /* Setter half of fops_pv_yield_propagate_owner; data is unused. Always returns 0. */
+       pv_yield_propagate_owner = !!val; /* any non-zero value enables propagation */
+
+       return 0;
+}
+
+static int pv_yield_propagate_owner_get(void *data, u64 *val)
+{ /* Getter half of fops_pv_yield_propagate_owner; data is unused. Always returns 0. */
+       *val = pv_yield_propagate_owner; /* bool stored into *val as 0 or 1 */
+
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
+
 static __init int spinlock_debugfs_init(void)
 {
        debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
@@ -520,6 +598,7 @@ static __init int spinlock_debugfs_init(void)
                debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
                debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
                debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
+               debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
        }
 
        return 0;