Re: [PATCH 04/17] powerpc/qspinlock: convert atomic operations to assembly
From: Jordan Niethe <hidden>
Date: 2022-08-10 03:55:20
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
quoted hunk ↗ jump to hunk
This uses more optimal ll/sc style access patterns (rather than cmpxchg), and also sets the EH=1 lock hint on those operations which acquire ownership of the lock. --- arch/powerpc/include/asm/qspinlock.h | 25 +++++-- arch/powerpc/include/asm/qspinlock_types.h | 6 +- arch/powerpc/lib/qspinlock.c | 81 +++++++++++++++------- 3 files changed, 79 insertions(+), 33 deletions(-)diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 79a1936fb68d..3ab354159e5e 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h@@ -2,28 +2,43 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <linux/atomic.h> #include <linux/compiler.h> #include <asm/qspinlock_types.h> static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - return atomic_read(&lock->val); + return READ_ONCE(lock->val); } static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { - return !atomic_read(&lock.val); + return !lock.val; } static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } static __always_inline int queued_spin_trylock(struct qspinlock *lock) { - if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0) + u32 new = _Q_LOCKED_VAL; + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" +" cmpwi 0,%0,0 \n" +" bne- 2f \n" +" stwcx. %2,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (new), + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
btw IS_ENABLED() already returns 1 or 0
+ : "cr0", "memory");
This is the ISA's "test and set" atomic primitive. Do you think it would be worth seperating it as a helper?
+ + if (likely(prev == 0)) return 1; return 0;
same optional style nit: return likely(prev == 0);
quoted hunk ↗ jump to hunk
}diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index 3425dab42576..210adf05b235 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h@@ -7,7 +7,7 @@ typedef struct qspinlock { union { - atomic_t val; + u32 val; #ifdef __LITTLE_ENDIAN struct {@@ -23,10 +23,10 @@ typedef struct qspinlock { }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } /* - * Bitfields in the atomic value: + * Bitfields in the lock word: * * 0: locked bit * 16-31: tail cpu (+1)diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 5ebb88d95636..7c71e5e287df 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c@@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/export.h>@@ -22,32 +21,59 @@ struct qnodes { static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); -static inline int encode_tail_cpu(void) +static inline u32 encode_tail_cpu(void) { return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET; } -static inline int get_tail_cpu(int val) +static inline int get_tail_cpu(u32 val) { return (val >> _Q_TAIL_CPU_OFFSET) - 1; } /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
I think you missed deleting the above line.
+/* Take the lock by setting the lock bit, no other CPUs will touch it. */
static __always_inline void lock_set_locked(struct qspinlock *lock)
{
- atomic_or(_Q_LOCKED_VAL, &lock->val);
- __atomic_acquire_fence();
+ u32 new = _Q_LOCKED_VAL;
+ u32 prev;
+
+ asm volatile(
+"1: lwarx %0,0,%1,%3 # lock_set_locked \n"
+" or %0,%0,%2 \n"
+" stwcx. %0,0,%1 \n"
+" bne- 1b \n"
+"\t" PPC_ACQUIRE_BARRIER " \n"
+ : "=&r" (prev)
+ : "r" (&lock->val), "r" (new),
+ "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
+ : "cr0", "memory");
}This is pretty similar with the DEFINE_TESTOP() pattern from arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()) except for word instead of double word. Do you think it's possible / beneficial to make use of those macros?
-/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
-static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val)
+/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
+static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
{
- int newval = _Q_LOCKED_VAL;
-
- if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val)
+ u32 new = _Q_LOCKED_VAL;
+ u32 prev;
+
+ BUG_ON(old & _Q_LOCKED_VAL);The BUG_ON() could have been introduced in an earlier patch I think.
+ + asm volatile( +"1: lwarx %0,0,%1,%4 # trylock_clear_tail_cpu \n" +" cmpw 0,%0,%2 \n" +" bne- 2f \n" +" stwcx. %3,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r"(old), "r" (new),
Could this be like "r"(_Q_TAIL_CPU_MASK) below? i.e. "r" (_Q_LOCKED_VAL)? Makes it clear new doesn't change.
quoted hunk ↗ jump to hunk
+ "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) + : "cr0", "memory"); + + if (likely(prev == old)) return 1; - else - return 0; + return 0; } /*@@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va * This provides a release barrier for publishing node, and an acquire barrier
Does the comment mean there needs to be an acquire barrier in this assembly?
quoted hunk ↗ jump to hunk
* for getting the old node. */ -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) { - for (;;) { - int val = atomic_read(&lock->val); - int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; - int old; - - old = atomic_cmpxchg(&lock->val, val, newval); - if (old == val) - return old; - } + u32 prev, tmp; + + asm volatile( +"\t" PPC_RELEASE_BARRIER " \n" +"1: lwarx %0,0,%2 # publish_tail_cpu \n" +" andc %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" + : "=&r" (prev), "=&r"(tmp) + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) + : "cr0", "memory"); + + return prev; } -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = get_tail_cpu(val); struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);@@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) { struct qnodes *qnodesp; struct qnode *next, *node; - int val, old, tail; + u32 val, old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));@@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) } /* We're at the head of the waitqueue, wait for the lock. */ - while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL) + while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) cpu_relax(); /* If we're the last queued, must clean up the tail. */