--- v4
+++ v3
@@ -1,68 +1,51 @@
-An over-committed guest with more vCPUs than pCPUs has a heavy overload in
-osq_lock().
+This is to fix some lock holder preemption issues. Some other locks
+implementation do a spin loop before acquiring the lock itself.
+Currently kernel has an interface of bool vcpu_is_preempted(int cpu). It
+takes the cpu as parameter and return true if the cpu is preempted. Then
+kernel can break the spin loops upon the retval of vcpu_is_preempted().
-This is because vCPU A hold the osq lock and yield out, vCPU B wait per_cpu
-node->locked to be set. IOW, vCPU B wait vCPU A to run and unlock the osq
-lock.
+As kernel has used this interface, So lets support it.
-Kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU is
-currently running or not. So break the spin loops on true condition.
-
-test case:
-perf record -a perf bench sched messaging -g 400 -p && perf report
-
-before patch:
-18.09% sched-messaging [kernel.vmlinux] [k] osq_lock
-12.28% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner
- 5.27% sched-messaging [kernel.vmlinux] [k] mutex_unlock
- 3.89% sched-messaging [kernel.vmlinux] [k] wait_consider_task
- 3.64% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq
- 3.41% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner.is
- 2.49% sched-messaging [kernel.vmlinux] [k] system_call
-
-after patch:
-20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner
- 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock
- 4.12% sched-messaging [kernel.vmlinux] [k] system_call
- 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common
- 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7
- 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner
- 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock
+Only pSeries need support it. And the fact is powerNV are built into same
+kernel image with pSeries. So we need return false if we are runnig as
+powerNV. The another fact is that lppaca->yield_count keeps zero on
+powerNV. So we can just skip the machine type check.
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
+Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
- kernel/locking/osq_lock.c | 10 +++++++++-
- 1 file changed, 9 insertions(+), 1 deletion(-)
+ arch/powerpc/include/asm/spinlock.h | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
-diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
-index 05a3785..39d1385 100644
---- a/kernel/locking/osq_lock.c
-+++ b/kernel/locking/osq_lock.c
-@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
- return cpu_nr + 1;
- }
+diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
+index 523673d..3ac9fcb 100644
+--- a/arch/powerpc/include/asm/spinlock.h
++++ b/arch/powerpc/include/asm/spinlock.h
+@@ -52,6 +52,24 @@
+ #define SYNC_IO
+ #endif
-+static inline int node_cpu(struct optimistic_spin_node *node)
++/*
++ * This support kernel to check if one cpu is preempted or not.
++ * Then we can fix some lock holder preemption issue.
++ */
++#ifdef CONFIG_PPC_PSERIES
++#define vcpu_is_preempted vcpu_is_preempted
++static inline bool vcpu_is_preempted(int cpu)
+{
-+ return node->cpu - 1;
++ /*
++ * pSeries and powerNV can be built into same kernel image. In
++ * principle we need return false directly if we are running as
++ * powerNV. However the yield_count is always zero on powerNV, So
++ * skip such machine type check
++ */
++ return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
++#endif
+
- static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
- int cpu_nr = encoded_cpu_val - 1;
-@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
-+ * Use vcpu_is_preempted to detech lock holder preemption issue
-+ * and break. vcpu_is_preempted is a macro defined by false if
-+ * arch does not support vcpu preempted check,
- */
-- if (need_resched())
-+ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
- goto unqueue;
-
- cpu_relax_lowlatency();
+ return lock.slock == 0;
--
2.4.11