--- v2
+++ v5
@@ -1,51 +1,70 @@
-This is to fix some lock holder preemption issues. Some other locks
-implementation do a spin loop before acquiring the lock itself. Currently
-kernel has an interface of bool vcpu_is_preempted(int cpu). It take the cpu
-as parameter and return true if the cpu is preempted. Then kernel can break
-the spin loops upon on the retval of vcpu_is_preempted.
+An over-committed guest with more vCPUs than pCPUs has a heavy overload in
+osq_lock().
-As kernel has used this interface, So lets support it.
+This is because vCPU A hold the osq lock and yield out, vCPU B wait per_cpu
+node->locked to be set. IOW, vCPU B wait vCPU A to run and unlock the osq
+lock.
-Only pSeries need supoort it. And the fact is powerNV are built into same
-kernel image with pSeries. So we need return false if we are runnig as
-powerNV. The another fact is that lppaca->yiled_count keeps zero on
-powerNV. So we can just skip the machine type.
+Kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU is
+currently running or not. So break the spin loops on true condition.
+
+test case:
+perf record -a perf bench sched messaging -g 400 -p && perf report
+
+before patch:
+18.09% sched-messaging [kernel.vmlinux] [k] osq_lock
+12.28% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner
+ 5.27% sched-messaging [kernel.vmlinux] [k] mutex_unlock
+ 3.89% sched-messaging [kernel.vmlinux] [k] wait_consider_task
+ 3.64% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq
+ 3.41% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner.is
+ 2.49% sched-messaging [kernel.vmlinux] [k] system_call
+
+after patch:
+20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner
+ 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock
+ 4.12% sched-messaging [kernel.vmlinux] [k] system_call
+ 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common
+ 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7
+ 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner
+ 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
-Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
+Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Tested-by: Juergen Gross <jgross@suse.com>
---
- arch/powerpc/include/asm/spinlock.h | 18 ++++++++++++++++++
- 1 file changed, 18 insertions(+)
+ kernel/locking/osq_lock.c | 10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
-diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
-index 523673d..3ac9fcb 100644
---- a/arch/powerpc/include/asm/spinlock.h
-+++ b/arch/powerpc/include/asm/spinlock.h
-@@ -52,6 +52,24 @@
- #define SYNC_IO
- #endif
+diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
+index 05a3785..39d1385 100644
+--- a/kernel/locking/osq_lock.c
++++ b/kernel/locking/osq_lock.c
+@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
+ return cpu_nr + 1;
+ }
-+/*
-+ * This support kernel to check if one cpu is preempted or not.
-+ * Then we can fix some lock holder preemption issue.
-+ */
-+#ifdef CONFIG_PPC_PSERIES
-+#define vcpu_is_preempted vcpu_is_preempted
-+static inline bool vcpu_is_preempted(int cpu)
++static inline int node_cpu(struct optimistic_spin_node *node)
+{
-+ /*
-+ * pSeries and powerNV can be built into same kernel image. In
-+ * principle we need return false directly if we are running as
-+ * powerNV. However the yield_count is always zero on powerNV, So
-+ * skip such machine type check
-+ */
-+ return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
++ return node->cpu - 1;
+}
-+#endif
+
- static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
- return lock.slock == 0;
+ int cpu_nr = encoded_cpu_val - 1;
+@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
+ while (!READ_ONCE(node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
++ * Use vcpu_is_preempted to detech lock holder preemption issue
++ * and break. vcpu_is_preempted is a macro defined by false if
++ * arch does not support vcpu preempted check,
+ */
+- if (need_resched())
++ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
+ goto unqueue;
+
+ cpu_relax_lowlatency();
--
2.4.11