--- v4
+++ v5
@@ -1,147 +1,63 @@
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface, bool vcpu_is_preempted(int cpu). It
-takes the cpu as parameter and return true if the cpu is preempted. Then
-kernel can break the spin loops upon on the retval of vcpu_is_preempted.
+takes the cpu as parameter and returns true if the cpu is preempted.
+Then the kernel can break out of the spin loops based on the return
+value of vcpu_is_preempted.
As the kernel already uses this interface, let's support it.
-We use one field of struct kvm_steal_time to indicate that if one vcpu
-is running or not.
+To deal with kernel and kvm/xen, add vcpu_is_preempted into struct
+pv_lock_ops.
-unix benchmark result:
-host: kernel 4.8.1, i5-4570, 4 cpus
-guest: kernel 4.8.1, 8 vcpus
-
- test-case after-patch before-patch
-Execl Throughput | 18307.9 lps | 11701.6 lps
-File Copy 1024 bufsize 2000 maxblocks | 1352407.3 KBps | 790418.9 KBps
-File Copy 256 bufsize 500 maxblocks | 367555.6 KBps | 222867.7 KBps
-File Copy 4096 bufsize 8000 maxblocks | 3675649.7 KBps | 1780614.4 KBps
-Pipe Throughput | 11872208.7 lps | 11855628.9 lps
-Pipe-based Context Switching | 1495126.5 lps | 1490533.9 lps
-Process Creation | 29881.2 lps | 28572.8 lps
-Shell Scripts (1 concurrent) | 23224.3 lpm | 22607.4 lpm
-Shell Scripts (8 concurrent) | 3531.4 lpm | 3211.9 lpm
-System Call Overhead | 10385653.0 lps | 10419979.0 lps
+Then kvm or xen can provide their own implementations to support
+vcpu_is_preempted.
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
- arch/x86/include/asm/paravirt_types.h | 6 ++++++
- arch/x86/include/asm/spinlock.h | 8 ++++++++
- arch/x86/include/uapi/asm/kvm_para.h | 3 ++-
- arch/x86/kernel/kvm.c | 11 +++++++++++
- arch/x86/kernel/paravirt.c | 11 +++++++++++
- arch/x86/kvm/x86.c | 12 ++++++++++++
- 6 files changed, 50 insertions(+), 1 deletion(-)
+ arch/x86/include/asm/paravirt_types.h | 2 ++
+ arch/x86/include/asm/spinlock.h | 8 ++++++++
+ arch/x86/kernel/paravirt-spinlocks.c | 6 ++++++
+ 3 files changed, 16 insertions(+)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
-index 0f400c0..b1c7937 100644
+index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
-@@ -98,6 +98,10 @@ struct pv_time_ops {
- unsigned long long (*steal_clock)(int cpu);
+@@ -310,6 +310,8 @@ struct pv_lock_ops {
+
+ void (*wait)(u8 *ptr, u8 val);
+ void (*kick)(int cpu);
++
++ bool (*vcpu_is_preempted)(int cpu);
};
-+struct pv_vcpu_ops {
-+ bool (*vcpu_is_preempted)(int cpu);
-+};
-+
- struct pv_cpu_ops {
- /* hooks for various privileged instructions */
- unsigned long (*get_debugreg)(int regno);
-@@ -318,6 +322,7 @@ struct pv_lock_ops {
- struct paravirt_patch_template {
- struct pv_init_ops pv_init_ops;
- struct pv_time_ops pv_time_ops;
-+ struct pv_vcpu_ops pv_vcpu_ops;
- struct pv_cpu_ops pv_cpu_ops;
- struct pv_irq_ops pv_irq_ops;
- struct pv_mmu_ops pv_mmu_ops;
-@@ -327,6 +332,7 @@ struct paravirt_patch_template {
- extern struct pv_info pv_info;
- extern struct pv_init_ops pv_init_ops;
- extern struct pv_time_ops pv_time_ops;
-+extern struct pv_vcpu_ops pv_vcpu_ops;
- extern struct pv_cpu_ops pv_cpu_ops;
- extern struct pv_irq_ops pv_irq_ops;
- extern struct pv_mmu_ops pv_mmu_ops;
+ /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
-index 921bea7..52fd942 100644
+index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
-+#ifdef CONFIG_PARAVIRT
++#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
-+ return pv_vcpu_ops.vcpu_is_preempted(cpu);
++ return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
#include <asm/qspinlock.h>
/*
-diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
-index 94dc8ca..e9c12a1 100644
---- a/arch/x86/include/uapi/asm/kvm_para.h
-+++ b/arch/x86/include/uapi/asm/kvm_para.h
-@@ -45,7 +45,8 @@ struct kvm_steal_time {
- __u64 steal;
- __u32 version;
- __u32 flags;
-- __u32 pad[12];
-+ __u32 preempted;
-+ __u32 pad[11];
- };
-
- #define KVM_STEAL_ALIGNMENT_BITS 5
-diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
-index edbbfc8..0011bef 100644
---- a/arch/x86/kernel/kvm.c
-+++ b/arch/x86/kernel/kvm.c
-@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
- wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
- }
-
-+static bool kvm_vcpu_is_preempted(int cpu)
-+{
-+ struct kvm_steal_time *src;
-+
-+ src = &per_cpu(steal_time, cpu);
-+
-+ return !!src->preempted;
-+}
-+
- #ifdef CONFIG_SMP
- static void __init kvm_smp_prepare_boot_cpu(void)
- {
-@@ -488,6 +497,8 @@ void __init kvm_guest_init(void)
- kvm_guest_cpu_init();
- #endif
-
-+ pv_vcpu_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
-+
- /*
- * Hard lockup detection is enabled by default. Disable it, as guests
- * can get false positives too easily, for example if the host is
-diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
-index bbf3d59..7adb7e9 100644
---- a/arch/x86/kernel/paravirt.c
-+++ b/arch/x86/kernel/paravirt.c
-@@ -122,6 +122,7 @@ static void *get_call_destination(u8 type)
- struct paravirt_patch_template tmpl = {
- .pv_init_ops = pv_init_ops,
- .pv_time_ops = pv_time_ops,
-+ .pv_vcpu_ops = pv_vcpu_ops,
- .pv_cpu_ops = pv_cpu_ops,
- .pv_irq_ops = pv_irq_ops,
- .pv_mmu_ops = pv_mmu_ops,
-@@ -203,6 +204,11 @@ static u64 native_steal_clock(int cpu)
- return 0;
+diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
+index 2c55a00..2f204dd 100644
+--- a/arch/x86/kernel/paravirt-spinlocks.c
++++ b/arch/x86/kernel/paravirt-spinlocks.c
+@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
+ __raw_callee_save___native_queued_spin_unlock;
}
+static bool native_vcpu_is_preempted(int cpu)
@@ -149,57 +65,15 @@
+ return 0;
+}
+
- /* These are in entry.S */
- extern void native_iret(void);
- extern void native_usergs_sysret64(void);
-@@ -312,6 +318,10 @@ struct pv_time_ops pv_time_ops = {
- .steal_clock = native_steal_clock,
+ struct pv_lock_ops pv_lock_ops = {
+ #ifdef CONFIG_SMP
+ .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .wait = paravirt_nop,
+ .kick = paravirt_nop,
++ .vcpu_is_preempted = native_vcpu_is_preempted,
+ #endif /* SMP */
};
-
-+struct pv_vcpu_ops pv_vcpu_ops = {
-+ .vcpu_is_preempted = native_vcpu_is_preempted,
-+};
-+
- __visible struct pv_irq_ops pv_irq_ops = {
- .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
-@@ -458,6 +468,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
- };
-
- EXPORT_SYMBOL_GPL(pv_time_ops);
-+EXPORT_SYMBOL (pv_vcpu_ops);
- EXPORT_SYMBOL (pv_cpu_ops);
- EXPORT_SYMBOL (pv_mmu_ops);
- EXPORT_SYMBOL_GPL(pv_info);
-diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
-index 6c633de..0ffc5aa 100644
---- a/arch/x86/kvm/x86.c
-+++ b/arch/x86/kvm/x86.c
-@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
- return;
-
-+ vcpu->arch.st.steal.preempted = 0;
-+
- if (vcpu->arch.st.steal.version & 1)
- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
-
-@@ -2812,6 +2814,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-
- void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
- {
-+ if (vcpu->arch.st.msr_val & KVM_MSR_ENABLED)
-+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-+ &vcpu->arch.st.steal,
-+ sizeof(struct kvm_steal_time)) == 0) {
-+ vcpu->arch.st.steal.preempted = 1;
-+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
-+ &vcpu->arch.st.steal,
-+ sizeof(struct kvm_steal_time));
-+ }
-+
- kvm_x86_ops->vcpu_put(vcpu);
- kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = rdtsc();
+ EXPORT_SYMBOL(pv_lock_ops);
--
2.4.11