Thread (36 messages) 36 messages, 7 authors, 2023-06-21

Re: [PATCH mm-unstable v2 07/10] kvm/powerpc: add kvm_arch_test_clear_young()

From: Yu Zhao <hidden>
Date: 2023-06-21 00:39:00
Also in: kvm, kvmarm, linux-arm-kernel, linux-doc, linux-mm, linux-trace-kernel, lkml

On Tue, Jun 20, 2023 at 1:48 AM Nicholas Piggin [off-list ref] wrote:
On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
quoted
Implement kvm_arch_test_clear_young() to support the fast path in
mmu_notifier_ops->test_clear_young().

It focuses on a simple case, i.e., radix MMU sets the accessed bit in
KVM PTEs and VMs are not nested, where it can rely on RCU and
pte_xchg() to safely clear the accessed bit without taking
kvm->mmu_lock. Complex cases fall back to the existing slow path
where kvm->mmu_lock is then taken.

Signed-off-by: Yu Zhao <redacted>
---
 arch/powerpc/include/asm/kvm_host.h    |  8 ++++
 arch/powerpc/include/asm/kvm_ppc.h     |  1 +
 arch/powerpc/kvm/book3s.c              |  6 +++
 arch/powerpc/kvm/book3s.h              |  1 +
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c           |  5 +++
 6 files changed, 80 insertions(+)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 14ee0dece853..75c260ea8a9e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -883,4 +883,12 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}

+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+     return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
+            cpu_has_feature(CPU_FTR_HVMODE) && cpu_has_feature(CPU_FTR_ARCH_300) &&
+            radix_enabled();
This could probably be radix_enabled() && !kvmhv_on_pseries().
Will do. (I used !kvmhv_on_pseries() in v1 but had second thoughts on
moving kvmhv_on_pseries() into this file.)
Although unclear why not nested hypervisor... I'd have to think about it a bit
more. Do you have any idea what might go wrong, or just didn't have the
time to consider it? (Not just powerpc nested but any others).
Yes, this series excludes nested KVM support on all architures. The
common reason for such a decision on powerpc and x86 (aarch64 doesn't
support nested yet) is that it's quite challenging to make the rmap, a
complex data structure that maps one PFN to multiple GFNs, lockless.
(See kvmhv_insert_nest_rmap().) It might be possible, however, the
potential ROI would be in question.
quoted
+}
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 79a9c0bb8bba..ff1af6a7b44f 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,6 +287,7 @@ struct kvmppc_ops {
      bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
      bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
      bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+     bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range);
      bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
      void (*free_memslot)(struct kvm_memory_slot *slot);
      int (*init_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 686d8d9eda3e..37bf40b0c4ff 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -899,6 +899,12 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
      return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }

+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+     return !kvm->arch.kvm_ops->test_clear_young ||
+            kvm->arch.kvm_ops->test_clear_young(kvm, range);
+}
+
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
      return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 58391b4b32ed..fa2659e21ccc 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -12,6 +12,7 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);

 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 3b65b3b11041..0a392e9a100a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -1088,6 +1088,65 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
      return ref;
 }

+bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+     bool err;
+     gfn_t gfn = range->start;
+
+     rcu_read_lock();
+
+     err = !kvm_is_radix(kvm);
+     if (err)
+             goto unlock;
+
+     /*
+      * Case 1:  This function          kvmppc_switch_mmu_to_hpt()
+      *
+      *          rcu_read_lock()
+      *          Test kvm_is_radix()    kvm->arch.radix = 0
+      *          Use kvm->arch.pgtable  synchronize_rcu()
+      *          rcu_read_unlock()
+      *                                 kvmppc_free_radix()
+      *
+      *
+      * Case 2:  This function          kvmppc_switch_mmu_to_radix()
+      *
+      *                                 kvmppc_init_vm_radix()
+      *                                 smp_wmb()
+      *          Test kvm_is_radix()    kvm->arch.radix = 1
+      *          smp_rmb()
+      *          Use kvm->arch.pgtable
+      */
+     smp_rmb();
Comment could stand to expand slightly on what you are solving, in
words.
Will do.
If you use synchronize_rcu() on both sides, you wouldn't need the
barrier, right?
Case 2 is about memory ordering, which is orthogonal to case 1 (RCU
freeing). So we need the r/w barrier regardless.
quoted
+     while (gfn < range->end) {
+             pte_t *ptep;
+             pte_t old, new;
+             unsigned int shift;
+
+             ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift);
+             if (!ptep)
+                     goto next;
+
+             VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep)));
Not really appropriate at the KVM level. mm enforces this kind of
thing (with notifiers).
Will remove this.
quoted
+
+             old = READ_ONCE(*ptep);
+             if (!pte_present(old) || !pte_young(old))
+                     goto next;
+
+             new = pte_mkold(old);
+
+             if (kvm_should_clear_young(range, gfn))
+                     pte_xchg(ptep, old, new);
*Probably* will work. I can't think of a reason why not at the
moment anyway :)
My reasoning:
* It should work if we only change the dedicated A bit, i.e., not
shared for other purposes, because races are safe (the case here).
* It may not work for x86 EPT without the A bit (excluded in this
series) where accesses are trapped by the R/X bits, because races in
changing the R/X bits can be unsafe.
quoted
+next:
+             gfn += shift ? BIT(shift - PAGE_SHIFT) : 1;
+     }
+unlock:
+     rcu_read_unlock();
+
+     return err;
+}
+
 /* Returns the number of PAGE_SIZE pages that are dirty */
 static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                              struct kvm_memory_slot *memslot, int pagenum)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 130bafdb1430..20a81ec9fde8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5262,6 +5262,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
      spin_lock(&kvm->mmu_lock);
      kvm->arch.radix = 0;
      spin_unlock(&kvm->mmu_lock);
+     /* see the comments in kvm_test_clear_young_hv() */
I'm guilty of such comments at times, but it wouldn't hurt to say
        /* Finish concurrent kvm_test_clear_young_hv access to page tables */

Then you know where to look for more info and you have a vague
idea what it's for.
Will do.
quoted
+     synchronize_rcu();
quoted
      kvmppc_free_radix(kvm);

      lpcr = LPCR_VPM1;
@@ -5286,6 +5288,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
      if (err)
              return err;
      kvmppc_rmap_reset(kvm);
+     /* see the comments in kvm_test_clear_young_hv() */
+     smp_wmb();
      /* Mutual exclusion with kvm_unmap_gfn_range etc. */
      spin_lock(&kvm->mmu_lock);
      kvm->arch.radix = 1;
@@ -6185,6 +6189,7 @@ static struct kvmppc_ops kvm_ops_hv = {
      .unmap_gfn_range = kvm_unmap_gfn_range_hv,
      .age_gfn = kvm_age_gfn_hv,
      .test_age_gfn = kvm_test_age_gfn_hv,
+     .test_clear_young = kvm_test_clear_young_hv,
      .set_spte_gfn = kvm_set_spte_gfn_hv,
      .free_memslot = kvmppc_core_free_memslot_hv,
      .init_vm =  kvmppc_core_init_vm_hv,
Thanks for looking at the powerpc conversion!
Thanks for reviewing!
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help