Re: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

From: Sean Christopherson <seanjc@google.com>
Date: 2021-01-25 18:36:57
Also in: kvm, kvmarm, linux-mips, lkml

+Cc the other architectures, I'm guessing this would be a helpful optimization
for all archs.

Quite a few comments, but they're all little more than nits.  Nice!

On Mon, Jan 25, 2021, David Stevens wrote:

From: David Stevens <redacted>

Use the range passed to mmu_notifer's invalidate_range_start to prevent

s/mmu_notifer/mmu_notifier.  

And maybe avoid calling out invalidate_range_start() by name?  It took me a few
reads to understand it's referring to the function, i.e. the start of the
invalidation, not the start of the range.

spurious page fault retries due to changes in unrelated host virtual
addresses.

This needs to elaborate on the exact scenario this is handling, as is it sounds
like KVM is tracking the history of invalidations or something.  Understanding
this patch requires a priori knowledge of mmu_notifier_count.  Something like:

  Track the range being invalidated by mmu_notifier and skip page fault
  retries if the fault address is not affected by the in-progress
  invalidation.  Disable the optimization if multiple invalidations are
  in-progress to keep things simple, as tracking multiple ranges has
  diminishing returns.

This has the secondary effect of greatly reducing the likelihood of extreme

Out of curiosity, is this really the _secondary_ effect?  I would expect this
change to primarily benefit scenarios where the invalidation has gotten
waylaid for whatever reason.

latency when handing a page fault due to another thread having been preempted
while modifying host virtual addresses.

Signed-off-by: David Stevens <redacted>
---

...

quoted hunk

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..79166288ed03 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c

@@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-			 bool *writable)
+			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+			 bool write, bool *writable)

Side topic, I'm all for creating a 'struct kvm_page_fault' or whatever to hold
all these variables.  The helper functions stacks are getting unwieldy.
Definitely doesn't need to be addressed here, this just reminded of how ugly
these stacks are.

quoted hunk

 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	bool async;

@@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	}
 
 	async = false;
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+				    write, writable, hva);
 	if (!async)
 		return false; /* *pfn has correct page already */

@@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+				    write, writable, hva);
 	return false;
 }

@@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	int r;
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))

@@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+			 write, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))

@@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 
 	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 	r = make_mmu_pages_available(vcpu);
 	if (r)

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..3171784139a4 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h

@@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	struct guest_walker walker;
 	int r;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
 	int max_level;

@@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+			 write_fault, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))

@@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 
 	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..b70097685249 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h

@@ -502,6 +502,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	unsigned long mmu_notifier_range_start;
+	unsigned long mmu_notifier_range_end;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;

@@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable);
+			       bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);

@@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 		return 1;
 	return 0;
 }
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+					 unsigned long mmu_seq,
+					 unsigned long hva)
+{
+	/*
+	 * Unlike mmu_notifier_retry, this function relies on
+	 * kvm->mmu_lock for consistency.

mmu_notifier_retry is the outlier due to PPC behavior.  Maybe just add a lockdep
annonation and call it good?

+	 */

This needs a comment to explicitly state that 'count > 1' cannot be done at
this time.  My initial thought is that it would be more intuitive to check for
'count > 1' here, but that would potentially check the wrong wrange when count
goes from 2->1.  The comment about persistence in invalidate_range_start() is a
good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
future.

+	if (unlikely(kvm->mmu_notifier_count)) {
+		if (kvm->mmu_notifier_range_start <= hva &&
+		    hva < kvm->mmu_notifier_range_end)

Combine these into a single statement?  I think the result is easier to read?

	if (unlikely(kvm->mmu_notifier_count) &&
	    kvm->mmu_notifier_range_start <= hva &&
	    hva < kvm->mmu_notifier_range_end)

quoted hunk

+			return 1;
+	}
+	if (kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fa9e3614d30e..d6e1ef5cb184 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c

@@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
+	if (likely(kvm->mmu_notifier_count == 1)) {
+		kvm->mmu_notifier_range_start = range->start;
+		kvm->mmu_notifier_range_end = range->end;
+	} else {
+		/**
+		 * Tracking multiple concurrent ranges has diminishing returns,
+		 * so just use the maximum range. This persists until after all
+		 * outstanding invalidation operations complete.
+		 */
+		kvm->mmu_notifier_range_start = 0;
+		kvm->mmu_notifier_range_end = ULONG_MAX;

Hrm, I don't think there's a corner case in practice, but ULONG_MAX is a legal
virtual address and range_end is exclusive.  E.g. passing hva=-1ul would get a
false negative in mmu_notifier_retry_hva().  It's not an issue as written
because hva is generated from the gfn, and hva can't be a kernel address.  I'm
guessing mmu_notifier also doesn't fire on kernel addresses.  I assume that all
holds true for other architectures, and adding checks in mmu_notifier_retry_hva()
feels like a waste of cycles, but it still bugs me. :-)

Maybe zero out range_end and explicitly check for that, just to be paranoid?

	if (unlikely(kvm->mmu_notifier_count) &&
	    (!kvm->mmu_notifier_range_end ||
            (kvm->mmu_notifier_range_start <= hva &&
             hva < kvm->mmu_notifier_range_end))

quoted hunk

+	}
 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 					     range->flags);
 	/* we've to flush the tlb before the pages can be freed */

@@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable)
+			       bool *writable, hva_t *hva)

Hrm, it feels like we should really split gfn->hva and hva->pfn into separate
operations, but pretty much every arch needs the hva error handling.  Splitting
it would probably do more harm than good, at least not without a lot of
additional refactoring.  Bummer.

 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

Newline here.

quoted hunk

+	if (hva)
+		*hva = addr;
 
 	if (addr == KVM_HVA_ERR_RO_BAD) {
 		if (writable)

@@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-				    write_fault, writable);
+				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);

-- 
2.30.0.280.ga3ce27912f-goog

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help