[PATCH 1/2] kvm: Fix mmu_notifier release race
From: Christoffer Dall <hidden>
Date: 2017-04-25 15:37:17
Also in:
kvm, kvmarm, lkml
On Mon, Apr 24, 2017 at 11:10:23AM +0100, Suzuki K Poulose wrote:
The KVM uses mmu_notifier (wherever available) to keep track
of the changes to the mm of the guest. The guest shadow page
tables are released when the VM exits via mmu_notifier->ops.release().
There is a rare chance that the mmu_notifier->release could be
called more than once via two different paths, which could end
up in use-after-free of kvm instance (such as [0]).
e.g:
thread A thread B
------- --------------
get_signal-> kvm_destroy_vm()->
do_exit-> mmu_notifier_unregister->
exit_mm-> kvm_arch_flush_shadow_all()->
exit_mmap-> spin_lock(&kvm->mmu_lock)
mmu_notifier_release-> ....
kvm_arch_flush_shadow_all()-> .....
... spin_lock(&kvm->mmu_lock) .....
spin_unlock(&kvm->mmu_lock)
kvm_arch_free_kvm()
*** use after free of kvm ***
This patch attempts to solve the problem by holding a reference to the KVM
for the mmu_notifier, which is dropped only from notifier->ops.release().
This will ensure that the KVM struct is available till we reach the
kvm_mmu_notifier_release, and the kvm_destroy_vm is called only from/after
it. So, we can unregister the notifier with no_release option and hence
avoiding the race above. However, we need to make sure that the KVM is
freed only after the mmu_notifier has finished processing the notifier due to
the following possible path of execution :
mmu_notifier_release -> kvm_mmu_notifier_release -> kvm_put_kvm ->
kvm_destroy_vm -> kvm_arch_free_kvm
[0] http://lkml.kernel.org/r/CAAeHK+x8udHKq9xa1zkTO6ax5E8Dk32HYWfaT05FMchL2cr48g at mail.gmail.com
Fixes: commit 85db06e514422 ("KVM: mmu_notifiers release method")
Reported-by: andreyknvl at google.com
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Kr?m?? <redacted>
Cc: Marc Zyngier <redacted>
Cc: Christoffer Dall <redacted>
Cc: andreyknvl at google.com
Cc: Marc Zyngier <redacted>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>This looks good to me, but we should have some KVM generic experts look at it as well. Reviewed-by: Christoffer Dall [off-list ref]
quoted hunk ↗ jump to hunk
--- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 59 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 7 deletions(-)diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d025074..561e968 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h@@ -424,6 +424,7 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + struct rcu_head mmu_notifier_rcu; #endif long tlbs_dirty; struct list_head devices;diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88257b3..2c3fdd4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c@@ -471,6 +471,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); kvm_arch_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); + kvm_put_kvm(kvm); } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {@@ -486,8 +487,46 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { static int kvm_init_mmu_notifier(struct kvm *kvm) { + int rc; kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; - return mmu_notifier_register(&kvm->mmu_notifier, current->mm); + rc = mmu_notifier_register(&kvm->mmu_notifier, current->mm); + /* + * We hold a reference to KVM here to make sure that the KVM + * doesn't get free'd before ops->release() completes. + */ + if (!rc) + kvm_get_kvm(kvm); + return rc; +} + +static void kvm_free_vm_rcu(struct rcu_head *rcu) +{ + struct kvm *kvm = container_of(rcu, struct kvm, mmu_notifier_rcu); + kvm_arch_free_vm(kvm); +} + +static void kvm_flush_shadow_mmu(struct kvm *kvm) +{ + /* + * We hold a reference to kvm instance for mmu_notifier and is + * only released when ops->release() is called via exit_mmap path. + * So, when we reach here ops->release() has been called already, which + * flushes the shadow page tables. Hence there is no need to call the + * release() again when we unregister the notifier. However, we need + * to delay freeing up the kvm until the release() completes, since + * we could reach here via : + * kvm_mmu_notifier_release() -> kvm_put_kvm() -> kvm_destroy_vm() + */ + mmu_notifier_unregister_no_release(&kvm->mmu_notifier, kvm->mm); +} + +static void kvm_free_vm(struct kvm *kvm) +{ + /* + * Wait until the mmu_notifier has finished the release(). + * See comments above in kvm_flush_shadow_mmu. + */ + mmu_notifier_call_srcu(&kvm->mmu_notifier_rcu, kvm_free_vm_rcu); } #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */@@ -497,6 +536,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return 0; } +static void kvm_flush_shadow_mmu(struct kvm *kvm) +{ + kvm_arch_flush_shadow_all(kvm); +} + +static void kvm_free_vm(struct kvm *kvm) +{ + kvm_arch_free_vm(kvm); +} + #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ static struct kvm_memslots *kvm_alloc_memslots(void)@@ -733,18 +782,14 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); -#else - kvm_arch_flush_shadow_all(kvm); -#endif + kvm_flush_shadow_mmu(kvm); kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, kvm->memslots[i]); cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); - kvm_arch_free_vm(kvm); + kvm_free_vm(kvm); preempt_notifier_dec(); hardware_disable_all(); mmdrop(mm);-- 2.7.4