Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory
From: Isaku Yamahata <hidden>
Date: 2023-07-21 22:27:12
Also in:
kvm, kvm-riscv, kvmarm, linux-fsdevel, linux-mips, linux-mm, linux-riscv, linux-security-module, linuxppc-dev, lkml
Subsystem:
kernel virtual machine (kvm), the rest · Maintainers:
Paolo Bonzini, Linus Torvalds
On Fri, Jul 21, 2023 at 02:13:14PM +0800, Yuan Yao [off-list ref] wrote:
On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:quoted
TODO Cc: Fuad Tabba <redacted> Cc: Vishal Annapurve <redacted> Cc: Ackerley Tng <redacted> Cc: Jarkko Sakkinen <jarkko@kernel.org> Cc: Maciej Szmigiero <redacted> Cc: Vlastimil Babka <redacted> Cc: David Hildenbrand <redacted> Cc: Quentin Perret <redacted> Cc: Michael Roth <redacted> Cc: Wang <redacted> Cc: Liam Merwick <redacted> Cc: Isaku Yamahata <redacted> Co-developed-by: Kirill A. Shutemov <redacted> Signed-off-by: Kirill A. Shutemov <redacted> Co-developed-by: Yu Zhang <redacted> Signed-off-by: Yu Zhang <redacted> Co-developed-by: Chao Peng <redacted> Signed-off-by: Chao Peng <redacted> Co-developed-by: Ackerley Tng <redacted> Signed-off-by: Ackerley Tng <redacted> Signed-off-by: Sean Christopherson <seanjc@google.com> --- include/linux/kvm_host.h | 48 +++ include/uapi/linux/kvm.h | 14 +- include/uapi/linux/magic.h | 1 + virt/kvm/Kconfig | 4 + virt/kvm/Makefile.kvm | 1 + virt/kvm/guest_mem.c | 591 +++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 58 +++- virt/kvm/kvm_mm.h | 38 +++ 8 files changed, 750 insertions(+), 5 deletions(-) create mode 100644 virt/kvm/guest_mem.cdiff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 97db63da6227..0d1e2ee8ae7a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h@@ -592,8 +592,20 @@ struct kvm_memory_slot { u32 flags; short id; u16 as_id; + +#ifdef CONFIG_KVM_PRIVATE_MEM + struct { + struct file __rcu *file; + pgoff_t pgoff; + } gmem; +#endif }; +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +{ + return slot && (slot->flags & KVM_MEM_PRIVATE); +} + static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;@@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif +/* + * Arch code must define kvm_arch_has_private_mem if support for private memory + * is enabled. + */ +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +static inline bool kvm_arch_has_private_mem(struct kvm *kvm) +{ + return false; +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot;@@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);@@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); + +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; +} +#else +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return false; +} #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ +#ifdef CONFIG_KVM_PRIVATE_MEM +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order); +#else +static inline int kvm_gmem_get_pfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t *pfn, int *max_order) +{ + KVM_BUG_ON(1, kvm); + return -EIO; +} +#endif /* CONFIG_KVM_PRIVATE_MEM */ + #endifdiff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f065c57db327..9b344fc98598 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h@@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 { __u64 guest_phys_addr; __u64 memory_size; __u64 userspace_addr; - __u64 pad[16]; + __u64 gmem_offset; + __u32 gmem_fd; + __u32 pad1; + __u64 pad2[14]; }; /*@@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_PRIVATE (1UL << 2) /* for KVM_IRQ_LINE */ struct kvm_irq_level {@@ -2284,4 +2288,12 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) + +struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; +}; + #endif /* __LINUX_KVM_H */diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..15041aa7d9ae 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h@@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 8375bc49f97d..3ee3205e0b39 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig@@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER config KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_GENERIC_MMU_NOTIFIER bool + +config KVM_PRIVATE_MEM + select XARRAY_MULTI + booldiff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 2c27d5d0c367..a5a61bbe7f4c 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm@@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.odiff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c new file mode 100644 index 000000000000..1b705fd63fa8 --- /dev/null +++ b/virt/kvm/guest_mem.c@@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/backing-dev.h> +#include <linux/falloc.h> +#include <linux/kvm_host.h> +#include <linux/pagemap.h> +#include <linux/pseudo_fs.h> + +#include <uapi/linux/magic.h> + +#include "kvm_mm.h" + +static struct vfsmount *kvm_gmem_mnt; + +struct kvm_gmem { + struct kvm *kvm; + struct xarray bindings; + struct list_head entry; +}; + +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index) +{ + struct folio *folio; + + /* TODO: Support huge pages. */ + folio = filemap_grab_folio(file->f_mapping, index); + if (!folio) + return NULL; + + /* + * Use the up-to-date flag to track whether or not the memory has been + * zeroed before being handed off to the guest. There is no backing + * storage for the memory, so the folio will remain up-to-date until + * it's removed. + * + * TODO: Skip clearing pages when trusted firmware will do it when + * assigning memory to the guest. + */ + if (!folio_test_uptodate(folio)) { + unsigned long nr_pages = folio_nr_pages(folio); + unsigned long i; + + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + + folio_mark_uptodate(folio); + } + + /* + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ + return folio; +} + +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + bool flush = false; + + KVM_MMU_LOCK(kvm); + + kvm_mmu_invalidate_begin(kvm); + + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + pgoff_t pgoff = slot->gmem.pgoff; + + struct kvm_gfn_range gfn_range = { + .start = slot->base_gfn + max(pgoff, start) - pgoff, + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, + .slot = slot, + .may_block = true, + }; + + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + + KVM_MMU_UNLOCK(kvm); +} + +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + struct kvm *kvm = gmem->kvm; + + KVM_MMU_LOCK(kvm); + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) + kvm_mmu_invalidate_end(kvm); + KVM_MMU_UNLOCK(kvm); +} + +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct list_head *gmem_list = &inode->i_mapping->private_list; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + struct kvm_gmem *gmem; + + /* + * Bindings must stable across invalidation to ensure the start+end + * are balanced. + */ + filemap_invalidate_lock(inode->i_mapping); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_begin(gmem, start, end); + + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_end(gmem, start, end); + + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start, index, end; + int r; + + /* Dedicated guest is immutable by default. */ + if (offset + len > i_size_read(inode)) + return -EINVAL; + + filemap_invalidate_lock_shared(mapping); + + start = offset >> PAGE_SHIFT; + end = (offset + len) >> PAGE_SHIFT; + + r = 0; + for (index = start; index < end; ) { + struct folio *folio; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + folio = kvm_gmem_get_folio(inode, index); + if (!folio) { + r = -ENOMEM; + break; + } + + index = folio_next_index(folio); + + folio_unlock(folio); + folio_put(folio); + + /* 64-bit only, wrapping the index should be impossible. */ + if (WARN_ON_ONCE(!index)) + break; + + cond_resched(); + } + + filemap_invalidate_unlock_shared(mapping); + + return r; +} + +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + int ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = kvm_gmem_punch_hole(file_inode(file), offset, len); + else + ret = kvm_gmem_allocate(file_inode(file), offset, len); + + if (!ret) + file_modified(file); + return ret; +} + +static int kvm_gmem_release(struct inode *inode, struct file *file) +{ + struct kvm_gmem *gmem = file->private_data; + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + + filemap_invalidate_lock(inode->i_mapping); + + /* + * Prevent concurrent attempts to *unbind* a memslot. This is the last + * reference to the file and thus no new bindings can be created, but + * dereferencing the slot for existing bindings needs to be protected + * against memslot updates, specifically so that unbind doesn't race + * and free the memslot (kvm_gmem_get_file() will return NULL). + */ + mutex_lock(&kvm->slots_lock); + + xa_for_each(&gmem->bindings, index, slot) + rcu_assign_pointer(slot->gmem.file, NULL); + + synchronize_rcu(); + + /* + * All in-flight operations are gone and new bindings can be created. + * Zap all SPTEs pointed at by this file. Do not free the backing + * memory, as its lifetime is associated with the inode, not the file. + */ + kvm_gmem_invalidate_begin(gmem, 0, -1ul); + kvm_gmem_invalidate_end(gmem, 0, -1ul); + + mutex_unlock(&kvm->slots_lock); + + list_del(&gmem->entry); + + filemap_invalidate_unlock(inode->i_mapping); + + xa_destroy(&gmem->bindings); + kfree(gmem); + + kvm_put_kvm(kvm); + + return 0; +} + +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) +{ + struct file *file; + + rcu_read_lock(); + + file = rcu_dereference(slot->gmem.file); + if (file && !get_file_rcu(file)) + file = NULL; + + rcu_read_unlock(); + + return file; +} + +static const struct file_operations kvm_gmem_fops = { + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +}; + +static int kvm_gmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) +{ + struct list_head *gmem_list = &mapping->private_list; + struct kvm_memory_slot *slot; + struct kvm_gmem *gmem; + unsigned long index; + pgoff_t start, end; + gfn_t gfn; + + filemap_invalidate_lock_shared(mapping); + + start = page->index; + end = start + thp_nr_pages(page); + + list_for_each_entry(gmem, gmem_list, entry) { + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + for (gfn = start; gfn < end; gfn++) {Why the start end range used as gfn here ? the page->index is offset of inode's page cache mapping and gmem address space, IIUC, gfn calculation should follow same way as kvm_gmem_invalidate_begin().
Also instead of sending signal multiple times, we can utilize lsb argument. Something like this?
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index a14eaac9dbad..8072ac901855 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c@@ -349,20 +349,35 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) struct kvm_gmem *gmem; unsigned long index; pgoff_t start, end; - gfn_t gfn; + unsigned int order; + int nr_pages; + gfn_t gfn, gfn_end; filemap_invalidate_lock_shared(mapping); start = page->index; end = start + thp_nr_pages(page); + nr_pages = thp_nr_pages(page); + order = thp_order(page); list_for_each_entry(gmem, gmem_list, entry) { xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { - for (gfn = start; gfn < end; gfn++) { - if (WARN_ON_ONCE(gfn < slot->base_gfn || - gfn >= slot->base_gfn + slot->npages)) - continue; + gfn = slot->base_gfn + page->index - slot->gmem.pgoff; + if (page->index + nr_pages <= slot->gmem.pgoff + slot->npages && + !(gfn & ~((1ULL << order) - 1))) { + /* + * FIXME: Tell userspace that the *private* + * memory encountered an error. + */ + send_sig_mceerr(BUS_MCEERR_AR, + (void __user *)gfn_to_hva_memslot(slot, gfn), + order, current); + break; + } + + gfn_end = min(gfn + nr_pages, slot->base_gfn + slot->npages); + for (; gfn < gfn_end; gfn++) { /* * FIXME: Tell userspace that the *private* * memory encountered an error.
--
Isaku Yamahata <isaku.yamahata@gmail.com>
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel