Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create... | linux-api

[PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Chao Peng <hidden> · 2022-12-02
[PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Fuad Tabba <hidden> · 2022-12-06
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-07
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2022-12-13
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-19
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2022-12-19
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-20
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2022-12-20
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-21
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2022-12-22
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Vlastimil Babka <hidden> · 2023-01-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Kirill A. Shutemov <hidden> · 2023-01-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Vlastimil Babka <hidden> · 2023-02-13
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2023-01-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Sean Christopherson <seanjc@google.com> · 2023-01-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Vlastimil Babka <hidden> · 2023-01-24
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Sean Christopherson <seanjc@google.com> · 2022-12-22
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Huang, Kai <hidden> · 2022-12-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2022-12-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Kirill A. Shutemov <hidden> · 2023-01-23
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Vlastimil Babka <hidden> · 2023-02-13
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Michael Roth <hidden> · 2023-02-13
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Sean Christopherson <seanjc@google.com> · 2023-01-13
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2023-01-17
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Sean Christopherson <seanjc@google.com> · 2023-01-17
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2023-01-18
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Isaku Yamahata <hidden> · 2023-01-18
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Alexey Kardashevskiy <hidden> · 2023-02-22
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Chao Peng <hidden> · 2023-02-24
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Nikunj A. Dadhania <hidden> · 2023-02-16
Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory · Michael Roth <hidden> · 2023-03-20
[PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Fabiano Rosas <hidden> · 2022-12-06
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2022-12-07
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Fuad Tabba <hidden> · 2022-12-06
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2022-12-07
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Borislav Petkov <bp@alien8.de> · 2022-12-16
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2022-12-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Borislav Petkov <bp@alien8.de> · 2022-12-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2022-12-20
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chenyi Qiang <hidden> · 2022-12-28
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2023-01-03
RE: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Wang, Wei W <hidden> · 2023-01-03
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-01-03
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2023-01-05
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-01-13
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Binbin Wu <hidden> · 2023-01-17
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Chao Peng <hidden> · 2023-01-17
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-01-17
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Isaku Yamahata <hidden> · 2023-02-09
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-02-10
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Isaku Yamahata <hidden> · 2023-02-13
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-02-14
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Nicolas Saenz Julienne <hidden> · 2023-05-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-05-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Nicolas Saenz Julienne <hidden> · 2023-05-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Sean Christopherson <seanjc@google.com> · 2023-05-19
Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes · Nicolas Saenz Julienne <hidden> · 2023-05-23
[PATCH v10 4/9] KVM: Add KVM_EXIT_MEMORY_FAULT exit · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 4/9] KVM: Add KVM_EXIT_MEMORY_FAULT exit · Fuad Tabba <hidden> · 2022-12-06
Re: [PATCH v10 4/9] KVM: Add KVM_EXIT_MEMORY_FAULT exit · Chao Peng <hidden> · 2022-12-07
Re: [PATCH v10 4/9] KVM: Add KVM_EXIT_MEMORY_FAULT exit · Sean Christopherson <seanjc@google.com> · 2023-01-13
[PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Fuad Tabba <hidden> · 2022-12-05
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-06
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Fuad Tabba <hidden> · 2022-12-06
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-07
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Xiaoyao Li <hidden> · 2022-12-08
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-08
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Xiaoyao Li <hidden> · 2022-12-13
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-19
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Borislav Petkov <bp@alien8.de> · 2022-12-19
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-20
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Borislav Petkov <bp@alien8.de> · 2022-12-20
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2022-12-21
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Jarkko Sakkinen <jarkko@kernel.org> · 2023-01-05
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2023-01-06
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Sean Christopherson <seanjc@google.com> · 2023-01-09
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2023-01-10
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Vishal Annapurve <hidden> · 2023-01-10
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Sean Christopherson <seanjc@google.com> · 2023-01-13
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Chao Peng <hidden> · 2023-01-17
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Jarkko Sakkinen <jarkko@kernel.org> · 2023-01-20
Re: [PATCH v10 3/9] KVM: Extend the memslot to support fd-based private memory · Jarkko Sakkinen <jarkko@kernel.org> · 2023-01-20
[PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Fuad Tabba <hidden> · 2022-12-05
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Chao Peng <hidden> · 2022-12-06
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Fuad Tabba <hidden> · 2022-12-06
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Chao Peng <hidden> · 2022-12-09
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Isaku Yamahata <hidden> · 2022-12-07
Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry · Chao Peng <hidden> · 2022-12-07
[PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Yuan Yao <hidden> · 2022-12-07
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Chao Peng <hidden> · 2022-12-08
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Yuan Yao <hidden> · 2022-12-09
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Fuad Tabba <hidden> · 2022-12-07
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Chao Peng <hidden> · 2022-12-08
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Fuad Tabba <hidden> · 2022-12-09
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Chao Peng <hidden> · 2022-12-12
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Huang, Kai <hidden> · 2022-12-13
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Chao Peng <hidden> · 2022-12-19
Re: [PATCH v10 6/9] KVM: Unmap existing mappings when change the memory attributes · Sean Christopherson <seanjc@google.com> · 2023-01-13
[PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Isaku Yamahata <hidden> · 2022-12-05
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Chao Peng <hidden> · 2022-12-06
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Isaku Yamahata <hidden> · 2022-12-07
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Chao Peng <hidden> · 2022-12-08
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Sean Christopherson <seanjc@google.com> · 2023-01-13
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Sean Christopherson <seanjc@google.com> · 2023-01-13
Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed · Chao Peng <hidden> · 2023-01-28
[PATCH v10 8/9] KVM: Handle page fault for private memory · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Yuan Yao <hidden> · 2022-12-08
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Chao Peng <hidden> · 2022-12-08
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Yuan Yao <hidden> · 2022-12-09
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Fuad Tabba <hidden> · 2022-12-09
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Chao Peng <hidden> · 2022-12-12
Re: [PATCH v10 8/9] KVM: Handle page fault for private memory · Sean Christopherson <seanjc@google.com> · 2023-01-13
[PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Chao Peng <hidden> · 2022-12-02
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Fuad Tabba <hidden> · 2022-12-09
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Vishal Annapurve <hidden> · 2023-01-05
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Chao Peng <hidden> · 2023-01-06
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Sean Christopherson <seanjc@google.com> · 2023-01-14
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Chao Peng <hidden> · 2023-01-17
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Sean Christopherson <seanjc@google.com> · 2023-01-17
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Chao Peng <hidden> · 2023-01-18
Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE · Chao Peng <hidden> · 2023-01-28
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-01-14
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Kirill A. Shutemov <hidden> · 2023-01-16
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Chao Peng <hidden> · 2023-01-17
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Fuad Tabba <hidden> · 2023-01-17
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Isaku Yamahata <hidden> · 2023-01-19
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-01-19
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Isaku Yamahata <hidden> · 2023-01-19
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-01-24
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Isaku Yamahata <hidden> · 2023-02-08
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Michael Roth <hidden> · 2023-02-13
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Chao Peng <hidden> · 2023-02-21
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Michael Roth <hidden> · 2023-03-23
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Chao Peng <hidden> · 2023-03-24
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-04-12
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Chao Peng <hidden> · 2023-04-17
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-04-17
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Liam Merwick <hidden> · 2023-01-24
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-01-25
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Kirill A. Shutemov <hidden> · 2023-01-25
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Liam Merwick <hidden> · 2023-01-25
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-04-13
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Kirill A. Shutemov <hidden> · 2023-04-13
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Mike Rapoport <rppt@kernel.org> · 2023-02-16
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · David Hildenbrand <hidden> · 2023-02-16
Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM · Sean Christopherson <seanjc@google.com> · 2023-02-22

Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory

From: Chao Peng <hidden>
Date: 2023-01-17 12:51:00
Also in: kvm, linux-arch, linux-doc, linux-fsdevel, linux-mm, lkml, qemu-devel

On Fri, Jan 13, 2023 at 09:54:41PM +0000, Sean Christopherson wrote:

On Fri, Dec 02, 2022, Chao Peng wrote:

quoted

The system call is currently wired up for x86 arch.

Building on other architectures (except for arm64 for some reason) yields:

  CALL    /.../scripts/checksyscalls.sh
  <stdin>:1565:2: warning: #warning syscall memfd_restricted not implemented [-Wcpp]

Do we care?  It's the only such warning, which makes me think we either need to
wire this up for all architectures, or explicitly document that it's unsupported.

I'm a bit conservative and prefer enabling only on x86 where we know the
exact usecase. For the warning we can get rid of by changing
scripts/checksyscalls.sh, just like __IGNORE_memfd_secret:

https://lkml.kernel.org/r/20210518072034.31572-7-rppt@kernel.org

quoted

Signed-off-by: Kirill A. Shutemov <redacted>
Signed-off-by: Chao Peng <redacted>
---

...

quoted

diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
new file mode 100644
index 000000000000..c2700c5daa43
--- /dev/null
+++ b/include/linux/restrictedmem.h

@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_RESTRICTEDMEM_H

Missing

 #define _LINUX_RESTRICTEDMEM_H

which causes fireworks if restrictedmem.h is included more than once.

quoted

+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/pfn_t.h>

...

quoted

+static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
+					 struct page **pagep, int *order)
+{
+	return -1;

This should be a proper -errno, though in the current incarnation of things it's
a moot point because no stub is needed.  KVM can (and should) easily provide its
own stub for this one.

quoted

+}
+
+static inline bool file_is_restrictedmem(struct file *file)
+{
+	return false;
+}
+
+static inline void restrictedmem_error_page(struct page *page,
+					    struct address_space *mapping)
+{
+}
+
+#endif /* CONFIG_RESTRICTEDMEM */
+
+#endif /* _LINUX_RESTRICTEDMEM_H */

...

quoted

diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
new file mode 100644
index 000000000000..56953c204e5c
--- /dev/null
+++ b/mm/restrictedmem.c

@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "linux/sbitmap.h"
+#include <linux/pagemap.h>
+#include <linux/pseudo_fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/syscalls.h>
+#include <uapi/linux/falloc.h>
+#include <uapi/linux/magic.h>
+#include <linux/restrictedmem.h>
+
+struct restrictedmem_data {

Any objection to simply calling this "restrictedmem"?  And then using either "rm"
or "rmem" for local variable names?  I kept reading "data" as the underyling data
being written to the page, as opposed to the metadata describing the restrictedmem
instance.

quoted

+	struct mutex lock;
+	struct file *memfd;
+	struct list_head notifiers;
+};
+
+static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
+					   pgoff_t start, pgoff_t end)
+{
+	struct restrictedmem_notifier *notifier;
+
+	mutex_lock(&data->lock);

This can be a r/w semaphore instead of a mutex, that way punching holes at multiple
points in the file can at least run the notifiers in parallel.  The actual allocation
by shmem will still be serialized, but I think it's worth the simple optimization
since zapping and flushing in KVM may be somewhat slow.

quoted

+	list_for_each_entry(notifier, &data->notifiers, list) {
+		notifier->ops->invalidate_start(notifier, start, end);

Two major design issues that we overlooked long ago:

  1. Blindly invoking notifiers will not scale.  E.g. if userspace configures a
     VM with a large number of convertible memslots that are all backed by a
     single large restrictedmem instance, then converting a single page will
     result in a linear walk through all memslots.  I don't expect anyone to
     actually do something silly like that, but I also never expected there to be
     a legitimate usecase for thousands of memslots.

  2. This approach fails to provide the ability for KVM to ensure a guest has
     exclusive access to a page.  As discussed in the past, the kernel can rely
     on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
     only for SNP and TDX VMs.  For VMs where userspace is trusted to some extent,
     e.g. SEV, there is value in ensuring a 1:1 association.

     And probably more importantly, relying on hardware for SNP and TDX yields a
     poor ABI and complicates KVM's internals.  If the kernel doesn't guarantee a
     page is exclusive to a guest, i.e. if userspace can hand out the same page
     from a restrictedmem instance to multiple VMs, then failure will occur only
     when KVM tries to assign the page to the second VM.  That will happen deep
     in KVM, which means KVM needs to gracefully handle such errors, and it means
     that KVM's ABI effectively allows plumbing garbage into its memslots.

It may not be a valid usage, but in my TDX environment I do meet below
issue.

kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x80000000 ua=0x7fe1ebfff000 ret=0
kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc00000 size=0x400000 ua=0x7fe271579000 ret=0
kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda0000 size=0x20000 ua=0x7fe1ec09f000 ret=-22

Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
and slot#2 fails due to below exclusive check.

Currently I changed QEMU code to mark these alias slots as shared
instead of private but I'm not 100% confident this is correct fix.

Rather than use a simple list of notifiers, this appears to be yet another
opportunity to use an xarray.  Supporting sharing of restrictedmem will be
non-trivial, but IMO we should punt that to the future since it's still unclear
exactly how sharing will work.

An xarray will solve #1 by notifying only the consumers (memslots) that are bound
to the affected range.

And for #2, it's relatively straightforward (knock wood) to detect existing
entries, i.e. if the user wants exclusive access to memory, then the bind operation
can be reject if there's an existing entry.

VERY lightly tested code snippet at the bottom (will provide link to fully worked
code in cover letter).

quoted

+static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
+				     loff_t offset, loff_t len)
+{
+	int ret;
+	pgoff_t start, end;
+	struct file *memfd = data->memfd;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	restrictedmem_invalidate_start(data, start, end);
+	ret = memfd->f_op->fallocate(memfd, mode, offset, len);
+	restrictedmem_invalidate_end(data, start, end);

The lock needs to be end for the entire duration of the hole punch, i.e. needs to
be taken before invalidate_start() and released after invalidate_end().  If a user
(un)binds/(un)registers after invalidate_state(), it will see an unpaired notification,
e.g. could leave KVM with incorrect notifier counts.

quoted

+
+	return ret;
+}

What I ended up with for an xarray-based implementation.  I'm very flexible on
names and whatnot, these are just what made sense to me.

static long restrictedmem_punch_hole(struct restrictedmem *rm, int mode,
				     loff_t offset, loff_t len)
{
	struct restrictedmem_notifier *notifier;
	struct file *memfd = rm->memfd;
	unsigned long index;
	pgoff_t start, end;
	int ret;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	down_read(&rm->lock);

	xa_for_each_range(&rm->bindings, index, notifier, start, end)
		notifier->ops->invalidate_start(notifier, start, end);

	ret = memfd->f_op->fallocate(memfd, mode, offset, len);

	xa_for_each_range(&rm->bindings, index, notifier, start, end)
		notifier->ops->invalidate_end(notifier, start, end);

	up_read(&rm->lock);

	return ret;
}

int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
		       struct restrictedmem_notifier *notifier, bool exclusive)
{
	struct restrictedmem *rm = file->f_mapping->private_data;
	int ret = -EINVAL;

	down_write(&rm->lock);

	/* Non-exclusive mappings are not yet implemented. */
	if (!exclusive)
		goto out_unlock;

	if (!xa_empty(&rm->bindings)) {
		if (exclusive != rm->exclusive)
			goto out_unlock;

		if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
			goto out_unlock;
	}

	xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);
	rm->exclusive = exclusive;
	ret = 0;
out_unlock:
	up_write(&rm->lock);
	return ret;
}
EXPORT_SYMBOL_GPL(restrictedmem_bind);

void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
			  struct restrictedmem_notifier *notifier)
{
	struct restrictedmem *rm = file->f_mapping->private_data;

	down_write(&rm->lock);
	xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
	synchronize_rcu();
	up_write(&rm->lock);
}
EXPORT_SYMBOL_GPL(restrictedmem_unbind);

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help