--- v3
+++ v5
@@ -1,64 +1,180 @@
-Bit 31 in the page fault-error bit will be set when processor encounters
-an RMP violation.
+When SEV-SNP is enabled globally, a write from the host goes through the
+RMP check. When the host writes to pages, hardware checks the following
+conditions at the end of page walk:
-While at it, use the BIT() macro.
+1. Assigned bit in the RMP table is zero (i.e. the page is shared).
+2. If the page table entry that gives the sPA indicates that the target
+ page size is a large page, then all RMP entries for the 4KB
+ constituting pages of the target must have the assigned bit 0.
+3. Immutable bit in the RMP table is not zero.
+
+The hardware will raise a page fault if one of the above conditions is
+not met. Try resolving the fault instead of taking it again and again. If
+the host attempts to write to guest private memory, send the SIGBUS
+signal to kill the process. If the page level of the host mapping and the
+RMP entry do not match, split the host mapping to keep the RMP and host
+page levels in sync.
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
- arch/x86/include/asm/trap_pf.h | 18 +++++++++++-------
- arch/x86/mm/fault.c | 1 +
- 2 files changed, 12 insertions(+), 7 deletions(-)
+ arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/mm.h | 6 ++++-
+ mm/memory.c | 13 +++++++++
+ 3 files changed, 84 insertions(+), 1 deletion(-)
-diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
-index 10b1de500ab1..29f678701753 100644
---- a/arch/x86/include/asm/trap_pf.h
-+++ b/arch/x86/include/asm/trap_pf.h
-@@ -2,6 +2,8 @@
- #ifndef _ASM_X86_TRAP_PF_H
- #define _ASM_X86_TRAP_PF_H
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index 8b7a5757440e..f2d543b92f43 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -19,6 +19,7 @@
+ #include <linux/uaccess.h> /* faulthandler_disabled() */
+ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
+ #include <linux/mm_types.h>
++#include <linux/sev.h> /* snp_lookup_rmpentry() */
-+#include <vdso/bits.h> /* BIT() macro */
+ #include <asm/cpufeature.h> /* boot_cpu_has, ... */
+ #include <asm/traps.h> /* dotraplinkage, ... */
+@@ -1202,6 +1203,60 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ }
+ NOKPROBE_SYMBOL(do_kern_addr_fault);
+
++static inline size_t pages_per_hpage(int level)
++{
++ return page_level_size(level) >> PAGE_SHIFT;
++}
++
++/*
++ * Return 1 if the fault has been dealt with here and the caller should just
++ * return, 0 if the mapping needs to be split in order to resolve the fault.
++ */
++static int handle_user_rmp_page_fault(struct pt_regs *regs, unsigned long error_code,
++ unsigned long address)
++{
++ int rmp_level, level;
++ pte_t *pte;
++ u64 pfn;
++
++ pte = lookup_address_in_mm(current->mm, address, &level);
++
++ /*
++ * The mapping can be gone if there was a race between an unmap event
++ * and the RMP fault delivery; there is nothing left to resolve.
++ */
++ if (!pte || !pte_present(*pte))
++ return 1;
++
++ pfn = pte_pfn(*pte);
++
++ /* For a large page, calculate the pfn of the faulting 4K page */
++ if (level > PG_LEVEL_4K) {
++ unsigned long mask;
++
++ mask = pages_per_hpage(level) - 1;
++ pfn |= (address >> PAGE_SHIFT) & mask;
++ }
++
++ /*
++ * If it's a guest private page, then the fault cannot be resolved.
++ * Send a SIGBUS to terminate the process.
++ */
++ if (snp_lookup_rmpentry(pfn, &rmp_level)) {
++ do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
++ return 1;
++ }
++
++ /*
++ * The backing page level is higher than the RMP page level, request
++ * to split the page.
++ */
++ if (level > rmp_level)
++ return 0;
++
++ return 1;
++}
+
/*
- * Page fault error code bits:
+ * Handle faults in the user portion of the address space. Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+@@ -1299,6 +1354,17 @@ void do_user_addr_fault(struct pt_regs *regs,
+ if (error_code & X86_PF_INSTR)
+ flags |= FAULT_FLAG_INSTRUCTION;
+
++ /*
++ * If it's an RMP violation, try resolving it.
++ */
++ if (error_code & X86_PF_RMP) {
++ if (handle_user_rmp_page_fault(regs, error_code, address))
++ return;
++
++ /* Ask to split the page */
++ flags |= FAULT_FLAG_PAGE_SPLIT;
++ }
++
+ #ifdef CONFIG_X86_64
+ /*
+ * Faults in the vsyscall page might need emulation. The
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 7ca22e6e694a..74a53c146365 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -447,6 +447,8 @@ extern pgprot_t protection_map[16];
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
++ * @FAULT_FLAG_PAGE_SPLIT: The fault was due to a page size mismatch; split the
++ * region to a smaller page size and retry.
*
-@@ -12,15 +14,17 @@
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- * bit 15 == 1: SGX MMU page-fault
-+ * bit 31 == 1: fault was an RMP violation
- */
- enum x86_pf_error_code {
-- X86_PF_PROT = 1 << 0,
-- X86_PF_WRITE = 1 << 1,
-- X86_PF_USER = 1 << 2,
-- X86_PF_RSVD = 1 << 3,
-- X86_PF_INSTR = 1 << 4,
-- X86_PF_PK = 1 << 5,
-- X86_PF_SGX = 1 << 15,
-+ X86_PF_PROT = BIT(0),
-+ X86_PF_WRITE = BIT(1),
-+ X86_PF_USER = BIT(2),
-+ X86_PF_RSVD = BIT(3),
-+ X86_PF_INSTR = BIT(4),
-+ X86_PF_PK = BIT(5),
-+ X86_PF_SGX = BIT(15),
-+ X86_PF_RMP = BIT(31),
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+@@ -478,6 +480,7 @@ enum fault_flag {
+ FAULT_FLAG_REMOTE = 1 << 7,
+ FAULT_FLAG_INSTRUCTION = 1 << 8,
+ FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
++ FAULT_FLAG_PAGE_SPLIT = 1 << 10,
};
- #endif /* _ASM_X86_TRAP_PF_H */
-diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
-index 1c548ad00752..2715240c757e 100644
---- a/arch/x86/mm/fault.c
-+++ b/arch/x86/mm/fault.c
-@@ -545,6 +545,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
- !(error_code & X86_PF_PROT) ? "not-present page" :
- (error_code & X86_PF_RSVD) ? "reserved bit violation" :
- (error_code & X86_PF_PK) ? "protection keys violation" :
-+ (error_code & X86_PF_RMP) ? "rmp violation" :
- "permissions violation");
+ /*
+@@ -517,7 +520,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
+ { FAULT_FLAG_USER, "USER" }, \
+ { FAULT_FLAG_REMOTE, "REMOTE" }, \
+ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
+- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
++ { FAULT_FLAG_PAGE_SPLIT, "PAGESPLIT" }
- if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+ /*
+ * vm_fault is filled by the pagefault handler and passed to the vma's
+diff --git a/mm/memory.c b/mm/memory.c
+index 747a01d495f2..27e6ccec3fc1 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -4589,6 +4589,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
+ return 0;
+ }
+
++static vm_fault_t handle_split_page_fault(struct vm_fault *vmf)
++{
++ if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
++ return VM_FAULT_SIGBUS;
++
++ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
++ return 0;
++}
++
+ /*
+ * By the time we get here, we already hold the mm semaphore
+ *
+@@ -4666,6 +4675,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
++
++ if (flags & FAULT_FLAG_PAGE_SPLIT)
++ return handle_split_page_fault(&vmf);
++
+ if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
--
2.17.1