Inter-revision diff: patch 8

Comparing v3 (message) to v5 (message)

--- v3
+++ v5
@@ -1,64 +1,180 @@
-Bit 31 in the page fault-error bit will be set when processor encounters
-an RMP violation.
+When SEV-SNP is enabled globally, a write from the host goes through the
+RMP check. When the host writes to pages, the hardware checks the following
+conditions at the end of the page walk:
 
-While at it, use the BIT() macro.
+1. Assigned bit in the RMP table is zero (i.e. the page is shared).
+2. If the page table entry that gives the sPA indicates that the target
+   page size is a large page, then all RMP entries for the constituent
+   4KB pages of the target must have the assigned bit 0.
+3. Immutable bit in the RMP table is not zero.
+
+The hardware will raise a page fault if any of the above conditions is not
+met. Try resolving the fault instead of taking the fault again and again.
+If the host attempts to write to guest private memory, then send the
+SIGBUS signal to kill the process. If the page level of the host mapping
+and the RMP entry do not match, then split the host mapping to keep the
+RMP and host page levels in sync.
 
 Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
 ---
- arch/x86/include/asm/trap_pf.h | 18 +++++++++++-------
- arch/x86/mm/fault.c            |  1 +
- 2 files changed, 12 insertions(+), 7 deletions(-)
+ arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/mm.h  |  6 ++++-
+ mm/memory.c         | 13 +++++++++
+ 3 files changed, 84 insertions(+), 1 deletion(-)
 
-diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
-index 10b1de500ab1..29f678701753 100644
---- a/arch/x86/include/asm/trap_pf.h
-+++ b/arch/x86/include/asm/trap_pf.h
-@@ -2,6 +2,8 @@
- #ifndef _ASM_X86_TRAP_PF_H
- #define _ASM_X86_TRAP_PF_H
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index 8b7a5757440e..f2d543b92f43 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -19,6 +19,7 @@
+ #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
+ #include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
+ #include <linux/mm_types.h>
++#include <linux/sev.h>			/* snp_lookup_rmpentry()	*/
  
-+#include <vdso/bits.h>  /* BIT() macro */
+ #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
+ #include <asm/traps.h>			/* dotraplinkage, ...		*/
+@@ -1202,6 +1203,60 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ }
+ NOKPROBE_SYMBOL(do_kern_addr_fault);
+ 
++static inline size_t pages_per_hpage(int level)
++{
++	return page_level_size(level) / PAGE_SIZE;
++}
++
++/*
++ * Return 1 if the caller needs to retry, 0 if the address needs to be split
++ * in order to resolve the fault.
++ */
++static int handle_user_rmp_page_fault(struct pt_regs *regs, unsigned long error_code,
++				      unsigned long address)
++{
++	int rmp_level, level;
++	pte_t *pte;
++	u64 pfn;
++
++	pte = lookup_address_in_mm(current->mm, address, &level);
++
++	/*
++	 * It can happen if there was a race between an unmap event and
++	 * the RMP fault delivery.
++	 */
++	if (!pte || !pte_present(*pte))
++		return 1;
++
++	pfn = pte_pfn(*pte);
++
++	/* If it's a large page, then calculate the faulting pfn */
++	if (level > PG_LEVEL_4K) {
++		unsigned long mask;
++
++		mask = pages_per_hpage(level) - pages_per_hpage(level - 1);
++		pfn |= (address >> PAGE_SHIFT) & mask;
++	}
++
++	/*
++	 * If it's a guest private page, then the fault cannot be resolved.
++	 * Send a SIGBUS to terminate the process.
++	 */
++	if (snp_lookup_rmpentry(pfn, &rmp_level)) {
++		do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS);
++		return 1;
++	}
++
++	/*
++	 * The backing page level is higher than the RMP page level, request
++	 * to split the page.
++	 */
++	if (level > rmp_level)
++		return 0;
++
++	return 1;
++}
 +
  /*
-  * Page fault error code bits:
+  * Handle faults in the user portion of the address space.  Nothing in here
+  * should check X86_PF_USER without a specific justification: for almost
+@@ -1299,6 +1354,17 @@ void do_user_addr_fault(struct pt_regs *regs,
+ 	if (error_code & X86_PF_INSTR)
+ 		flags |= FAULT_FLAG_INSTRUCTION;
+ 
++	/*
++	 * If it's an RMP violation, try resolving it.
++	 */
++	if (error_code & X86_PF_RMP) {
++		if (handle_user_rmp_page_fault(regs, error_code, address))
++			return;
++
++		/* Ask to split the page */
++		flags |= FAULT_FLAG_PAGE_SPLIT;
++	}
++
+ #ifdef CONFIG_X86_64
+ 	/*
+ 	 * Faults in the vsyscall page might need emulation.  The
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 7ca22e6e694a..74a53c146365 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -447,6 +447,8 @@ extern pgprot_t protection_map[16];
+  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
++ * @FAULT_FLAG_PAGE_SPLIT: The fault was due to a page size mismatch; split
++ *  the region to a smaller page size and retry.
   *
-@@ -12,15 +14,17 @@
-  *   bit 4 ==				1: fault was an instruction fetch
-  *   bit 5 ==				1: protection keys block access
-  *   bit 15 ==				1: SGX MMU page-fault
-+ *   bit 31 ==				1: fault was an RMP violation
-  */
- enum x86_pf_error_code {
--	X86_PF_PROT	=		1 << 0,
--	X86_PF_WRITE	=		1 << 1,
--	X86_PF_USER	=		1 << 2,
--	X86_PF_RSVD	=		1 << 3,
--	X86_PF_INSTR	=		1 << 4,
--	X86_PF_PK	=		1 << 5,
--	X86_PF_SGX	=		1 << 15,
-+	X86_PF_PROT	=		BIT(0),
-+	X86_PF_WRITE	=		BIT(1),
-+	X86_PF_USER	=		BIT(2),
-+	X86_PF_RSVD	=		BIT(3),
-+	X86_PF_INSTR	=		BIT(4),
-+	X86_PF_PK	=		BIT(5),
-+	X86_PF_SGX	=		BIT(15),
-+	X86_PF_RMP	=		BIT(31),
+  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+  * whether we would allow page faults to retry by specifying these two
+@@ -478,6 +480,7 @@ enum fault_flag {
+ 	FAULT_FLAG_REMOTE =		1 << 7,
+ 	FAULT_FLAG_INSTRUCTION =	1 << 8,
+ 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
++	FAULT_FLAG_PAGE_SPLIT =		1 << 10,
  };
  
- #endif /* _ASM_X86_TRAP_PF_H */
-diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
-index 1c548ad00752..2715240c757e 100644
---- a/arch/x86/mm/fault.c
-+++ b/arch/x86/mm/fault.c
-@@ -545,6 +545,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
- 		 !(error_code & X86_PF_PROT) ? "not-present page" :
- 		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
- 		 (error_code & X86_PF_PK)    ? "protection keys violation" :
-+		 (error_code & X86_PF_RMP)   ? "rmp violation" :
- 					       "permissions violation");
+ /*
+@@ -517,7 +520,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
+ 	{ FAULT_FLAG_USER,		"USER" }, \
+ 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
+ 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
+-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
++	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
++	{ FAULT_FLAG_PAGE_SPLIT,	"PAGESPLIT" }
  
- 	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+ /*
+  * vm_fault is filled by the pagefault handler and passed to the vma's
+diff --git a/mm/memory.c b/mm/memory.c
+index 747a01d495f2..27e6ccec3fc1 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -4589,6 +4589,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
+ 	return 0;
+ }
+ 
++static int handle_split_page_fault(struct vm_fault *vmf)
++{
++	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
++		return VM_FAULT_SIGBUS;
++
++	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
++	return 0;
++}
++
+ /*
+  * By the time we get here, we already hold the mm semaphore
+  *
+@@ -4666,6 +4675,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+ 				pmd_migration_entry_wait(mm, vmf.pmd);
+ 			return 0;
+ 		}
++
++		if (flags & FAULT_FLAG_PAGE_SPLIT)
++			return handle_split_page_fault(&vmf);
++
+ 		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ 			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ 				return do_huge_pmd_numa_page(&vmf);
 -- 
 2.17.1
 
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help