Re: [PATCH v2 07/13] powerpc: add support for folded p4d page tables
From: Mike Rapoport <rppt@kernel.org>
Date: 2020-02-18 10:54:55
Also in:
kvmarm, linux-arch, linux-arm-kernel, linux-mm, linux-sh, lkml
On Sun, Feb 16, 2020 at 11:41:07AM +0100, Christophe Leroy wrote:
Le 16/02/2020 à 09:18, Mike Rapoport a écrit :quoted
From: Mike Rapoport <redacted> Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h. I don't think it is worth adding all these additional walks of p4d; this patch could be limited to changes like: - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d_offset(pgd, gpa), gpa); The additional walks should be added through another patch the day powerpc needs them.
Ok, I'll update the patch to reduce walking the p4d.
See below for more comments.quoted
Signed-off-by: Mike Rapoport <redacted> Tested-by: Christophe Leroy <redacted> # 8xx and 83xx ---
...
quoted
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 201a69e6a355..ddddbafff0ab 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h@@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ -#include <asm-generic/5level-fixup.h> +#include <asm-generic/pgtable-nop4d.h> #ifndef __ASSEMBLY__ #include <linux/mmdebug.h>@@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift; /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0xc0000000000000ffUL /* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL +#define P4D_MASKED_BITS 0xc0000000000000ffUL /* * Used as an indicator for rcu callback functions@@ -949,54 +949,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return pte_access_permitted(pud_pte(pud), write); } -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) }) +static inline __be64 p4d_raw(p4d_t x) +{ + return pgd_raw(x.pgd); +} +Shouldn't this be defined in asm/pgtable-be-types.h, just like other __pxx_raw() ?
Ideally yes, but this creates weird header file dependencies and untangling them would generate way too much churn.
quoted
+#define p4d_write(p4d) pte_write(p4d_pte(p4d)) -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); }
...
quoted
@@ -573,9 +596,15 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Traverse the guest's 2nd-level tree, allocate new levels needed */ pgd = pgtable + pgd_index(gpa); - pud = NULL; + p4d = NULL; if (pgd_present(*pgd)) - pud = pud_offset(pgd, gpa); + p4d = p4d_offset(pgd, gpa); + else + new_p4d = p4d_alloc_one(kvm->mm, gpa); + + pud = NULL; + if (p4d_present(*p4d)) + pud = pud_offset(p4d, gpa); Is it worth adding all this new code? My understanding is that the series objective is to get rid of __ARCH_USE_5LEVEL_HACK, not to add support for 5 levels to an architecture that does not need it (at least for now). If we want to add support for 5 levels, it can be done later in another patch. Here I think your change could be limited to: - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d_offset(pgd, gpa), gpa);
This won't work. Without __ARCH_USE_5LEVEL_HACK defined, pgd_present() is hardwired to 1 and the actual check for the top level is performed with p4d_present(). The 'else' clause that allocates p4d will never be taken and it could be removed, but I prefer to keep it for consistency.
quoted
else new_pud = pud_alloc_one(kvm->mm, gpa);@@ -597,12 +626,18 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Now traverse again under the lock and change the tree */ ret = -ENOMEM; if (pgd_none(*pgd)) { + if (!new_p4d) + goto out_unlock; + pgd_populate(kvm->mm, pgd, new_p4d); + new_p4d = NULL; + } + if (p4d_none(*p4d)) { if (!new_pud) goto out_unlock; - pgd_populate(kvm->mm, pgd, new_pud); + p4d_populate(kvm->mm, p4d, new_pud); new_pud = NULL; } - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d, gpa); if (pud_is_leaf(*pud)) { unsigned long hgpa = gpa & PUD_MASK;@@ -1220,6 +1255,7 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, pgd_t *pgt; struct kvm_nested_guest *nested; pgd_t pgd, *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ptep;@@ -1298,7 +1334,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, continue; } - pudp = pud_offset(&pgd, gpa); + p4dp = p4d_offset(&pgd, gpa); + p4d = READ_ONCE(*p4dp); + if (!(p4d_val(p4d) & _PAGE_PRESENT)) { + gpa = (gpa & P4D_MASK) + P4D_SIZE; + continue; + } + + pudp = pud_offset(&p4d, gpa);Same, here you are forcing a useless read with READ_ONCE(). Your change could be limited to - pudp = pud_offset(&pgd, gpa); + pudp = pud_offset(p4d_offset(&pgd, gpa), gpa);
Here again the actual check must be done against p4d rather than pgd. We could skip READ_ONCE() for pgd, but since it is a debugfs method I don't think it is more important than code consistency.
This comment applies to many other places.
I'll make another pass to see where we can take the shortcut and use pudp = pud_offset(p4d_offset(...))
quoted
pud = READ_ONCE(*pudp); if (!(pud_val(pud) & _PAGE_PRESENT)) { gpa = (gpa & PUD_MASK) + PUD_SIZE;diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 3345f039a876..7a59f6863cec 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c@@ -107,13 +107,18 @@ static inline int unmap_patch_area(unsigned long addr) pte_t *ptep; pmd_t *pmdp; pud_t *pudp; + p4d_t *p4dp; pgd_t *pgdp; pgdp = pgd_offset_k(addr); if (unlikely(!pgdp)) return -EINVAL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (unlikely(!p4dp)) + return -EINVAL; + + pudp = pud_offset(p4dp, addr); if (unlikely(!pudp)) return -EINVAL;diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 0a1c65a2c565..b2fc3e71165c 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c@@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea) if (!Hash) return; - pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, ea), ea), ea), ea); If we continue like this, in ten years this line is going to be many kilometers long. I think the above would be worth a generic helper.
Agree. My plan was to first unify all the architectures and then start introducing the generic helpers, like e.g. pmd_offset_mm().
quoted
if (!pmd_none(*pmd)) add_hash_page(mm->context.id, ea, pmd_val(*pmd)); }diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 2fcd321040ff..175bc33b41b7 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c@@ -87,7 +87,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start, if (start >= end) return; end = (end - 1) | ~PAGE_MASK; - pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, start), start), start), start); for (;;) { pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; if (pmd_end > end)@@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) return; } mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; - pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr), vmaddr); if (!pmd_none(*pmd)) flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); }diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..9cd15937e88a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c@@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep;@@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea);Could be a single line, without a new var. - pudp = pud_alloc(&init_mm, pgdp, ea); + pudp = pud_alloc(&init_mm, p4d_offset(pgdp, ea), ea); Same kind of comments as already done apply to the rest. Christophe
-- Sincerely yours, Mike.