Thread (20 messages) 20 messages, 5 authors, 2025-08-07

Re: [PATCH 3/4] arm64: mm: support large block mapping when rodata=full

From: Yang Shi <hidden>
Date: 2025-08-05 21:28:36
Also in: lkml


On 7/29/25 5:34 AM, Dev Jain wrote:
quoted hunk ↗ jump to hunk
On 25/07/25 3:41 am, Yang Shi wrote:
quoted
[----- snip -----]
    #ifdef CONFIG_ARM64_PAN
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3d5fb37424ab..f63b39613571 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -480,6 +480,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, 
phys_addr_t phys, unsigned long virt,
                   int flags);
  #endif
  +#define INVALID_PHYS_ADDR    -1
+
  static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
                         enum pgtable_type pgtable_type)
  {
@@ -487,7 +489,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct 
mm_struct *mm,
      struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & 
~__GFP_ZERO, 0);
      phys_addr_t pa;
  -    BUG_ON(!ptdesc);
+    if (!ptdesc)
+        return INVALID_PHYS_ADDR;
+
      pa = page_to_phys(ptdesc_page(ptdesc));
        switch (pgtable_type) {
@@ -509,15 +513,29 @@ static phys_addr_t __pgd_pgtable_alloc(struct 
mm_struct *mm,
  }
    static phys_addr_t __maybe_unused
-pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+split_pgtable_alloc(enum pgtable_type pgtable_type)
  {
      return __pgd_pgtable_alloc(&init_mm, pgtable_type);
  }
  +static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+{
+    phys_addr_t pa;
+
+    pa = __pgd_pgtable_alloc(&init_mm, pgtable_type);
+    BUG_ON(pa == INVALID_PHYS_ADDR);
+    return pa;
+}
+
  static phys_addr_t
  pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
  {
-    return __pgd_pgtable_alloc(NULL, pgtable_type);
+    phys_addr_t pa;
+
+    pa = __pgd_pgtable_alloc(NULL, pgtable_type);
+    BUG_ON(pa == INVALID_PHYS_ADDR);
+    return pa;
  }
    /*
@@ -552,6 +570,254 @@ void __init create_pgd_mapping(struct mm_struct 
*mm, phys_addr_t phys,
                   pgd_pgtable_alloc_special_mm, flags);
  }
  +static DEFINE_MUTEX(pgtable_split_lock);
Thanks for taking a separate lock.
quoted
+
+static int split_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned 
long end)
+{
+    pte_t *ptep;
+    unsigned long next;
+    unsigned int nr;
+    unsigned long span;
+
+    ptep = pte_offset_kernel(pmdp, addr);
+
+    do {
+        pte_t *_ptep;
+
+        nr = 0;
+        next = pte_cont_addr_end(addr, end);
+        if (next < end)
+            nr = max(nr, ((end - next) / CONT_PTE_SIZE));
+        span = nr * CONT_PTE_SIZE;
+
+        _ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+        ptep += pte_index(next) - pte_index(addr) + nr * CONT_PTES;
+
+        if (((addr | next) & ~CONT_PTE_MASK) == 0)
+            continue;
+
+        if (!pte_cont(__ptep_get(_ptep)))
+            continue;
+
+        for (int i = 0; i < CONT_PTES; i++, _ptep++)
+            __set_pte(_ptep, pte_mknoncont(__ptep_get(_ptep)));
+    } while (addr = next + span, addr != end);
+
+    return 0;
+}
+
+static int split_pmd(pmd_t *pmdp, unsigned long addr, unsigned long 
end)
+{
+    unsigned long next;
+    unsigned int nr;
+    unsigned long span;
+    int ret = 0;
+
+    do {
+        pmd_t pmd;
+
+        nr = 1;
+        next = pmd_addr_end(addr, end);
+        if (next < end)
+            nr = max(nr, ((end - next) / PMD_SIZE));
+        span = (nr - 1) * PMD_SIZE;
+
+        if (((addr | next) & ~PMD_MASK) == 0)
+            continue;
+
+        pmd = pmdp_get(pmdp);
+        if (pmd_leaf(pmd)) {
+            phys_addr_t pte_phys;
+            pte_t *ptep;
+            pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN |
+                      PMD_TABLE_AF;
+            unsigned long pfn = pmd_pfn(pmd);
+            pgprot_t prot = pmd_pgprot(pmd);
+
+            pte_phys = split_pgtable_alloc(TABLE_PTE);
+            if (pte_phys == INVALID_PHYS_ADDR)
+                return -ENOMEM;
+
+            if (pgprot_val(prot) & PMD_SECT_PXN)
+                pmdval |= PMD_TABLE_PXN;
+
+            prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) |
+                    PTE_TYPE_PAGE);
+            prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+            ptep = (pte_t *)phys_to_virt(pte_phys);
+            for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+                __set_pte(ptep, pfn_pte(pfn, prot));
+
+            dsb(ishst);
+
+            __pmd_populate(pmdp, pte_phys, pmdval);
+        }
+
+        ret = split_cont_pte(pmdp, addr, next);
+        if (ret)
+            break;
+    } while (pmdp += nr, addr = next + span, addr != end);
+
+    return ret;
+}
+
+static int split_cont_pmd(pud_t *pudp, unsigned long addr, unsigned 
long end)
+{
+    pmd_t *pmdp;
+    unsigned long next;
+    unsigned int nr;
+    unsigned long span;
+    int ret = 0;
+
+    pmdp = pmd_offset(pudp, addr);
+
+    do {
+        pmd_t *_pmdp;
+
+        nr = 0;
+        next = pmd_cont_addr_end(addr, end);
+        if (next < end)
+            nr = max(nr, ((end - next) / CONT_PMD_SIZE));
+        span = nr * CONT_PMD_SIZE;
+
+        if (((addr | next) & ~CONT_PMD_MASK) == 0) {
+            pmdp += pmd_index(next) - pmd_index(addr) +
+                nr * CONT_PMDS;
+            continue;
+        }
+
+        _pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+        if (!pmd_cont(pmdp_get(_pmdp)))
+            goto split;
+
+        for (int i = 0; i < CONT_PMDS; i++, _pmdp++)
+            set_pmd(_pmdp, pmd_mknoncont(pmdp_get(_pmdp)));
+
+split:
+        ret = split_pmd(pmdp, addr, next);
+        if (ret)
+            break;
+
+        pmdp += pmd_index(next) - pmd_index(addr) + nr * CONT_PMDS;
+    } while (addr = next + span, addr != end);
+
+    return ret;
+}
+
+static int split_pud(p4d_t *p4dp, unsigned long addr, unsigned long 
end)
+{
+    pud_t *pudp;
+    unsigned long next;
+    unsigned int nr;
+    unsigned long span;
+    int ret = 0;
+
+    pudp = pud_offset(p4dp, addr);
+
+    do {
+        pud_t pud;
+
+        nr = 1;
+        next = pud_addr_end(addr, end);
+        if (next < end)
+            nr = max(nr, ((end - next) / PUD_SIZE));
+        span = (nr - 1) * PUD_SIZE;
+
+        if (((addr | next) & ~PUD_MASK) == 0)
+            continue;
+
+        pud = pudp_get(pudp);
+        if (pud_leaf(pud)) {
+            phys_addr_t pmd_phys;
+            pmd_t *pmdp;
+            pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN |
+                      PUD_TABLE_AF;
+            unsigned long pfn = pud_pfn(pud);
+            pgprot_t prot = pud_pgprot(pud);
+            unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+
+            pmd_phys = split_pgtable_alloc(TABLE_PMD);
+            if (pmd_phys == INVALID_PHYS_ADDR)
+                return -ENOMEM;
+
+            if (pgprot_val(prot) & PMD_SECT_PXN)
+                pudval |= PUD_TABLE_PXN;
+
+            prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
+                    PMD_TYPE_SECT);
+            prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+            pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+            for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+                set_pmd(pmdp, pfn_pmd(pfn, prot));
+                pfn += step;
+            }
+
+            dsb(ishst);
+
+            __pud_populate(pudp, pmd_phys, pudval);
+        }
+
+        ret = split_cont_pmd(pudp, addr, next);
+        if (ret)
+            break;
+    } while (pudp += nr, addr = next + span, addr != end);
+
+    return ret;
+}
+
+static int split_p4d(pgd_t *pgdp, unsigned long addr, unsigned long 
end)
+{
+    p4d_t *p4dp;
+    unsigned long next;
+    int ret = 0;
+
+    p4dp = p4d_offset(pgdp, addr);
+
+    do {
+        next = p4d_addr_end(addr, end);
+
+        ret = split_pud(p4dp, addr, next);
+        if (ret)
+            break;
+    } while (p4dp++, addr = next, addr != end);
+
+    return ret;
+}
+
+static int split_pgd(pgd_t *pgdp, unsigned long addr, unsigned long 
end)
+{
+    unsigned long next;
+    int ret = 0;
+
+    do {
+        next = pgd_addr_end(addr, end);
+        ret = split_p4d(pgdp, addr, next);
+        if (ret)
+            break;
+    } while (pgdp++, addr = next, addr != end);
+
+    return ret;
+}
+
+int split_kernel_pgtable_mapping(unsigned long start, unsigned long 
end)
+{
+    int ret;
+
+    if (!system_supports_bbml2_noabort())
+        return 0;
+
+    if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+        return -EINVAL;
+
+    mutex_lock(&pgtable_split_lock);
+    arch_enter_lazy_mmu_mode();
+    ret = split_pgd(pgd_offset_k(start), start, end);
+    arch_leave_lazy_mmu_mode();
+    mutex_unlock(&pgtable_split_lock);
+
+    return ret;
+}
+
      /*
--- snip ---
I'm afraid I'll have to agree with Ryan :) Looking at the signature of 
split_kernel_pgtable_mapping,
one would expect that this function splits all block mappings in this 
region. But that is just a
nit; it does not seem right to me that we are iterating over the 
entire space when we know *exactly* where
we have to make the split, just to save on pgd/p4d/pud loads - the 
effect of which is probably cancelled
out by unnecessary iterations your approach takes to skip the 
intermediate blocks.
The implementation is aimed to reuse the split code for repainting. We 
have to split all leaf mappings down to PTEs for repainting.

Now Ryan suggested use pgtable walk API for repainting, it made the 
duplicate code problem gone. We had some discussion in the other series.
If we are concerned that most change_memory_common() operations are 
for a single page, then
we can do something like:

unsigned long size = end - start;
bool end_split, start_split = false;

if (start not aligned to block mapping)
    start_split = split(start);

/*
 * split the end only if the start wasn't split, or
 * if it cannot be guaranteed that start and end lie
 * on the same contig block
 */
if (!start_split || (size > CONT_PTE_SIZE))
    end_split = split(end);
Thanks for the suggestion. It works for some cases, but I don't think it 
can work if the range is cross page table IIUC. For example, start is in 
a PMD, but end is in another PMD.

Regards,
Yang
  
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help