--- v2
+++ v1
@@ -1,265 +1,137 @@
-s390 uses page->index to keep track of page tables for the guest address
-space. In an attempt to consolidate the usage of page fields in s390,
-replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap.
+s390 currently uses _refcount to identify fragmented page tables.
+The page table struct already has a member pt_frag_refcount used by
+powerpc, so have s390 use that instead of the _refcount field as well.
+This improves the safety for _refcount and the page table tracking.
-This will help with the splitting of struct ptdesc from struct page, as
-well as allow s390 to use _pt_frag_refcount for fragmented page table
-tracking.
-
-Since page->_pt_s390_gaddr aliases with mapping, ensure its set to NULL
-before freeing the pages as well.
-
-This also reverts commit 7e25de77bc5ea ("s390/mm: use pmd_pgtable_page()
-helper in __gmap_segment_gaddr()") which had s390 use
-pmd_pgtable_page() to get a gmap page table, as pmd_pgtable_page()
-should be used for more generic process page tables.
+This also allows us to simplify the tracking since we can once again use
+the lower byte of pt_frag_refcount instead of the upper byte of _refcount.
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
- arch/s390/mm/gmap.c | 56 +++++++++++++++++++++++++++-------------
- include/linux/mm_types.h | 2 +-
- 2 files changed, 39 insertions(+), 19 deletions(-)
+ arch/s390/mm/pgalloc.c | 38 +++++++++++++++-----------------------
+ 1 file changed, 15 insertions(+), 23 deletions(-)
-diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
-index dfe905c7bd8e..a9e8b1805894 100644
---- a/arch/s390/mm/gmap.c
-+++ b/arch/s390/mm/gmap.c
-@@ -70,7 +70,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
- if (!page)
- goto out_free;
-- page->index = 0;
-+ page->_pt_s390_gaddr = 0;
- list_add(&page->lru, &gmap->crst_list);
- table = page_to_virt(page);
- crst_table_init(table, etype);
-@@ -187,16 +187,20 @@ static void gmap_free(struct gmap *gmap)
- if (!(gmap_is_shadow(gmap) && gmap->removed))
- gmap_flush_tlb(gmap);
- /* Free all segment & region tables. */
-- list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
-+ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
-+ }
- gmap_radix_tree_free(&gmap->guest_to_host);
- gmap_radix_tree_free(&gmap->host_to_guest);
+diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
+index 66ab68db9842..6b99932abc66 100644
+--- a/arch/s390/mm/pgalloc.c
++++ b/arch/s390/mm/pgalloc.c
+@@ -182,20 +182,17 @@ void page_table_free_pgste(struct page *page)
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+- * The upper byte (bits 24-31) of the parent page _refcount is used
++ * The lower byte (bits 0-7) of the parent page pt_frag_refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ * PP AA
+- * 01234567 upper byte (bits 24-31) of struct page::_refcount
++ * 01234567 upper byte (bits 0-7) of struct page::pt_frag_refcount
+ * || ||
+ * || |+--- upper 2KB-pgtable is allocated
+ * || +---- lower 2KB-pgtable is allocated
+ * |+------- upper 2KB-pgtable is pending for removal
+ * +-------- lower 2KB-pgtable is pending for removal
+ *
+- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+- * using _refcount is possible).
+- *
+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+@@ -243,11 +240,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ if (!list_empty(&mm->context.pgtable_list)) {
+ page = list_first_entry(&mm->context.pgtable_list,
+ struct page, lru);
+- mask = atomic_read(&page->_refcount) >> 24;
++ mask = atomic_read(&page->pt_frag_refcount);
+ /*
+ * The pending removal bits must also be checked.
+ * Failure to do so might lead to an impossible
+- * value of (i.e 0x13 or 0x23) written to _refcount.
++ * value of (i.e 0x13 or 0x23) written to
++ * pt_frag_refcount.
+ * Such values violate the assumption that pending and
+ * allocation bits are mutually exclusive, and the rest
+ * of the code unrails as result. That could lead to
+@@ -259,8 +257,8 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ bit = mask & 1; /* =1 -> second 2K */
+ if (bit)
+ table += PTRS_PER_PTE;
+- atomic_xor_bits(&page->_refcount,
+- 0x01U << (bit + 24));
++ atomic_xor_bits(&page->pt_frag_refcount,
++ 0x01U << bit);
+ list_del(&page->lru);
+ }
+ }
+@@ -281,12 +279,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ table = (unsigned long *) page_to_virt(page);
+ if (mm_alloc_pgste(mm)) {
+ /* Return 4K page table with PGSTEs */
+- atomic_xor_bits(&page->_refcount, 0x03U << 24);
++ atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
+ } else {
+ /* Return the first 2K fragment of the page */
+- atomic_xor_bits(&page->_refcount, 0x01U << 24);
++ atomic_xor_bits(&page->pt_frag_refcount, 0x01U);
+ memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
+ spin_lock_bh(&mm->context.lock);
+ list_add(&page->lru, &mm->context.pgtable_list);
+@@ -323,22 +321,19 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
+ * will happen outside of the critical section from this
+ * function or from __tlb_remove_table()
+ */
+- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ if (mask & 0x03U)
+ list_add(&page->lru, &mm->context.pgtable_list);
+ else
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.lock);
+- mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x10U << bit);
+ if (mask != 0x00U)
+ return;
+ half = 0x01U << bit;
+ } else {
+ half = 0x03U;
+- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ }
- /* Free additional data for a shadow gmap */
- if (gmap_is_shadow(gmap)) {
- /* Free all page tables. */
-- list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
-+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru) {
-+ page->_pt_s390_gaddr = 0;
- page_table_free_pgste(page);
-+ }
- gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
- /* Release reference to the parent */
- gmap_put(gmap->parent);
-@@ -318,12 +322,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
- list_add(&page->lru, &gmap->crst_list);
- *table = __pa(new) | _REGION_ENTRY_LENGTH |
- (*table & _REGION_ENTRY_TYPE_MASK);
-- page->index = gaddr;
-+ page->_pt_s390_gaddr = gaddr;
- page = NULL;
+ page_table_release_check(page, table, half, mask);
+@@ -368,8 +363,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+ * outside of the critical section from __tlb_remove_table() or from
+ * page_table_free()
+ */
+- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ if (mask & 0x03U)
+ list_add_tail(&page->lru, &mm->context.pgtable_list);
+ else
+@@ -391,14 +385,12 @@ void __tlb_remove_table(void *_table)
+ return;
+ case 0x01U: /* lower 2K of a 4K page table */
+ case 0x02U: /* higher 2K of a 4K page table */
+- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, mask << 4);
+ if (mask != 0x00U)
+ return;
+ break;
+ case 0x03U: /* 4K page table with pgstes */
+- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ break;
}
- spin_unlock(&gmap->guest_table_lock);
-- if (page)
-+ if (page) {
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
-+ }
- return 0;
- }
-@@ -336,12 +342,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
- static unsigned long __gmap_segment_gaddr(unsigned long *entry)
- {
- struct page *page;
-- unsigned long offset;
-+ unsigned long offset, mask;
-
- offset = (unsigned long) entry / sizeof(unsigned long);
- offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
-- page = pmd_pgtable_page((pmd_t *) entry);
-- return page->index + offset;
-+ mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
-+ page = virt_to_page((void *)((unsigned long) entry & mask));
-+
-+ return page->_pt_s390_gaddr + offset;
- }
-
- /**
-@@ -1351,6 +1359,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
- /* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- page_table_free_pgste(page);
- }
-
-@@ -1379,6 +1388,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
- /* Free page table */
- page = phys_to_page(pgt);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- page_table_free_pgste(page);
- }
- }
-@@ -1409,6 +1419,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
- /* Free segment table */
- page = phys_to_page(sgt);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
-
-@@ -1437,6 +1448,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
- /* Free segment table */
- page = phys_to_page(sgt);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
- }
-@@ -1467,6 +1479,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
- /* Free region 3 table */
- page = phys_to_page(r3t);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
-
-@@ -1495,6 +1508,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
- /* Free region 3 table */
- page = phys_to_page(r3t);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
- }
-@@ -1525,6 +1539,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
- /* Free region 2 table */
- page = phys_to_page(r2t);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
-
-@@ -1557,6 +1572,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
- /* Free region 2 table */
- page = phys_to_page(r2t);
- list_del(&page->lru);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- }
- }
-@@ -1762,9 +1778,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
- if (!page)
- return -ENOMEM;
-- page->index = r2t & _REGION_ENTRY_ORIGIN;
-+ page->_pt_s390_gaddr = r2t & _REGION_ENTRY_ORIGIN;
- if (fake)
-- page->index |= GMAP_SHADOW_FAKE_TABLE;
-+ page->_pt_s390_gaddr |= GMAP_SHADOW_FAKE_TABLE;
- s_r2t = page_to_phys(page);
- /* Install shadow region second table */
- spin_lock(&sg->guest_table_lock);
-@@ -1814,6 +1830,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
- return rc;
- out_free:
- spin_unlock(&sg->guest_table_lock);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- return rc;
- }
-@@ -1846,9 +1863,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
- if (!page)
- return -ENOMEM;
-- page->index = r3t & _REGION_ENTRY_ORIGIN;
-+ page->_pt_s390_gaddr = r3t & _REGION_ENTRY_ORIGIN;
- if (fake)
-- page->index |= GMAP_SHADOW_FAKE_TABLE;
-+ page->_pt_s390_gaddr |= GMAP_SHADOW_FAKE_TABLE;
- s_r3t = page_to_phys(page);
- /* Install shadow region second table */
- spin_lock(&sg->guest_table_lock);
-@@ -1898,6 +1915,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
- return rc;
- out_free:
- spin_unlock(&sg->guest_table_lock);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- return rc;
- }
-@@ -1930,9 +1948,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
- page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
- if (!page)
- return -ENOMEM;
-- page->index = sgt & _REGION_ENTRY_ORIGIN;
-+ page->_pt_s390_gaddr = sgt & _REGION_ENTRY_ORIGIN;
- if (fake)
-- page->index |= GMAP_SHADOW_FAKE_TABLE;
-+ page->_pt_s390_gaddr |= GMAP_SHADOW_FAKE_TABLE;
- s_sgt = page_to_phys(page);
- /* Install shadow region second table */
- spin_lock(&sg->guest_table_lock);
-@@ -1982,6 +2000,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
- return rc;
- out_free:
- spin_unlock(&sg->guest_table_lock);
-+ page->_pt_s390_gaddr = 0;
- __free_pages(page, CRST_ALLOC_ORDER);
- return rc;
- }
-@@ -2014,9 +2033,9 @@ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
-- *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
-+ *pgt = page->_pt_s390_gaddr & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
-- *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
-+ *fake = !!(page->_pt_s390_gaddr & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
-@@ -2054,9 +2073,9 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
- page = page_table_alloc_pgste(sg->mm);
- if (!page)
- return -ENOMEM;
-- page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
-+ page->_pt_s390_gaddr = pgt & _SEGMENT_ENTRY_ORIGIN;
- if (fake)
-- page->index |= GMAP_SHADOW_FAKE_TABLE;
-+ page->_pt_s390_gaddr |= GMAP_SHADOW_FAKE_TABLE;
- s_pgt = page_to_phys(page);
- /* Install shadow page table */
- spin_lock(&sg->guest_table_lock);
-@@ -2101,6 +2120,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
- return rc;
- out_free:
- spin_unlock(&sg->guest_table_lock);
-+ page->_pt_s390_gaddr = 0;
- page_table_free_pgste(page);
- return rc;
-
-diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
-index 306a3d1a0fa6..6161fe1ae5b8 100644
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -144,7 +144,7 @@ struct page {
- struct { /* Page table pages */
- unsigned long _pt_pad_1; /* compound_head */
- pgtable_t pmd_huge_pte; /* protected by page->ptl */
-- unsigned long _pt_pad_2; /* mapping */
-+ unsigned long _pt_s390_gaddr; /* mapping */
- union {
- struct mm_struct *pt_mm; /* x86 pgds only */
- atomic_t pt_frag_refcount; /* powerpc */
--
2.39.2