--- v1
+++ v3
@@ -1,73 +1,137 @@
-Currently, page table information is stored within struct page. As part
-of simplifying struct page, create struct ptdesc for page table
-information.
+s390 currently uses _refcount to identify fragmented page tables.
+The page table struct already has a member pt_frag_refcount used by
+powerpc, so have s390 use that instead of the _refcount field as well.
+This improves the safety for _refcount and the page table tracking.
+
+This also allows us to simplify the tracking since we can once again use
+the lower byte of pt_frag_refcount instead of the upper byte of _refcount.
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
- include/linux/pgtable.h | 50 +++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 50 insertions(+)
+ arch/s390/mm/pgalloc.c | 38 +++++++++++++++-----------------------
+ 1 file changed, 15 insertions(+), 23 deletions(-)
-diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
-index 023918666dd4..7cc6ea057ee9 100644
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -47,6 +47,56 @@
- #define pmd_pgtable(pmd) pmd_page(pmd)
- #endif
+diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
+index 66ab68db9842..6b99932abc66 100644
+--- a/arch/s390/mm/pgalloc.c
++++ b/arch/s390/mm/pgalloc.c
+@@ -182,20 +182,17 @@ void page_table_free_pgste(struct page *page)
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+- * The upper byte (bits 24-31) of the parent page _refcount is used
++ * The lower byte (bits 0-7) of the parent page pt_frag_refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ * PP AA
+- * 01234567 upper byte (bits 24-31) of struct page::_refcount
+ * 01234567 lower byte (bits 0-7) of struct page::pt_frag_refcount
+ * || ||
+ * || |+--- upper 2KB-pgtable is allocated
+ * || +---- lower 2KB-pgtable is allocated
+ * |+------- upper 2KB-pgtable is pending for removal
+ * +-------- lower 2KB-pgtable is pending for removal
+ *
+- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+- * using _refcount is possible).
+- *
+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+@@ -243,11 +240,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ if (!list_empty(&mm->context.pgtable_list)) {
+ page = list_first_entry(&mm->context.pgtable_list,
+ struct page, lru);
+- mask = atomic_read(&page->_refcount) >> 24;
++ mask = atomic_read(&page->pt_frag_refcount);
+ /*
+ * The pending removal bits must also be checked.
+ * Failure to do so might lead to an impossible
+- * value of (i.e 0x13 or 0x23) written to _refcount.
+ * value of (i.e. 0x13 or 0x23) written to
++ * pt_frag_refcount.
+ * Such values violate the assumption that pending and
+ * allocation bits are mutually exclusive, and the rest
+ * of the code unrails as result. That could lead to
+@@ -259,8 +257,8 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ bit = mask & 1; /* =1 -> second 2K */
+ if (bit)
+ table += PTRS_PER_PTE;
+- atomic_xor_bits(&page->_refcount,
+- 0x01U << (bit + 24));
++ atomic_xor_bits(&page->pt_frag_refcount,
++ 0x01U << bit);
+ list_del(&page->lru);
+ }
+ }
+@@ -281,12 +279,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ table = (unsigned long *) page_to_virt(page);
+ if (mm_alloc_pgste(mm)) {
+ /* Return 4K page table with PGSTEs */
+- atomic_xor_bits(&page->_refcount, 0x03U << 24);
++ atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
+ } else {
+ /* Return the first 2K fragment of the page */
+- atomic_xor_bits(&page->_refcount, 0x01U << 24);
++ atomic_xor_bits(&page->pt_frag_refcount, 0x01U);
+ memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
+ spin_lock_bh(&mm->context.lock);
+ list_add(&page->lru, &mm->context.pgtable_list);
+@@ -323,22 +321,19 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
+ * will happen outside of the critical section from this
+ * function or from __tlb_remove_table()
+ */
+- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ if (mask & 0x03U)
+ list_add(&page->lru, &mm->context.pgtable_list);
+ else
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.lock);
+- mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x10U << bit);
+ if (mask != 0x00U)
+ return;
+ half = 0x01U << bit;
+ } else {
+ half = 0x03U;
+- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ }
-+/**
-+ * struct ptdesc - Memory descriptor for page tables.
-+ * @__page_flags: Same as page flags. Unused for page tables.
-+ * @pt_list: List of used page tables. Used for s390 and x86.
-+ * @_pt_pad_1: Padding that aliases with page's compound head.
-+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
-+ * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only.
-+ * @pt_mm: Used for x86 pgds.
-+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
-+ * @ptl: Lock for the page table.
-+ *
-+ * This struct overlays struct page for now. Do not modify without a good
-+ * understanding of the issues.
-+ */
-+struct ptdesc {
-+ unsigned long __page_flags;
-+
-+ union {
-+ struct list_head pt_list;
-+ struct {
-+ unsigned long _pt_pad_1;
-+ pgtable_t pmd_huge_pte;
-+ };
-+ };
-+ unsigned long _pt_s390_gaddr;
-+
-+ union {
-+ struct mm_struct *pt_mm;
-+ atomic_t pt_frag_refcount;
-+ };
-+
-+#if ALLOC_SPLIT_PTLOCKS
-+ spinlock_t *ptl;
-+#else
-+ spinlock_t ptl;
-+#endif
-+};
-+
-+#define TABLE_MATCH(pg, pt) \
-+ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
-+TABLE_MATCH(flags, __page_flags);
-+TABLE_MATCH(compound_head, pt_list);
-+TABLE_MATCH(compound_head, _pt_pad_1);
-+TABLE_MATCH(mapping, _pt_s390_gaddr);
-+TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
-+TABLE_MATCH(pt_mm, pt_mm);
-+TABLE_MATCH(ptl, ptl);
-+#undef TABLE_MATCH
-+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
-+
- /*
- * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
- *
+ page_table_release_check(page, table, half, mask);
+@@ -368,8 +363,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+ * outside of the critical section from __tlb_remove_table() or from
+ * page_table_free()
+ */
+- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ if (mask & 0x03U)
+ list_add_tail(&page->lru, &mm->context.pgtable_list);
+ else
+@@ -391,14 +385,12 @@ void __tlb_remove_table(void *_table)
+ return;
+ case 0x01U: /* lower 2K of a 4K page table */
+ case 0x02U: /* higher 2K of a 4K page table */
+- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, mask << 4);
+ if (mask != 0x00U)
+ return;
+ break;
+ case 0x03U: /* 4K page table with pgstes */
+- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+- mask >>= 24;
++ mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ break;
+ }
+
--
-2.39.2
+2.40.1