Inter-revision diff: patch 3

Comparing v1 (message) to v3 (message)

--- v1
+++ v3
@@ -1,73 +1,137 @@
-Currently, page table information is stored within struct page. As part
-of simplifying struct page, create struct ptdesc for page table
-information.
+s390 currently uses _refcount to identify fragmented page tables.
+The page table struct already has a member pt_frag_refcount used by
+powerpc, so have s390 use that instead of the _refcount field as well.
+This improves the safety for _refcount and the page table tracking.
+
+This also allows us to simplify the tracking since we can once again use
+the lower byte of pt_frag_refcount instead of the upper byte of _refcount.
 
 Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
 ---
- include/linux/pgtable.h | 50 +++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 50 insertions(+)
+ arch/s390/mm/pgalloc.c | 38 +++++++++++++++-----------------------
+ 1 file changed, 15 insertions(+), 23 deletions(-)
 
-diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
-index 023918666dd4..7cc6ea057ee9 100644
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -47,6 +47,56 @@
- #define pmd_pgtable(pmd) pmd_page(pmd)
- #endif
+diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
+index 66ab68db9842..6b99932abc66 100644
+--- a/arch/s390/mm/pgalloc.c
++++ b/arch/s390/mm/pgalloc.c
+@@ -182,20 +182,17 @@ void page_table_free_pgste(struct page *page)
+  * As follows from the above, no unallocated or fully allocated parent
+  * pages are contained in mm_context_t::pgtable_list.
+  *
+- * The upper byte (bits 24-31) of the parent page _refcount is used
++ * The lower byte (bits 0-7) of the parent page pt_frag_refcount is used
+  * for tracking contained 2KB-pgtables and has the following format:
+  *
+  *   PP  AA
+- * 01234567    upper byte (bits 24-31) of struct page::_refcount
++ * 01234567    lower byte (bits 0-7) of struct page::pt_frag_refcount
+  *   ||  ||
+  *   ||  |+--- upper 2KB-pgtable is allocated
+  *   ||  +---- lower 2KB-pgtable is allocated
+  *   |+------- upper 2KB-pgtable is pending for removal
+  *   +-------- lower 2KB-pgtable is pending for removal
+  *
+- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+- * using _refcount is possible).
+- *
+  * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+  * The parent page is either:
+  *   - added to mm_context_t::pgtable_list in case the second half of the
+@@ -243,11 +240,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ 		if (!list_empty(&mm->context.pgtable_list)) {
+ 			page = list_first_entry(&mm->context.pgtable_list,
+ 						struct page, lru);
+-			mask = atomic_read(&page->_refcount) >> 24;
++			mask = atomic_read(&page->pt_frag_refcount);
+ 			/*
+ 			 * The pending removal bits must also be checked.
+ 			 * Failure to do so might lead to an impossible
+-			 * value of (i.e 0x13 or 0x23) written to _refcount.
++			 * value (e.g. 0x13 or 0x23) being written to
++			 * pt_frag_refcount.
+ 			 * Such values violate the assumption that pending and
+ 			 * allocation bits are mutually exclusive, and the rest
+ 			 * of the code unrails as result. That could lead to
+@@ -259,8 +257,8 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ 				bit = mask & 1;		/* =1 -> second 2K */
+ 				if (bit)
+ 					table += PTRS_PER_PTE;
+-				atomic_xor_bits(&page->_refcount,
+-							0x01U << (bit + 24));
++				atomic_xor_bits(&page->pt_frag_refcount,
++							0x01U << bit);
+ 				list_del(&page->lru);
+ 			}
+ 		}
+@@ -281,12 +279,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
+ 	table = (unsigned long *) page_to_virt(page);
+ 	if (mm_alloc_pgste(mm)) {
+ 		/* Return 4K page table with PGSTEs */
+-		atomic_xor_bits(&page->_refcount, 0x03U << 24);
++		atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ 		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
+ 	} else {
+ 		/* Return the first 2K fragment of the page */
+-		atomic_xor_bits(&page->_refcount, 0x01U << 24);
++		atomic_xor_bits(&page->pt_frag_refcount, 0x01U);
+ 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
+ 		spin_lock_bh(&mm->context.lock);
+ 		list_add(&page->lru, &mm->context.pgtable_list);
+@@ -323,22 +321,19 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
+ 		 * will happen outside of the critical section from this
+ 		 * function or from __tlb_remove_table()
+ 		 */
+-		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+-		mask >>= 24;
++		mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ 		if (mask & 0x03U)
+ 			list_add(&page->lru, &mm->context.pgtable_list);
+ 		else
+ 			list_del(&page->lru);
+ 		spin_unlock_bh(&mm->context.lock);
+-		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+-		mask >>= 24;
++		mask = atomic_xor_bits(&page->pt_frag_refcount, 0x10U << bit);
+ 		if (mask != 0x00U)
+ 			return;
+ 		half = 0x01U << bit;
+ 	} else {
+ 		half = 0x03U;
+-		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+-		mask >>= 24;
++		mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ 	}
  
-+/**
-+ * struct ptdesc - Memory descriptor for page tables.
-+ * @__page_flags: Same as page flags. Unused for page tables.
-+ * @pt_list: List of used page tables. Used for s390 and x86.
-+ * @_pt_pad_1: Padding that aliases with page's compound head.
-+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
-+ * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only.
-+ * @pt_mm: Used for x86 pgds.
-+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
-+ * @ptl: Lock for the page table.
-+ *
-+ * This struct overlays struct page for now. Do not modify without a good
-+ * understanding of the issues.
-+ */
-+struct ptdesc {
-+	unsigned long __page_flags;
-+
-+	union {
-+		struct list_head pt_list;
-+		struct {
-+			unsigned long _pt_pad_1;
-+			pgtable_t pmd_huge_pte;
-+		};
-+	};
-+	unsigned long _pt_s390_gaddr;
-+
-+	union {
-+		struct mm_struct *pt_mm;
-+		atomic_t pt_frag_refcount;
-+	};
-+
-+#if ALLOC_SPLIT_PTLOCKS
-+	spinlock_t *ptl;
-+#else
-+	spinlock_t ptl;
-+#endif
-+};
-+
-+#define TABLE_MATCH(pg, pt)						\
-+	static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
-+TABLE_MATCH(flags, __page_flags);
-+TABLE_MATCH(compound_head, pt_list);
-+TABLE_MATCH(compound_head, _pt_pad_1);
-+TABLE_MATCH(mapping, _pt_s390_gaddr);
-+TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
-+TABLE_MATCH(pt_mm, pt_mm);
-+TABLE_MATCH(ptl, ptl);
-+#undef TABLE_MATCH
-+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
-+
- /*
-  * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
-  *
+ 	page_table_release_check(page, table, half, mask);
+@@ -368,8 +363,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+ 	 * outside of the critical section from __tlb_remove_table() or from
+ 	 * page_table_free()
+ 	 */
+-	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+-	mask >>= 24;
++	mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
+ 	if (mask & 0x03U)
+ 		list_add_tail(&page->lru, &mm->context.pgtable_list);
+ 	else
+@@ -391,14 +385,12 @@ void __tlb_remove_table(void *_table)
+ 		return;
+ 	case 0x01U:	/* lower 2K of a 4K page table */
+ 	case 0x02U:	/* higher 2K of a 4K page table */
+-		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+-		mask >>= 24;
++		mask = atomic_xor_bits(&page->pt_frag_refcount, mask << 4);
+ 		if (mask != 0x00U)
+ 			return;
+ 		break;
+ 	case 0x03U:	/* 4K page table with pgstes */
+-		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+-		mask >>= 24;
++		mask = atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
+ 		break;
+ 	}
+ 
 -- 
-2.39.2
+2.40.1
 
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help