Thread (9 messages) 9 messages, 4 authors, 2021-12-23

Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()

From: Peter Zijlstra <peterz@infradead.org>
Date: 2021-12-18 00:38:16
Also in: linux-arch, linux-mm, linux-s390, lkml, sparclinux
Subsystem: arm port, arm64 port (aarch64 architecture), memory management, mmu gather and tlb invalidation, the rest, x86 architecture (32-bit and 64-bit) · Maintainers: Russell King, Catalin Marinas, Will Deacon, Andrew Morton, "Aneesh Kumar K.V", Nick Piggin, Peter Zijlstra, Linus Torvalds, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen

On Fri, Dec 17, 2021 at 11:19:10AM +0300, Nikita Yushchenko wrote:
When batched page table freeing via struct mmu_table_batch is used, the
final freeing in __tlb_remove_table_free() executes a loop, calling
arch hook __tlb_remove_table() to free each table individually.

Shift that loop down to archs. This allows archs to optimize it, by
freeing multiple tables in a single release_pages() call. This is
faster than individual put_page() calls, especially with memcg
accounting enabled.

Signed-off-by: Andrey Ryabinin <redacted>
Signed-off-by: Nikita Yushchenko <redacted>
---
 arch/arm/include/asm/tlb.h                   |  5 ++++
 arch/arm64/include/asm/tlb.h                 |  5 ++++
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  8 +++++++
 arch/powerpc/include/asm/book3s/64/pgalloc.h |  1 +
 arch/powerpc/include/asm/nohash/pgalloc.h    |  8 +++++++
 arch/powerpc/mm/book3s64/pgtable.c           |  8 +++++++
 arch/s390/include/asm/tlb.h                  |  1 +
 arch/s390/mm/pgalloc.c                       |  8 +++++++
 arch/sparc/include/asm/pgalloc_64.h          |  8 +++++++
 arch/x86/include/asm/tlb.h                   |  5 ++++
 include/asm-generic/tlb.h                    |  2 +-
 include/linux/swap.h                         |  5 +++-
 mm/mmu_gather.c                              |  6 +----
 mm/swap_state.c                              | 24 +++++++++++++++-----
 14 files changed, 81 insertions(+), 13 deletions(-)
Oh gawd, that's terrible. Never, ever duplicate code like that.

I'm thinking the below does the same? But yes, please do as Dave said,
give us actual numbers that show this is worth it.

---
 arch/Kconfig                 |  4 ++++
 arch/arm/Kconfig             |  1 +
 arch/arm/include/asm/tlb.h   |  5 -----
 arch/arm64/Kconfig           |  1 +
 arch/arm64/include/asm/tlb.h |  5 -----
 arch/x86/Kconfig             |  1 +
 arch/x86/include/asm/tlb.h   |  4 ----
 mm/mmu_gather.c              | 22 +++++++++++++++++++---
 8 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 26b8ed11639d..f2bd3f5af2b1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -415,6 +415,10 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE
 config MMU_GATHER_TABLE_FREE
 	bool
 
+config MMU_GATHER_TABLE_PAGE
+	bool
+	depends on MMU_GATHER_TABLE_FREE
+
 config MMU_GATHER_RCU_TABLE_FREE
 	bool
 	select MMU_GATHER_TABLE_FREE
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f0f9e8bec83a..11baaa5719c2 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -110,6 +110,7 @@ config ARM
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
+	select MMU_GATHER_TABLE_PAGE if MMU
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..9d9b21649ca0 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -29,11 +29,6 @@
 #include <linux/swap.h>
 #include <asm/tlbflush.h>
 
-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}
-
 #include <asm-generic/tlb.h>
 
 static inline void
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..4aa28fb03f4f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -196,6 +196,7 @@ config ARM64
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_TABLE_PAGE
 	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..401826260a5c 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -11,11 +11,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 
-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}
-
 #define tlb_flush tlb_flush
 static void tlb_flush(struct mmu_gather *tlb);
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b9281fab4e3e..a22e653f4d0e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -235,6 +235,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE		if PARAVIRT
+	select MMU_GATHER_TABLE_PAGE
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..dec5ffa3042a 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -32,9 +32,5 @@ static inline void tlb_flush(struct mmu_gather *tlb)
  * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
  * for more details.
  */
-static inline void __tlb_remove_table(void *table)
-{
-	free_page_and_swap_cache(table);
-}
 
 #endif /* _ASM_X86_TLB_H */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..0195d0f13ed3 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -93,13 +93,29 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 
-static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+#ifdef CONFIG_MMU_GATHER_TABLE_PAGE
+static inline void __tlb_remove_table(void *table)
+{
+	free_page_and_swap_cache(table);
+}
+
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	free_pages_and_swap_cache_nolru((struct page **)tables, nr);
+}
+#else
+static inline void __tlb_remove_tables(void **tables, int nr)
 {
 	int i;
 
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+#endif
 
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+	__tlb_remove_tables(batch->tables, batch->nr);
 	free_page((unsigned long)batch);
 }
 
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help