Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Peter Zijlstra <peterz@infradead.org>
Date: 2021-12-18 00:38:16
Also in:
linux-arch, linux-mm, linux-s390, lkml, sparclinux
Subsystem:
arm port, arm64 port (aarch64 architecture), memory management, mmu gather and tlb invalidation, the rest, x86 architecture (32-bit and 64-bit) · Maintainers:
Russell King, Catalin Marinas, Will Deacon, Andrew Morton, "Aneesh Kumar K.V", Nick Piggin, Peter Zijlstra, Linus Torvalds, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen
On Fri, Dec 17, 2021 at 11:19:10AM +0300, Nikita Yushchenko wrote:
When batched page table freeing via struct mmu_table_batch is used, the final freeing in __tlb_remove_table_free() executes a loop, calling arch hook __tlb_remove_table() to free each table individually. Shift that loop down to archs. This allows archs to optimize it, by freeing multiple tables in a single release_pages() call. This is faster than individual put_page() calls, especially with memcg accounting enabled. Signed-off-by: Andrey Ryabinin <redacted> Signed-off-by: Nikita Yushchenko <redacted> --- arch/arm/include/asm/tlb.h | 5 ++++ arch/arm64/include/asm/tlb.h | 5 ++++ arch/powerpc/include/asm/book3s/32/pgalloc.h | 8 +++++++ arch/powerpc/include/asm/book3s/64/pgalloc.h | 1 + arch/powerpc/include/asm/nohash/pgalloc.h | 8 +++++++ arch/powerpc/mm/book3s64/pgtable.c | 8 +++++++ arch/s390/include/asm/tlb.h | 1 + arch/s390/mm/pgalloc.c | 8 +++++++ arch/sparc/include/asm/pgalloc_64.h | 8 +++++++ arch/x86/include/asm/tlb.h | 5 ++++ include/asm-generic/tlb.h | 2 +- include/linux/swap.h | 5 +++- mm/mmu_gather.c | 6 +---- mm/swap_state.c | 24 +++++++++++++++----- 14 files changed, 81 insertions(+), 13 deletions(-)
Oh gawd, that's terrible. Never, ever duplicate code like that. I'm thinking the below does the same? But yes, please do as Dave said, give us actual numbers that show this is worth it. --- arch/Kconfig | 4 ++++ arch/arm/Kconfig | 1 + arch/arm/include/asm/tlb.h | 5 ----- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/tlb.h | 5 ----- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlb.h | 4 ---- mm/mmu_gather.c | 22 +++++++++++++++++++--- 8 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 26b8ed11639d..f2bd3f5af2b1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig@@ -415,6 +415,10 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE config MMU_GATHER_TABLE_FREE bool +config MMU_GATHER_TABLE_PAGE + bool + depends on MMU_GATHER_TABLE_FREE + config MMU_GATHER_RCU_TABLE_FREE bool select MMU_GATHER_TABLE_FREE
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f0f9e8bec83a..11baaa5719c2 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig@@ -110,6 +110,7 @@ config ARM select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE + select MMU_GATHER_TABLE_PAGE if MMU select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ select HAVE_STACKPROTECTOR
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..9d9b21649ca0 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h@@ -29,11 +29,6 @@ #include <linux/swap.h> #include <asm/tlbflush.h> -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} - #include <asm-generic/tlb.h> static inline void
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..4aa28fb03f4f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig@@ -196,6 +196,7 @@ config ARM64 select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_TABLE_PAGE select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..401826260a5c 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h@@ -11,11 +11,6 @@ #include <linux/pagemap.h> #include <linux/swap.h> -static inline void __tlb_remove_table(void *_table) -{ - free_page_and_swap_cache((struct page *)_table); -} - #define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b9281fab4e3e..a22e653f4d0e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig@@ -235,6 +235,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_TABLE_PAGE select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..dec5ffa3042a 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h@@ -32,9 +32,5 @@ static inline void tlb_flush(struct mmu_gather *tlb) * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h * for more details. */ -static inline void __tlb_remove_table(void *table) -{ - free_page_and_swap_cache(table); -} #endif /* _ASM_X86_TLB_H */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..0195d0f13ed3 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c@@ -93,13 +93,29 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #ifdef CONFIG_MMU_GATHER_TABLE_FREE -static void __tlb_remove_table_free(struct mmu_table_batch *batch) +#ifdef CONFIG_MMU_GATHER_TABLE_PAGE +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + +static inline void __tlb_remove_tables(void **tables, int nr) +{ + free_pages_and_swap_cache_nolru((struct page **)tables, nr); +} +#else +static inline void __tlb_remove_tables(void **tables, int nr) { int i; - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); + for (i = 0; i < nr; i++) + __tlb_remove_table(tables[i]); +} +#endif +static void __tlb_remove_table_free(struct mmu_table_batch *batch) +{ + __tlb_remove_tables(batch->tables, batch->nr); free_page((unsigned long)batch); }