[RFC] change non-atomic bitops method
From: Wang, Yalin <hidden>
Date: 2015-02-03 08:42:21
Also in:
linux-arch, lkml
Subsystem:
filesystems (vfs and infrastructure), generic include/asm header files, memory management - core, proc filesystem, the rest · Maintainers:
Alexander Viro, Christian Brauner, Arnd Bergmann, Andrew Morton, David Hildenbrand, Linus Torvalds
-----Original Message----- From: Wang, Yalin Sent: Tuesday, February 03, 2015 3:04 PM To: 'Andrew Morton' Cc: 'Kirill A. Shutemov'; 'arnd at arndb.de'; 'linux-arch at vger.kernel.org'; 'linux-kernel at vger.kernel.org'; 'linux at arm.linux.org.uk'; 'linux-arm- kernel at lists.infradead.org' Subject: RE: [RFC] change non-atomic bitops methodquoted
-----Original Message----- From: Andrew Morton [mailto:akpm at linux-foundation.org] Sent: Tuesday, February 03, 2015 2:39 PM To: Wang, Yalin Cc: 'Kirill A. Shutemov'; 'arnd at arndb.de'; 'linux-arch at vger.kernel.org'; 'linux-kernel at vger.kernel.org'; 'linux at arm.linux.org.uk'; 'linux-arm- kernel at lists.infradead.org' Subject: Re: [RFC] change non-atomic bitops method On Tue, 3 Feb 2015 13:42:45 +0800 "Wang, Yalin"[off-list ref]quoted
wrote:quoted
... #ifdef CHECK_BEFORE_SET if (p[i] != times) #endif ... ---- One run on CPU0, reader thread run on CPU1, Test result: sudo ./cache_test reader:8.426228173 8.672198335 With -DCHECK_BEFORE_SET sudo ./cache_test_check reader:7.537036819 10.799746531You aren't measuring the right thing. You should compare if (p[i] != x) p[i] = x; versus p[i] = x; and you should do this for two cases: a) p[i] == x b) p[i] != x The first code sequence will be slower when (p[i] != x) and faster when (p[i] == x). Next, we should instrument the kernel to work out the frequency of set_bit on an already-set bit. It is only with both these ratios that we can work out whether the patch is a net gain. My suspicion is that set_bit on an already-set bit is so rare that the patch will be a loss.I see, let's change the test a little: 1) memset(p, 0, SIZE); if (p[i] != 0) p[i] = 0; // never called #sudo ./cache_test_check 6.698153838 reader:7.529402625 2) memset(p, 0, SIZE); if (p[i] == 0) p[i] = 0; // always called #sudo ./cache_test_check reader:7.895421311 9.000889973 Thanks
I make a change in kernel to test hit/miss ratio: ---
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645..a82937b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c@@ -2,6 +2,7 @@ #include <linux/hugetlb.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mmzone.h>
@@ -15,6 +16,41 @@ #include <asm/pgtable.h> #include "internal.h" +atomic_t __set_bit_success_count = ATOMIC_INIT(0); +atomic_t __set_bit_miss_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(__set_bit_success_count); +EXPORT_SYMBOL_GPL(__set_bit_miss_count); + +atomic_t __clear_bit_success_count = ATOMIC_INIT(0); +atomic_t __clear_bit_miss_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(__clear_bit_success_count); +EXPORT_SYMBOL_GPL(__clear_bit_miss_count); + +atomic_t __test_and_set_bit_success_count = ATOMIC_INIT(0); +atomic_t __test_and_set_bit_miss_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(__test_and_set_bit_success_count); +EXPORT_SYMBOL_GPL(__test_and_set_bit_miss_count); + +atomic_t __test_and_clear_bit_success_count = ATOMIC_INIT(0); +atomic_t __test_and_clear_bit_miss_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(__test_and_clear_bit_success_count); +EXPORT_SYMBOL_GPL(__test_and_clear_bit_miss_count); + +/* + * atomic bitops + */ +atomic_t set_bit_success_count = ATOMIC_INIT(0); +atomic_t set_bit_miss_count = ATOMIC_INIT(0); + +atomic_t clear_bit_success_count = ATOMIC_INIT(0); +atomic_t clear_bit_miss_count = ATOMIC_INIT(0); + +atomic_t test_and_set_bit_success_count = ATOMIC_INIT(0); +atomic_t test_and_set_bit_miss_count = ATOMIC_INIT(0); + +atomic_t test_and_clear_bit_success_count = ATOMIC_INIT(0); +atomic_t test_and_clear_bit_miss_count = ATOMIC_INIT(0); + void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { }
@@ -165,6 +201,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v) HPAGE_PMD_NR) #endif ); + seq_printf(m, "__set_bit_miss_count:%d __set_bit_success_count:%d\n" + "__clear_bit_miss_count:%d __clear_bit_success_count:%d\n" + "__test_and_set_bit_miss_count:%d __test_and_set_bit_success_count:%d\n" + "__test_and_clear_bit_miss_count:%d __test_and_clear_bit_success_count:%d\n", + atomic_read(&__set_bit_miss_count), atomic_read(&__set_bit_success_count), + atomic_read(&__clear_bit_miss_count), atomic_read(&__clear_bit_success_count), + + atomic_read(&__test_and_set_bit_miss_count), + atomic_read(&__test_and_set_bit_success_count), + + atomic_read(&__test_and_clear_bit_miss_count), + atomic_read(&__test_and_clear_bit_success_count)); hugetlb_report_meminfo(m);
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 697cc2b..1895133 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h@@ -2,7 +2,18 @@ #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ #include <asm/types.h> +#include <asm/atomic.h> +extern atomic_t __set_bit_success_count; +extern atomic_t __set_bit_miss_count; +extern atomic_t __clear_bit_success_count; +extern atomic_t __clear_bit_miss_count; + +extern atomic_t __test_and_set_bit_success_count; +extern atomic_t __test_and_set_bit_miss_count; + +extern atomic_t __test_and_clear_bit_success_count; +extern atomic_t __test_and_clear_bit_miss_count; /** * __set_bit - Set a bit in memory * @nr: the bit to set
@@ -17,7 +28,13 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); - *p |= mask; + if ((*p & mask) == 0) { + atomic_inc(&__set_bit_success_count); + *p |= mask; + } else { + atomic_inc(&__set_bit_miss_count); + } + } static inline void __clear_bit(int nr, volatile unsigned long *addr)
@@ -25,7 +42,12 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); - *p &= ~mask; + if ((*p & mask) != 0) { + atomic_inc(&__clear_bit_success_count); + *p &= ~mask; + } else { + atomic_inc(&__clear_bit_miss_count); + } } /**
@@ -60,7 +82,12 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; - *p = old | mask; + if ((old & mask) == 0) { + atomic_inc(&__test_and_set_bit_success_count); + *p = old | mask; + } else { + atomic_inc(&__test_and_set_bit_miss_count); + } return (old & mask) != 0; }
@@ -79,7 +106,12 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; - *p = old & ~mask; + if ((old & mask) != 0) { + atomic_inc(&__test_and_clear_bit_success_count); + *p = old & ~mask; + } else { + atomic_inc(&__test_and_clear_bit_miss_count); + } return (old & mask) != 0; }
--- After use the phone some time: root at D5303:/ # cat /proc/meminfo VmallocUsed: 10348 kB VmallocChunk: 75632 kB __set_bit_miss_count:10002 __set_bit_success_count:1096661 __clear_bit_miss_count:359484 __clear_bit_success_count:3674617 __test_and_set_bit_miss_count:7 __test_and_set_bit_success_count:221 __test_and_clear_bit_miss_count:924611 __test_and_clear_bit_success_count:193 __test_and_clear_bit_miss_count has a very high miss rate. In fact, I think set/clear/test_and_set(clear)_bit atomic version can also Be investigated to see its miss ratio, I have not tested the atomic version, Because it reside in different architectures. Thanks