Thread (20 messages) 20 messages, 8 authors, 2015-02-10

[RFC] change non-atomic bitops method

From: Wang, Yalin <hidden>
Date: 2015-02-03 08:42:21
Also in: linux-arch, lkml
Subsystem: filesystems (vfs and infrastructure), generic include/asm header files, memory management - core, proc filesystem, the rest · Maintainers: Alexander Viro, Christian Brauner, Arnd Bergmann, Andrew Morton, David Hildenbrand, Linus Torvalds

-----Original Message-----
From: Wang, Yalin
Sent: Tuesday, February 03, 2015 3:04 PM
To: 'Andrew Morton'
Cc: 'Kirill A. Shutemov'; 'arnd at arndb.de'; 'linux-arch at vger.kernel.org';
'linux-kernel at vger.kernel.org'; 'linux at arm.linux.org.uk'; 'linux-arm-
kernel at lists.infradead.org'
Subject: RE: [RFC] change non-atomic bitops method
quoted
-----Original Message-----
From: Andrew Morton [mailto:akpm at linux-foundation.org]
Sent: Tuesday, February 03, 2015 2:39 PM
To: Wang, Yalin
Cc: 'Kirill A. Shutemov'; 'arnd at arndb.de'; 'linux-arch at vger.kernel.org';
'linux-kernel at vger.kernel.org'; 'linux at arm.linux.org.uk'; 'linux-arm-
kernel at lists.infradead.org'
Subject: Re: [RFC] change non-atomic bitops method

On Tue, 3 Feb 2015 13:42:45 +0800 "Wang, Yalin"
[off-list ref]
quoted
wrote:
quoted
...

#ifdef CHECK_BEFORE_SET
			if (p[i] != times)
#endif

...

----
One run on CPU0, reader thread run on CPU1,
Test result:
sudo ./cache_test
reader:8.426228173
8.672198335

With -DCHECK_BEFORE_SET
sudo ./cache_test_check
reader:7.537036819
10.799746531
You aren't measuring the right thing.  You should compare

	if (p[i] != x)
		p[i] = x;

versus

	p[i] = x;

and you should do this for two cases:

a) p[i] == x

b) p[i] != x


The first code sequence will be slower when (p[i] != x) and faster when
(p[i] == x).


Next, we should instrument the kernel to work out the frequency of
set_bit on an already-set bit.

It is only with both these ratios that we can work out whether the
patch is a net gain.  My suspicion is that set_bit on an already-set
bit is so rare that the patch will be a loss.
I see, let's change the test a little:
1)
	memset(p, 0, SIZE);
	if (p[i] != 0)
		p[i] = 0;  // never called

	#sudo ./cache_test_check
	6.698153838
	reader:7.529402625


2)
	memset(p, 0, SIZE);
	if (p[i] == 0)
		p[i] = 0; // always called
	#sudo ./cache_test_check
	reader:7.895421311
	9.000889973

Thanks
I make a change in kernel to test hit/miss ratio:
---
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645..a82937b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -2,6 +2,7 @@
 #include <linux/hugetlb.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
@@ -15,6 +16,41 @@
 #include <asm/pgtable.h>
 #include "internal.h"
 
+atomic_t __set_bit_success_count = ATOMIC_INIT(0);
+atomic_t __set_bit_miss_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(__set_bit_success_count);
+EXPORT_SYMBOL_GPL(__set_bit_miss_count);
+
+atomic_t __clear_bit_success_count = ATOMIC_INIT(0);
+atomic_t __clear_bit_miss_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(__clear_bit_success_count);
+EXPORT_SYMBOL_GPL(__clear_bit_miss_count);
+
+atomic_t __test_and_set_bit_success_count = ATOMIC_INIT(0);
+atomic_t __test_and_set_bit_miss_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(__test_and_set_bit_success_count);
+EXPORT_SYMBOL_GPL(__test_and_set_bit_miss_count);
+
+atomic_t __test_and_clear_bit_success_count = ATOMIC_INIT(0);
+atomic_t __test_and_clear_bit_miss_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(__test_and_clear_bit_success_count);
+EXPORT_SYMBOL_GPL(__test_and_clear_bit_miss_count);
+
+/*
+ * atomic bitops
+ */
+atomic_t set_bit_success_count = ATOMIC_INIT(0);
+atomic_t set_bit_miss_count = ATOMIC_INIT(0);
+
+atomic_t clear_bit_success_count = ATOMIC_INIT(0);
+atomic_t clear_bit_miss_count = ATOMIC_INIT(0);
+
+atomic_t test_and_set_bit_success_count = ATOMIC_INIT(0);
+atomic_t test_and_set_bit_miss_count = ATOMIC_INIT(0);
+
+atomic_t test_and_clear_bit_success_count = ATOMIC_INIT(0);
+atomic_t test_and_clear_bit_miss_count = ATOMIC_INIT(0);
+
 void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
 {
 }
@@ -165,6 +201,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		   HPAGE_PMD_NR)
 #endif
 		);
+	seq_printf(m,   "__set_bit_miss_count:%d __set_bit_success_count:%d\n"
+			"__clear_bit_miss_count:%d __clear_bit_success_count:%d\n"
+			"__test_and_set_bit_miss_count:%d __test_and_set_bit_success_count:%d\n"
+			"__test_and_clear_bit_miss_count:%d __test_and_clear_bit_success_count:%d\n",
+			atomic_read(&__set_bit_miss_count), atomic_read(&__set_bit_success_count),
+			atomic_read(&__clear_bit_miss_count), atomic_read(&__clear_bit_success_count),
+
+			atomic_read(&__test_and_set_bit_miss_count),
+			atomic_read(&__test_and_set_bit_success_count),
+
+			atomic_read(&__test_and_clear_bit_miss_count),
+			atomic_read(&__test_and_clear_bit_success_count));
 
 	hugetlb_report_meminfo(m);
 
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 697cc2b..1895133 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -2,7 +2,18 @@
 #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 
 #include <asm/types.h>
+#include <asm/atomic.h>
+extern atomic_t __set_bit_success_count;
+extern atomic_t __set_bit_miss_count;
 
+extern atomic_t __clear_bit_success_count;
+extern atomic_t __clear_bit_miss_count;
+
+extern atomic_t __test_and_set_bit_success_count;
+extern atomic_t __test_and_set_bit_miss_count;
+
+extern atomic_t __test_and_clear_bit_success_count;
+extern atomic_t __test_and_clear_bit_miss_count;
 /**
  * __set_bit - Set a bit in memory
  * @nr: the bit to set
@@ -17,7 +28,13 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
 
-	*p  |= mask;
+	if ((*p & mask) == 0) {
+		atomic_inc(&__set_bit_success_count);
+		*p  |= mask;
+	} else {
+		atomic_inc(&__set_bit_miss_count);
+	}
+
 }
 
 static inline void __clear_bit(int nr, volatile unsigned long *addr)
@@ -25,7 +42,12 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
 
-	*p &= ~mask;
+	if ((*p & mask) != 0) {
+		atomic_inc(&__clear_bit_success_count);
+		*p &= ~mask;
+	} else {
+		atomic_inc(&__clear_bit_miss_count);
+	}
 }
 
 /**
@@ -60,7 +82,12 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
 	unsigned long old = *p;
 
-	*p = old | mask;
+	if ((old & mask) == 0) {
+		atomic_inc(&__test_and_set_bit_success_count);
+		*p = old | mask;
+	} else {
+		atomic_inc(&__test_and_set_bit_miss_count);
+	}
 	return (old & mask) != 0;
 }
 
@@ -79,7 +106,12 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
 	unsigned long old = *p;
 
-	*p = old & ~mask;
+	if ((old & mask) != 0) {
+		atomic_inc(&__test_and_clear_bit_success_count);
+		*p = old & ~mask;
+	} else {
+		atomic_inc(&__test_and_clear_bit_miss_count);
+	}
 	return (old & mask) != 0;
 }
---
After use the phone some time:
root at D5303:/ # cat /proc/meminfo
VmallocUsed:       10348 kB
VmallocChunk:      75632 kB
__set_bit_miss_count:10002 __set_bit_success_count:1096661
__clear_bit_miss_count:359484 __clear_bit_success_count:3674617
__test_and_set_bit_miss_count:7 __test_and_set_bit_success_count:221
__test_and_clear_bit_miss_count:924611 __test_and_clear_bit_success_count:193

__test_and_clear_bit_miss_count has a very high miss rate.
In fact, I think set/clear/test_and_set(clear)_bit atomic version can also
Be investigated to see its miss ratio,
I have not tested the atomic version,
Because it reside in different architectures.

Thanks
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help