Re: [PATCH v2] lib: optimize cpumask_next_and()
From: Yury Norov <hidden>
Date: 2017-10-25 14:59:53
Also in:
lkml
Subsystem:
bitmap api, library code, the rest · Maintainers:
Yury Norov, Andrew Morton, Linus Torvalds
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote:
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and(). It's essentially a joined iteration in search for a non-zero bit, which is currently implemented as a lookup join (find a nonzero bit on the lhs, lookup the rhs to see if it's set there). Implement a direct join (find a nonzero bit on the incrementally built join). Direct benchmarking shows that it's 1.17x to 14x faster with a geometric mean of 2.1 on 32 CPUs. No impact on memory usage. Approximate benchmark code:unsigned long src1p[nr_cpumask_longs] = {pattern1}; unsigned long src2p[nr_cpumask_longs] = {pattern2}; for (/*a bunch of repetitions*/) { for (int n = -1; n <= nr_cpu_ids; ++n) { asm volatile("" : "+rm"(src1p)); // prevent any optimization asm volatile("" : "+rm"(src2p)); unsigned long result = cpumask_next_and(n, src1p, src2p); asm volatile("" : "+rm"(result)); } }Signed-off-by: Clement Courbet <redacted> --- Changes in v2: - Refactored _find_next_common_bit into _find_next_bit., as suggested by Yury Norov.
What I actually suggested is make _find_next_and_bit() similar to _find_next_bit(), not to extend _find_next_bit(). But what you did looks OK.
This has no adverse effects on the performance side, as the compiler successfully inlines the code.
I think it's not about inlining, compiler just optimizes out branches known as false at compile-time.
quoted hunk ↗ jump to hunk
include/asm-generic/bitops/find.h | 16 ++++++++++++++ include/linux/bitmap.h | 2 ++ lib/cpumask.c | 9 ++++---- lib/find_bit.c | 37 +++++++++++++++++++++++++-------- tools/include/asm-generic/bitops/find.h | 16 ++++++++++++++ 5 files changed, 67 insertions(+), 13 deletions(-)diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 998d4d544f18..130962f3a264 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h@@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset); #endif +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +extern unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset); +#endif + #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory regiondiff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 700cf5f67118..b4606bfda52f 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h@@ -77,6 +77,8 @@ * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit + * find_next_and_bit(addr1, addr2, nbits, bit) Same as find_first_bit, but in + * (*addr1 & *addr2) */ /*diff --git a/lib/cpumask.c b/lib/cpumask.c index 8b1a1bd77539..5602223837fa 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c@@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next); int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) - if (cpumask_test_cpu(n, src2p)) - break; - return n; + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), + nr_cpumask_bits, n + 1); } EXPORT_SYMBOL(cpumask_next_and);diff --git a/lib/find_bit.c b/lib/find_bit.c index 6ed74f78380c..ebc08fd9fdf8 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c@@ -24,19 +24,25 @@ #if !defined(find_next_bit) || !defined(find_next_zero_bit) /* - * This is a common helper function for find_next_bit and - * find_next_zero_bit. The difference is the "invert" argument, which - * is XORed with each fetched word before searching it for one bits. + * This is a common helper function for find_next_bit, find_next_zero_bit, and + * find_next_and_bit. The differences are: + * - The "invert" argument, which is XORed with each fetched word before + * searching it for one bits. + * - The optional "addr2", which is anded with "addr1" if present. */ -static unsigned long _find_next_bit(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) +static unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert) { unsigned long tmp; if (unlikely(start >= nbits)) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; /* Handle 1st word. */ tmp &= BITMAP_FIRST_WORD_MASK(start);@@ -47,7 +53,10 @@ static unsigned long _find_next_bit(const unsigned long *addr, if (start >= nbits) return nbits; - tmp = addr[start / BITS_PER_LONG] ^ invert; + tmp = addr1[start / BITS_PER_LONG]; + if (addr2) + tmp &= addr2[start / BITS_PER_LONG]; + tmp ^= invert; } return min(start + __ffs(tmp), nbits);@@ -61,7 +70,7 @@ static unsigned long _find_next_bit(const unsigned long *addr, unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, 0UL); + return _find_next_bit(addr, NULL, size, offset, 0UL); } EXPORT_SYMBOL(find_next_bit); #endif@@ -70,11 +79,21 @@ EXPORT_SYMBOL(find_next_bit); unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - return _find_next_bit(addr, size, offset, ~0UL); + return _find_next_bit(addr, NULL, size, offset, ~0UL); } EXPORT_SYMBOL(find_next_zero_bit); #endif +#if !defined(find_next_and_bit) +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start)
It should be: unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset)
+{
+ return _find_next_bit(addr1, addr2, size, offset, ~0UL);
+}
+EXPORT_SYMBOL(find_next_and_bit);
+#endif
+
#ifndef find_first_bit
/*
* Find the first set bit in a memory region.If we continue this way, we'll most probably end up like this, sooner or later:
diff --git a/lib/find_bit.c b/lib/find_bit.c
index ebc08fd9fdf8..1b0b4aedc93a 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c@@ -31,8 +31,12 @@ * - The optional "addr2", which is anded with "addr1" if present. */ static unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert) + const unsigned long *and, + const unsigned long *or, + const unsigned long *xor, + unsigned long nbits, + unsigned long start, + unsigned long invert) { unsigned long tmp;
@@ -40,8 +44,12 @@ static unsigned long _find_next_bit(const unsigned long *addr1, return nbits; tmp = addr1[start / BITS_PER_LONG]; - if (addr2) - tmp &= addr2[start / BITS_PER_LONG]; + if (and) + tmp &= and[start / BITS_PER_LONG]; + if (or) + tmp |= or[start / BITS_PER_LONG]; + if (xor) + tmp ^= xor[start / BITS_PER_LONG]; tmp ^= invert; /* Handle 1st word. */
@@ -54,8 +62,12 @@ static unsigned long _find_next_bit(const unsigned long *addr1, return nbits; tmp = addr1[start / BITS_PER_LONG]; - if (addr2) - tmp &= addr2[start / BITS_PER_LONG]; + if (and) + tmp &= and[start / BITS_PER_LONG]; + if (or) + tmp |= or[start / BITS_PER_LONG]; + if (xor) + tmp ^= xor[start / BITS_PER_LONG]; tmp ^= invert; }
Just a fantasy of course. I'm generally fine to proceed this way. It makes _find_next_bit() more complex, but lets us avoid code duplication, which is better deal for long-term maintenance. But I'd like also to keep _find_next_bit() consistent with _find_next_bit_le() Could you please send v3 with fixed find_next_and_bit() declaration, and synced LE routines? Yury