[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [xen master] x86/bitops: Use the POPCNT instruction when available
commit 6978602334d92378269ca0dcea794ddc83b4dae7 Author: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> AuthorDate: Thu Aug 22 20:18:15 2024 +0100 Commit: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> CommitDate: Mon Sep 9 14:59:05 2024 +0100 x86/bitops: Use the POPCNT instruction when available It has existed in x86 CPUs since 2008, so we're only 16 years late adding support. With all the other scafolding in place, implement arch_hweightl() for x86. The only complication is that the call to arch_generic_hweightl() is behind the compilers back. Address this by writing it in ASM and ensure that it preserves all registers. Copy the code generation from generic_hweightl(). It's not a complicated algorithm, and is easy to regenerate if needs be, but cover it with the same unit tests as test_generic_hweightl() just for piece of mind. Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> Acked-by: Jan Beulich <jbeulich@xxxxxxxx> --- xen/arch/x86/include/asm/bitops.h | 23 +++++++++++++ xen/lib/Makefile | 1 + xen/lib/x86-generic-hweightl.c | 71 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h index 642d8e58b2..39e37f1cbe 100644 --- a/xen/arch/x86/include/asm/bitops.h +++ b/xen/arch/x86/include/asm/bitops.h @@ -6,6 +6,7 @@ */ #include <asm/alternative.h> +#include <asm/asm_defns.h> #include <asm/cpufeatureset.h> /* @@ -475,4 +476,26 @@ static always_inline unsigned int arch_flsl(unsigned long x) } #define arch_flsl arch_flsl +unsigned int arch_generic_hweightl(unsigned long x); + +static always_inline unsigned int arch_hweightl(unsigned long x) +{ + unsigned int r; + + /* + * arch_generic_hweightl() is written in ASM in order to preserve all + * registers, as the compiler can't see the call. + * + * This limits the POPCNT instruction to using the same ABI as a function + * call (input in %rdi, output in %eax) but that's fine. + */ + alternative_io("call arch_generic_hweightl", + "popcnt %[val], %q[res]", X86_FEATURE_POPCNT, + ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT), + [val] "D" (x)); + + return r; +} +#define arch_hweightl arch_hweightl + #endif /* _X86_BITOPS_H */ diff --git a/xen/lib/Makefile b/xen/lib/Makefile index b6558e108b..54440f628a 100644 --- a/xen/lib/Makefile +++ b/xen/lib/Makefile @@ -36,6 +36,7 @@ lib-y += strtol.o lib-y += strtoll.o lib-y += strtoul.o lib-y += strtoull.o +lib-$(CONFIG_X86) += x86-generic-hweightl.o lib-$(CONFIG_X86) += xxhash32.o lib-$(CONFIG_X86) += xxhash64.o diff --git a/xen/lib/x86-generic-hweightl.c b/xen/lib/x86-generic-hweightl.c new file mode 100644 index 0000000000..123a5b4392 --- /dev/null +++ b/xen/lib/x86-generic-hweightl.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include <xen/bitops.h> +#include <xen/init.h> +#include <xen/self-tests.h> + +/* + * An implementation of generic_hweightl() used on hardware without the POPCNT + * instruction. + * + * This function is called from within an ALTERNATIVE in arch_hweightl(). + * i.e. behind the back of the compiler. Therefore all registers are callee + * preserved. + * + * The ASM is what GCC-12 emits for generic_hweightl() in a release build of + * Xen, with spilling of %rdi/%rdx to preserve the callers registers. + * + * Note: When we can use __attribute__((no_caller_saved_registers)) + * unconditionally (GCC 7, Clang 5), we can implement this in plain C. + */ +asm ( + ".type arch_generic_hweightl, STT_FUNC\n\t" + ".globl arch_generic_hweightl\n\t" + ".hidden arch_generic_hweightl\n\t" + ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n" /* CODE_FILL */ + "arch_generic_hweightl:\n\t" + + "push %rdi\n\t" + "push %rdx\n\t" + + "movabs $0x5555555555555555, %rdx\n\t" + "mov %rdi, %rax\n\t" + "shr $1, %rax\n\t" + "and %rdx, %rax\n\t" + "sub %rax, %rdi\n\t" + "movabs $0x3333333333333333, %rax\n\t" + "mov %rdi, %rdx\n\t" + "shr $2, %rdi\n\t" + "and %rax, %rdx\n\t" + "and %rax, %rdi\n\t" + "add %rdi, %rdx\n\t" + "mov %rdx, %rax\n\t" + "shr $4, %rax\n\t" + "add %rdx, %rax\n\t" + "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t" + "and %rdx, %rax\n\t" + "movabs $0x0101010101010101, %rdx\n\t" + "imul %rdx, %rax\n\t" + "shr $" STR(BITS_PER_LONG) "- 8, %rax\n\t" + + "pop %rdx\n\t" + "pop %rdi\n\t" + + "ret\n\t" + + ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t" +); + +#ifdef CONFIG_SELF_TESTS +static void __init __constructor test_arch_generic_hweightl(void) +{ + RUNTIME_CHECK(arch_generic_hweightl, 0, 0); + RUNTIME_CHECK(arch_generic_hweightl, 1, 1); + RUNTIME_CHECK(arch_generic_hweightl, 3, 2); + RUNTIME_CHECK(arch_generic_hweightl, 7, 3); + RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8); + + RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2); + RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG); +} +#endif -- generated by git-patchbot for /home/xen/git/xen.git#master
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |