[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 9/9] x86/bitops: Use the POPCNT instruction when available



It has existed in x86 CPUs since 2008, so we're only 16 years late adding
support.  With all the other scaffolding in place, implement arch_hweightl()
for x86.

The only complication is that the call to arch_generic_hweightl() is behind
the compiler's back.  Address this by writing it in ASM and ensuring that it
preserves all registers.

Copy the code generation from generic_hweightl().  It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for peace of mind.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>

A few RFC points.

 * I thought we had an x86 general lib-y but I can't find one, hence why it's
   still in xen/lib/ for now.

 * When we up the minimum toolchain to GCC 7 / Clang 5, we can use a
   __attribute__((no_caller_saved_registers)) and can forgo writing this in asm.

   GCC seems to need extra help, and wants -mgeneral-regs-only too.  It has a
   habit of complaining about incompatible instructions even when it's not
   emitting them.
---
 xen/arch/x86/include/asm/bitops.h | 21 ++++++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/arch-generic-hweightl.S   | 46 +++++++++++++++++++++++++++++++
 xen/lib/generic-hweightl.c        | 15 ++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 xen/lib/arch-generic-hweightl.S

diff --git a/xen/arch/x86/include/asm/bitops.h 
b/xen/arch/x86/include/asm/bitops.h
index 642d8e58b288..0db698ed3f4c 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -6,6 +6,7 @@
  */
 
 #include <asm/alternative.h>
+#include <asm/asm_defns.h>
 #include <asm/cpufeatureset.h>
 
 /*
@@ -475,4 +476,24 @@ static always_inline unsigned int arch_flsl(unsigned long 
x)
 }
 #define arch_flsl arch_flsl
 
+/*
+ * Population count: return the number of set bits in @x.
+ *
+ * Uses the POPCNT instruction when the CPU advertises it, falling back to
+ * a call to the register-preserving arch_generic_hweightl() otherwise.
+ */
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+    unsigned int r;
+
+    /*
+     * arch_generic_hweightl() is written in ASM in order to preserve all
+     * registers, as the compiler can't see the call.
+     *
+     * This limits the POPCNT instruction to using the same ABI as a function
+     * call (input in %rdi, output in %eax) but that's fine.
+     */
+    alternative_io("call arch_generic_hweightl",
+                   "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+                   ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+                   [val] "D" (x));
+
+    return r;
+}
+#define arch_hweightl arch_hweightl
+
 #endif /* _X86_BITOPS_H */
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index b6558e108bd9..84d731dc0ac8 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_X86) += x86/
 
+lib-$(CONFIG_X86) += arch-generic-hweightl.o
 lib-y += bsearch.o
 lib-y += ctors.o
 lib-y += ctype.o
diff --git a/xen/lib/arch-generic-hweightl.S b/xen/lib/arch-generic-hweightl.S
new file mode 100644
index 000000000000..15c6e3394845
--- /dev/null
+++ b/xen/lib/arch-generic-hweightl.S
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+        .file __FILE__
+
+#include <xen/linkage.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl().
+ * i.e. behind the back of the compiler.  Therefore all registers are callee
+ * preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the caller's registers.
+ *
+ * Input in %rdi, result in %eax.  This is the standard SWAR popcount:
+ * accumulate per-2-bit, per-4-bit, then per-byte counts, and sum the bytes
+ * with a multiply.
+ */
+FUNC(arch_generic_hweightl)
+        /* Preserve the two registers clobbered besides %rax (return value). */
+        push   %rdi
+        push   %rdx
+
+        /* x -= (x >> 1) & 0x5555...; 2-bit fields now hold their popcount. */
+        movabs $0x5555555555555555, %rdx
+        mov    %rdi, %rax
+        shr    $1, %rax
+        and    %rdx, %rax
+        sub    %rax, %rdi
+        /* x = (x & 0x3333...) + ((x >> 2) & 0x3333...); 4-bit counts. */
+        movabs $0x3333333333333333, %rax
+        mov    %rdi, %rdx
+        shr    $0x2, %rdi
+        and    %rax, %rdx
+        and    %rax, %rdi
+        add    %rdi, %rdx
+        /* x = (x + (x >> 4)) & 0x0f0f...; per-byte counts. */
+        mov    %rdx, %rax
+        shr    $0x4, %rax
+        add    %rdx, %rax
+        movabs $0xf0f0f0f0f0f0f0f, %rdx
+        and    %rdx, %rax
+        /* Multiply by 0x0101... to sum all bytes into the top byte, then
+         * shift it down.  0x38 == 64 - 8. */
+        movabs $0x101010101010101, %rdx
+        imul   %rdx, %rax
+        shr    $0x38, %rax
+
+        pop    %rdx
+        pop    %rdi
+
+        ret
+END(arch_generic_hweightl)
diff --git a/xen/lib/generic-hweightl.c b/xen/lib/generic-hweightl.c
index fa4bbec273ab..4b39dd84de5e 100644
--- a/xen/lib/generic-hweightl.c
+++ b/xen/lib/generic-hweightl.c
@@ -43,4 +43,19 @@ static void __init __constructor test_generic_hweightl(void)
     RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
     RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG);
 }
+
+#ifdef CONFIG_X86
+/* Forward declaration: normally only reached via ALTERNATIVE in ASM. */
+unsigned int arch_generic_hweightl(unsigned long);
+/*
+ * Boot-time self-test: run the ASM fallback implementation through the same
+ * cases as test_generic_hweightl() above, since the two must agree.
+ */
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+    RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+    RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+    RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+    RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+    RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif /* CONFIG_X86 */
 #endif /* CONFIG_SELF_TESTS */
-- 
2.39.2




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.