[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 4/4] x86: use POPCNT for hweight<N>() when available
This is faster than using the software implementation, and the insn is available on all half-way recent hardware. Therefore convert generic_hweight<N>() to out-of-line functions (without affecting Arm) and use alternatives patching to replace the function calls. Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- Note: Using "g" instead of "X" as the dummy constraint in hweight64() and hweight32(), contrary to expectations, produces slightly better code with gcc 8. --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -31,6 +31,7 @@ obj-y += emul-i8254.o obj-y += extable.o obj-y += flushtlb.o obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o +obj-y += hweight.o obj-y += hypercall.o obj-y += i387.o obj-y += i8259.o @@ -245,6 +246,9 @@ boot/mkelf32: boot/mkelf32.c efi/mkreloc: efi/mkreloc.c $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< +nocov-y += hweight.o +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) + .PHONY: clean clean:: rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32 --- /dev/null +++ b/xen/arch/x86/hweight.c @@ -0,0 +1,28 @@ +#define generic_hweight64 _hweight64 +#define generic_hweight32 _hweight32 +#define generic_hweight16 _hweight16 +#define generic_hweight8 _hweight8 + +#include <xen/compiler.h> + +#undef inline +#define inline always_inline + +#include <xen/bitops.h> + +#undef generic_hweight8 +#undef generic_hweight16 +#undef generic_hweight32 +#undef generic_hweight64 + +#define HWEIGHT(n) \ +typeof(_hweight##n) generic_hweight##n; \ +unsigned int generic_hweight##n(typeof((uint##n##_t)0 + 0U) x) \ +{ \ + return _hweight##n(x); \ +} + +HWEIGHT(64) +HWEIGHT(32) +HWEIGHT(16) +HWEIGHT( 8) --- a/xen/include/asm-x86/bitops.h +++ b/xen/include/asm-x86/bitops.h @@ -469,15 +469,35 @@ static inline int fls(unsigned int x) return r + 1; } +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */ +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7" +#define POPCNT_32 ".byte 
0xF3, 0x0F, 0xB8, 0xC7" + /** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * * The Hamming Weight of a number is the total number of bits set in it. */ -#define hweight64(x) generic_hweight64(x) -#define hweight32(x) generic_hweight32(x) -#define hweight16(x) generic_hweight16(x) -#define hweight8(x) generic_hweight8(x) +#define hweight_(n, x, insn, setup, cout, cin) ({ \ + unsigned int res_; \ + /* \ + * For the function call the POPCNT input register needs to be marked \ + * modified as well. Set up a local variable of appropriate type \ + * for this purpose. \ + */ \ + typeof((uint##n##_t)(x) + 0U) val_ = (x); \ + alternative_io(setup "; call generic_hweight" #n, \ + insn, X86_FEATURE_POPCNT, \ + ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \ + [src] cin (val_)); \ + res_; \ +}) +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g") +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g") +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \ + "mov %[src], %[val]", "=&D", "rm") +#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \ + "mov %[src], %[val]", "=&D", "rm") #endif /* _X86_BITOPS_H */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |