Xen project Mailing List

Re: [PATCH v2] x86: use POPCNT for hweight<N>() when available

From: Roger Pau Monné <roger.pau@xxxxxxxxxx>

Date: Thu, 14 May 2020 16:05:22 +0200

Authentication-results: esa3.hc3370-68.iphmx.com; dkim=none (message not signed) header.i=none; spf=None smtp.pra=roger.pau@xxxxxxxxxx; spf=Pass smtp.mailfrom=roger.pau@xxxxxxxxxx; spf=None smtp.helo=postmaster@xxxxxxxxxxxxxxx; dmarc=pass (p=none dis=none) d=citrix.com

Cc: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

Delivery-date: Thu, 14 May 2020 14:06:05 +0000

Ironport-sdr: fyyBCVrePwR4GBdsysZEJAQpa9Epxy7xNAIGXW3MNyuZL0JMQzIFGuIgRmCxvvWgu0zuge19E2 3+5Xf8n3oSEdTnP9jlCUERB+jNWCIslu7E9QYBcbRiuFqilfvK4JKRcplh6SoOujTCNIGzmN3N aeBXQUOk3jd2LvZYEWkXeoplUo6i6qJYcOgpDIEKbLYfOzFbEzilUMt46eQumeu8ChdL6OlmDw 3WTnDvH1MziiUNbLYEF1wnphpYBleq5kZlzwV5NohTKL4IgY2AvtBnThZowrz+QsFiSTT+3JEQ sRQ=

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > This is faster than using the software implementation, and the insn is > available on all half-way recent hardware. Therefore convert > generic_hweight<N>() to out-of-line functions (without affecting Arm) > and use alternatives patching to replace the function calls. > > Note that the approach doesn#t work for clang, due to it not recognizing > -ffixed-*. > > Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> > Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> > --- > v2: Also suppress UB sanitizer instrumentation. Reduce macroization in > hweight.c. Exclude clang builds. > --- > Note: Using "g" instead of "X" as the dummy constraint in hweight64() > and hweight32(), other than expected, produces slightly better > code with gcc 8. > > --- a/xen/arch/x86/Makefile > +++ b/xen/arch/x86/Makefile > @@ -31,6 +31,10 @@ obj-y += emul-i8254.o > obj-y += extable.o > obj-y += flushtlb.o > obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o > +# clang doesn't appear to know of -ffixed-* > +hweight-$(gcc) := hweight.o > +hweight-$(clang) := > +obj-y += $(hweight-y) > obj-y += hypercall.o > obj-y += i387.o > obj-y += i8259.o > @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > efi/mkreloc: efi/mkreloc.c > $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > > +nocov-y += hweight.o > +noubsan-y += hweight.o > +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) Why not use clobbers in the asm to list the scratch registers? Is it that much expensive? > + > .PHONY: clean > clean:: > rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32 > --- /dev/null > +++ b/xen/arch/x86/hweight.c > @@ -0,0 +1,21 @@ > +#define generic_hweight64 _hweight64 > +#define generic_hweight32 _hweight32 > +#define generic_hweight16 _hweight16 > +#define generic_hweight8 _hweight8 > + > +#include <xen/compiler.h> > + > +#undef inline > +#define inline always_inline > + > +#include <xen/bitops.h> > + > +#undef generic_hweight8 > +#undef generic_hweight16 > +#undef generic_hweight32 > +#undef generic_hweight64 > + > +unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); } > +unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); } > +unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); } > +unsigned int generic_hweight64(uint64_t x) { return _hweight64(x); } > --- a/xen/include/asm-x86/bitops.h > +++ b/xen/include/asm-x86/bitops.h > @@ -475,9 +475,36 @@ static inline int fls(unsigned int x) > * > * The Hamming Weight of a number is the total number of bits set in it. > */ > +#ifndef __clang__ > +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */ > +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7" > +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7" > + > +#define hweight_(n, x, insn, setup, cout, cin) ({ \ > + unsigned int res_; \ > + /* \ > + * For the function call the POPCNT input register needs to be marked \ > + * modified as well. Set up a local variable of appropriate type \ > + * for this purpose. \ > + */ \ > + typeof((uint##n##_t)(x) + 0U) val_ = (x); \ > + alternative_io(setup "; call generic_hweight" #n, \ > + insn, X86_FEATURE_POPCNT, \ > + ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \ > + [src] cin (val_)); \ > + res_; \ > +}) > +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g") > +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g") > +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \ > + "mov %[src], %[val]", "=&D", "rm") > +#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \ > + "mov %[src], %[val]", "=&D", "rm") Why not just convert types < 32bits into uint32_t and avoid the asm prefix? You are already converting them in hweight_ to an uintX_t. That would made the asm simpler as you would then only need to make sure the input is in %rdi and the output is fetched from %rax? Thanks, Roger.

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.