Xen project Mailing List

[Xen-devel] [PATCH v7 47/49] x86emul: support GFNI insns

To: "xen-devel" <xen-devel@xxxxxxxxxxxxxxxxxxxx>

From: "Jan Beulich" <JBeulich@xxxxxxxx>

Date: Wed, 19 Dec 2018 08:07:29 -0700

Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wei.liu2@xxxxxxxxxx>, Roger Pau Monne <roger.pau@xxxxxxxxxx>

Delivery-date: Wed, 19 Dec 2018 15:07:35 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Note that the ISA extensions document revision 035 is ambiguous regarding fault suppression for VGF2P8MULB: Text says it's supported, while the exception specification listed is E4NF. Given the wording here and for the other two insns I'm inclined to trust the text more than the exception reference, which was also confirmed informally. As to the feature dependency adjustment, while strictly speaking SSE is a sufficient prereq (to have XMM registers), vectors of bytes and qwords have got introduced only with SSE2. gcc, for example, uses a similar connection in its respective intrinsics header. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v7: New. --- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -22,7 +22,8 @@ CFLAGS += $(CFLAGS_xeninclude) SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi FMA := fma4 fma SG := avx2-sg avx512f-sg avx512vl-sg -TESTCASES := blowfish $(SIMD) $(FMA) $(SG) +GF := sse2-gf avx2-gf avx512bw-gf +TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF) OPMASK := avx512f avx512dq avx512bw @@ -145,12 +146,17 @@ $(1)-cflags := \ $(foreach flt,$($(1)-flts), \ "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)"))) endef +define simd-gf-defs +$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \ + "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)") +endef define opmask-defs $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)") endef $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor)))) $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor)))) +$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor)))) $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor)))) first-string = $(shell for s in $(1); do echo "$$s"; break; done) @@ -200,6 +206,9 @@ $(addsuffix .c,$(FMA)): $(addsuffix .c,$(SG)): ln -sf simd-sg.c $@ +$(addsuffix .c,$(GF)): + ln -sf simd-gf.c $@ + $(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h xop.h avx512f.h: simd-fma.c --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -591,6 +591,12 @@ static const struct test avx512_vpopcntd INSN(popcnt, 66, 0f38, 55, vl, dq, vl) }; +static const struct test gfni_all[] = { + INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl), + INSN(gf2p8affineqb, 66, 0f3a, ce, vl, q, vl), + INSN(gf2p8mulb, 66, 0f38, cf, vl, b, vl), +}; + /* * The uses of b in this table are simply (one of) the shortest form(s) of * saying "no broadcast" without introducing a 128-bit granularity enumerator. @@ -987,6 +993,7 @@ void evex_disp8_test(void *instr, struct if ( cpu_has_avx512f ) { + RUN(gfni, all); RUN(vaes, all); RUN(vpclmulqdq, all); } --- /dev/null +++ b/tools/tests/x86_emulator/simd-gf.c @@ -0,0 +1,80 @@ +#define UINT_SIZE 1 + +#include "simd.h" +ENTRY(gf_test); + +#if VEC_SIZE == 16 +# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a) +#elif VEC_SIZE == 32 +# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a) +#elif VEC_SIZE == 64 +# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a) +#endif + +#if defined(__AVX512F__) +# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT)) +# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE) +# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0) +# define transform(m, dir, x, c) ({ \ + vec_t t_; \ + asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \ + : [dst] "=v" (t_) \ + : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \ + t_; \ +}) +#else +# if defined(__AVX2__) +# define bcstq(x) ({ \ + vdi_t t_; \ + asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \ + t_; \ +}) +# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0) +# else +# define bcstq(x) ((vdi_t){x, x}) +# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff) +# endif +# define eq(x, y) to_bool((x) == (y)) +# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y)) +# define transform(m, dir, x, c) ({ \ + vdi_t m_ = bcstq(m); \ + touch(m_); \ + ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \ +}) +#endif + +const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL; + +int gf_test(void) +{ + unsigned int i; + vec_t src, one; + + for ( i = 0; i < ELEM_COUNT; ++i ) + { + src[i] = i; + one[i] = 1; + } + + /* Special case for first iteration. */ + one[0] = 0; + + do { + vec_t inv = transform(ident, inv, src, 0); + + touch(src); + touch(inv); + if ( !eq(mul(src, inv), one) ) return __LINE__; + + touch(src); + touch(inv); + if ( !eq(mul(inv, src), one) ) return __LINE__; + + one[0] = 1; + + src += ELEM_COUNT; + i += ELEM_COUNT; + } while ( i < 256 ); + + return 0; +} --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -11,12 +11,14 @@ asm ( ".pushsection .test, \"ax\", @prog #include "3dnow.h" #include "sse.h" #include "sse2.h" +#include "sse2-gf.h" #include "sse4.h" #include "avx.h" #include "fma4.h" #include "fma.h" #include "avx2.h" #include "avx2-sg.h" +#include "avx2-gf.h" #include "xop.h" #include "avx512f-opmask.h" #include "avx512dq-opmask.h" @@ -25,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @prog #include "avx512f-sg.h" #include "avx512vl-sg.h" #include "avx512bw.h" +#include "avx512bw-gf.h" #include "avx512dq.h" #include "avx512er.h" #include "avx512vbmi.h" @@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi return cpu_has_avx512_vbmi && cpu_has_avx512vl; } +static bool simd_check_sse2_gf(void) +{ + return cpu_has_gfni && cpu_has_sse2; +} + +static bool simd_check_avx2_gf(void) +{ + return cpu_has_gfni && cpu_has_avx2; +} + +static bool simd_check_avx512bw_gf(void) +{ + return cpu_has_gfni && cpu_has_avx512bw; +} + +static bool simd_check_avx512bw_gf_vl(void) +{ + return cpu_has_gfni && cpu_has_avx512vl; +} + static void simd_set_regs(struct cpu_user_regs *regs) { if ( cpu_has_mmx ) @@ -395,6 +418,12 @@ static const struct { AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2), AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2), AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2), + SIMD(GFNI (legacy), sse2_gf, 16), + SIMD(GFNI (VEX/x16), avx2_gf, 16), + SIMD(GFNI (VEX/x32), avx2_gf, 32), + SIMD(GFNI (EVEX/x64), avx512bw_gf, 64), + AVX512VL(VL+GFNI (x16), avx512bw_gf, 16), + AVX512VL(VL+GFNI (x32), avx512bw_gf, 32), #undef AVX512VL_ #undef AVX512VL #undef SIMD_ --- a/tools/tests/x86_emulator/x86-emulate.h +++ b/tools/tests/x86_emulator/x86-emulate.h @@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma #define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6)) #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6)) #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6)) +#define cpu_has_gfni cp.feat.gfni #define cpu_has_vaes (cp.feat.vaes && xcr0_mask(6)) #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6)) #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6)) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -540,6 +540,7 @@ static const struct ext0f38_table { [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq }, + [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0xf0] = { .two_op = 1 }, @@ -619,6 +620,7 @@ static const struct ext0f3a_table { [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 }, [0xcc] = { .simd_size = simd_other }, + [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xf0] = {}, }; @@ -1922,6 +1924,7 @@ static bool vcpu_has( #define vcpu_has_avx512vl() vcpu_has( 7, EBX, 31, ctxt, ops) #define vcpu_has_avx512_vbmi() vcpu_has( 7, ECX, 1, ctxt, ops) #define vcpu_has_avx512_vbmi2() vcpu_has( 7, ECX, 6, ctxt, ops) +#define vcpu_has_gfni() vcpu_has( 7, ECX, 8, ctxt, ops) #define vcpu_has_vaes() vcpu_has( 7, ECX, 9, ctxt, ops) #define vcpu_has_vpclmulqdq() vcpu_has( 7, ECX, 10, ctxt, ops) #define vcpu_has_avx512_vnni() vcpu_has( 7, ECX, 11, ctxt, ops) @@ -9627,6 +9630,21 @@ x86_emulate( host_and_vcpu_must_have(avx512er); goto simd_zmm_scalar_sae; + case X86EMUL_OPC_66(0x0f38, 0xcf): /* gf2p8mulb xmm/m128,xmm */ + host_and_vcpu_must_have(gfni); + goto simd_0f38_common; + + case X86EMUL_OPC_VEX_66(0x0f38, 0xcf): /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */ + host_and_vcpu_must_have(gfni); + generate_exception_if(vex.w, EXC_UD); + goto simd_0f_avx; + + case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + host_and_vcpu_must_have(gfni); + generate_exception_if(evex.w || evex.brs, EXC_UD); + elem_bytes = 1; + goto avx512f_no_sae; + case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */ @@ -10372,6 +10390,24 @@ x86_emulate( op_bytes = 16; goto simd_0f3a_common; + case X86EMUL_OPC_66(0x0f3a, 0xce): /* gf2p8affineqb $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_66(0x0f3a, 0xcf): /* gf2p8affineinvqb $imm8,xmm/m128,xmm */ + host_and_vcpu_must_have(gfni); + goto simd_0f3a_common; + + case X86EMUL_OPC_VEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */ + host_and_vcpu_must_have(gfni); + generate_exception_if(!vex.w, EXC_UD); + goto simd_0f_imm8_avx; + + case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + host_and_vcpu_must_have(gfni); + generate_exception_if(!evex.w, EXC_UD); + fault_suppression = false; + goto avx512f_imm8_no_sae; + case X86EMUL_OPC_66(0x0f3a, 0xdf): /* aeskeygenassist $imm8,xmm/m128,xmm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */ host_and_vcpu_must_have(aesni); --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -113,6 +113,7 @@ /* CPUID level 0x00000007:0.ecx */ #define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI) #define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2) +#define cpu_has_gfni boot_cpu_has(X86_FEATURE_GFNI) #define cpu_has_vaes boot_cpu_has(X86_FEATURE_VAES) #define cpu_has_vpclmulqdq boot_cpu_has(X86_FEATURE_VPCLMULQDQ) #define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI) --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) / XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */ XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */ XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */ +XEN_CPUFEATURE(GFNI, 6*32+ 8) /*A Galois Field Instrs */ XEN_CPUFEATURE(VAES, 6*32+ 9) /*A Vector AES Instrs */ XEN_CPUFEATURE(VPCLMULQDQ, 6*32+10) /*A Vector Carry-less Multiplication Instrs */ XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -200,7 +200,7 @@ def crunch_numbers(state): AESNI, SHA], # SSE2 was re-specified as core instructions for 64bit. - SSE2: [LM], + SSE2: [LM, GFNI], # SSE4.1 explicitly depends on SSE3 and SSSE3 SSE3: [SSE4_1], _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.