[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen master] x86emul: support GFNI insns



commit 15a3314bd68319eecd42e9fb76c555031d400525
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Wed Jul 17 15:43:06 2019 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Wed Jul 17 15:43:06 2019 +0200

    x86emul: support GFNI insns
    
    As to the feature dependency adjustment, while strictly speaking SSE is
    a sufficient prereq (to have XMM registers), vectors of bytes and qwords
    have got introduced only with SSE2. gcc, for example, uses a similar
    connection in its respective intrinsics header.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Acked-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 tools/tests/x86_emulator/Makefile            | 13 ++++-
 tools/tests/x86_emulator/evex-disp8.c        |  7 +++
 tools/tests/x86_emulator/simd-gf.c           | 80 ++++++++++++++++++++++++++++
 tools/tests/x86_emulator/simd.h              |  1 +
 tools/tests/x86_emulator/test_x86_emulator.c | 29 ++++++++++
 tools/tests/x86_emulator/x86-emulate.h       |  1 +
 xen/arch/x86/x86_emulate/x86_emulate.c       | 36 +++++++++++++
 xen/include/asm-x86/cpufeature.h             |  1 +
 xen/include/public/arch-x86/cpufeatureset.h  |  1 +
 xen/tools/gen-cpuid.py                       |  2 +-
 10 files changed, 168 insertions(+), 3 deletions(-)

diff --git a/tools/tests/x86_emulator/Makefile 
b/tools/tests/x86_emulator/Makefile
index 3cfeb6713f..04c169f0cd 100644
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,8 @@ CFLAGS += $(CFLAGS_xeninclude)
 SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er 
avx512vbmi
 FMA := fma4 fma
 SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
 
 OPMASK := avx512f avx512dq avx512bw
 
@@ -142,12 +143,17 @@ $(1)-cflags := \
           $(foreach flt,$($(1)-flts), \
             "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os 
-DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+                "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os 
-DVEC_SIZE=$(vec)")
+endef
 define opmask-defs
 $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os 
-DSIZE=$(vec)")
 endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
 $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@ $(addsuffix .c,$(FMA)):
 $(addsuffix .c,$(SG)):
        ln -sf simd-sg.c $@
 
-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+       ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
 
 xop.h avx512f.h: simd-fma.c
 
diff --git a/tools/tests/x86_emulator/evex-disp8.c 
b/tools/tests/x86_emulator/evex-disp8.c
index 5cc985a940..d948fdb22b 100644
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,12 @@ static const struct test avx512_vpopcntdq_all[] = {
     INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
 };
 
+static const struct test gfni_all[] = {
+    INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+    INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
+    INSN(gf2p8mulb,        66, 0f38, cf, vl, b, vl),
+};
+
 /*
  * The uses of b in this table are simply (one of) the shortest form(s) of
  * saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@ void evex_disp8_test(void *instr, struct x86_emulate_ctxt 
*ctxt,
 
     if ( cpu_has_avx512f )
     {
+        RUN(gfni, all);
         RUN(vaes, all);
         RUN(vpclmulqdq, all);
     }
diff --git a/tools/tests/x86_emulator/simd-gf.c 
b/tools/tests/x86_emulator/simd-gf.c
new file mode 100644
index 0000000000..6a9e966c26
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-gf.c
@@ -0,0 +1,80 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+    vec_t t_; \
+    asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], 
%[dst]" \
+          : [dst] "=v" (t_) \
+          : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE 
/ 8) ); \
+    t_; \
+})
+#else
+# if defined(__AVX2__)
+#  define bcstq(x) ({ \
+    vdi_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+    t_; \
+})
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+#  define bcstq(x) ((vdi_t){x, x})
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+    vdi_t m_ = bcstq(m); \
+    touch(m_); \
+    ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+    unsigned int i;
+    vec_t src, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i;
+        one[i] = 1;
+    }
+
+    /* Special case for first iteration. */
+    one[0] = 0;
+
+    do {
+        vec_t inv = transform(ident, inv, src, 0);
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+        one[0] = 1;
+
+        src += ELEM_COUNT;
+        i += ELEM_COUNT;
+    } while ( i < 256 );
+
+    return 0;
+}
diff --git a/tools/tests/x86_emulator/simd.h b/tools/tests/x86_emulator/simd.h
index 950c52281b..19b3444d87 100644
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -371,6 +371,7 @@ OVR(cvttsd2siq);
 OVR(cvttss2si);
 OVR(cvttss2sil);
 OVR(cvttss2siq);
+OVR(gf2p8mulb);
 OVR(movddup);
 OVR(movntdq);
 OVR(movntdqa);
diff --git a/tools/tests/x86_emulator/test_x86_emulator.c 
b/tools/tests/x86_emulator/test_x86_emulator.c
index 25f13fc269..8934ec509f 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,12 +11,14 @@ asm ( ".pushsection .test, \"ax\", @progbits; .popsection" 
);
 #include "3dnow.h"
 #include "sse.h"
 #include "sse2.h"
+#include "sse2-gf.h"
 #include "sse4.h"
 #include "avx.h"
 #include "fma4.h"
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "avx2-gf.h"
 #include "xop.h"
 #include "avx512f-opmask.h"
 #include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @progbits; .popsection" );
 #include "avx512f-sg.h"
 #include "avx512vl-sg.h"
 #include "avx512bw.h"
+#include "avx512bw-gf.h"
 #include "avx512dq.h"
 #include "avx512er.h"
 #include "avx512vbmi.h"
@@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(void)
     return cpu_has_avx512_vbmi && cpu_has_avx512vl;
 }
 
+static bool simd_check_sse2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+    return cpu_has_gfni && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -395,6 +418,12 @@ static const struct {
     AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
     AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
     AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(GFNI (legacy),       sse2_gf,        16),
+    SIMD(GFNI (VEX/x16),      avx2_gf,        16),
+    SIMD(GFNI (VEX/x32),      avx2_gf,        32),
+    SIMD(GFNI (EVEX/x64), avx512bw_gf,        64),
+    AVX512VL(VL+GFNI (x16), avx512bw_gf,      16),
+    AVX512VL(VL+GFNI (x32), avx512bw_gf,      32),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_
diff --git a/tools/tests/x86_emulator/x86-emulate.h 
b/tools/tests/x86_emulator/x86-emulate.h
index bbb60bc65e..586319a5c5 100644
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t mask)
 #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
 #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
 #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni       cp.feat.gfni
 #define cpu_has_vaes      (cp.feat.vaes && xcr0_mask(6))
 #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
 #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c 
b/xen/arch/x86/x86_emulate/x86_emulate.c
index 245c900351..136e236010 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -540,6 +540,7 @@ static const struct ext0f38_table {
     [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@ static const struct ext0f3a_table {
     [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
+    [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
 };
@@ -1890,6 +1892,7 @@ in_protmode(
 #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
 #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
 #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_gfni()        (ctxt->cpuid->feat.gfni)
 #define vcpu_has_vaes()        (ctxt->cpuid->feat.vaes)
 #define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
 #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
@@ -9640,6 +9643,21 @@ x86_emulate(
         host_and_vcpu_must_have(avx512er);
         goto simd_zmm_scalar_sae;
 
+    case X86EMUL_OPC_66(0x0f38, 0xcf):      /* gf2p8mulb xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xcf):  /* vgf2p8mulb 
{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+        elem_bytes = 1;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdc):  /* vaesenc 
{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdd):  /* vaesenclast 
{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xde):  /* vaesdec 
{x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10383,6 +10401,24 @@ x86_emulate(
         op_bytes = 16;
         goto simd_0f3a_common;
 
+    case X86EMUL_OPC_66(0x0f3a, 0xce):      /* gf2p8affineqb 
$imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0xcf):      /* gf2p8affineinvqb 
$imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xce):  /* vgf2p8affineqb 
$imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf):  /* vgf2p8affineinvqb 
$imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb 
$imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb 
$imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
     case X86EMUL_OPC_66(0x0f3a, 0xdf):     /* aeskeygenassist 
$imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist 
$imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(aesni);
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 85f8382eb0..906dd59c4b 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
 /* CPUID level 0x00000007:0.ecx */
 #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
 #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni            boot_cpu_has(X86_FEATURE_GFNI)
 #define cpu_has_vaes            boot_cpu_has(X86_FEATURE_VAES)
 #define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
 #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
diff --git a/xen/include/public/arch-x86/cpufeatureset.h 
b/xen/include/public/arch-x86/cpufeatureset.h
index 676722fc59..e2c82a4554 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP,          6*32+ 2) /*S  User Mode 
Instruction Prevention */
 XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
 XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte 
Manipulation Instrs */
+XEN_CPUFEATURE(GFNI,          6*32+ 8) /*A  Galois Field Instrs */
 XEN_CPUFEATURE(VAES,          6*32+ 9) /*A  Vector AES Instrs */
 XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication 
Instrs */
 XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index e5ac93304d..836b010751 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -201,7 +201,7 @@ def crunch_numbers(state):
         # SSE2 was re-specified as core instructions for 64bit.  Also ISA
         # extensions dealing with vectors of integers are added here rather
         # than to SSE.
-        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA],
+        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA, GFNI],
 
         # Other SSEn each depend on their predecessor versions.
         SSE3: [SSSE3],
--
generated by git-patchbot for /home/xen/git/xen.git#master

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.