[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen staging] x86emul: support AVX512{F, BW, _VBMI} full permute insns
commit 597dcb7df00dc8705d8f81ed803d64ac4cbfe7de Author: Jan Beulich <jbeulich@xxxxxxxx> AuthorDate: Tue May 21 08:27:16 2019 +0200 Commit: Jan Beulich <jbeulich@xxxxxxxx> CommitDate: Tue May 21 08:27:16 2019 +0200 x86emul: support AVX512{F,BW,_VBMI} full permute insns Take the liberty and also correct the (public interface) name of the AVX512_VBMI feature flag, on the assumption that no external consumer has actually been using that flag so far. Furthermore make it have AVX512BW instead of AVX512F as a prerequisite, for requiring full 64-bit mask registers (the upper 48 bits of which can't be accessed other than through XSAVE/XRSTOR without AVX512BW support). Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Acked-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> --- tools/tests/x86_emulator/evex-disp8.c | 12 ++++++++++++ tools/tests/x86_emulator/simd.c | 20 ++++++++++++++++++-- tools/tests/x86_emulator/simd.h | 10 ++++++++++ tools/tests/x86_emulator/x86-emulate.h | 1 + xen/arch/x86/x86_emulate/x86_emulate.c | 20 ++++++++++++++++++++ xen/include/asm-x86/cpufeature.h | 1 + xen/include/public/arch-x86/cpufeatureset.h | 2 +- xen/tools/gen-cpuid.py | 15 ++++++++++----- 8 files changed, 73 insertions(+), 8 deletions(-) diff --git a/tools/tests/x86_emulator/evex-disp8.c b/tools/tests/x86_emulator/evex-disp8.c index bdc597ecd5..8b2a7723db 100644 --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -173,6 +173,10 @@ static const struct test avx512f_all[] = { INSN(pcmpgtd, 66, 0f, 66, vl, d, vl), INSN(pcmpgtq, 66, 0f38, 37, vl, q, vl), INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl), + INSN(permi2, 66, 0f38, 76, vl, dq, vl), + INSN(permi2, 66, 0f38, 77, vl, sd, vl), + INSN(permt2, 66, 0f38, 7e, vl, dq, vl), + INSN(permt2, 66, 0f38, 7f, vl, sd, vl), INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl), INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl), INSN(pmins, 66, 0f38, 39, vl, dq, vl), @@ -294,6 +298,8 @@ static const struct test avx512bw_all[] = { INSN(pcmpgtb, 66, 0f, 64, vl, b, vl), INSN(pcmpgtw, 66, 0f, 65, vl, w, vl), INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl), + INSN(permi2w, 66, 0f38, 75, vl, w, vl), + INSN(permt2w, 66, 0f38, 7d, vl, w, vl), INSN(pmaddwd, 66, 0f, f5, vl, w, vl), INSN(pmaxsb, 66, 0f38, 3c, vl, b, vl), INSN(pmaxsw, 66, 0f, ee, vl, w, vl), @@ -378,6 +384,11 @@ static const struct test avx512dq_512[] = { INSN(inserti32x8, 66, 0f3a, 3a, el_8, d, vl), }; +static const struct test avx512_vbmi_all[] = { + INSN(permi2b, 66, 0f38, 75, vl, b, vl), + INSN(permt2b, 66, 0f38, 7d, vl, b, vl), +}; + static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 }; static const unsigned char vl_128[] = { VL_128 }; static const unsigned char vl_no128[] = { VL_512, VL_256 }; @@ -718,4 +729,5 @@ void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt, RUN(avx512dq, 128); RUN(avx512dq, no128); RUN(avx512dq, 512); + RUN(avx512_vbmi, all); } diff --git a/tools/tests/x86_emulator/simd.c b/tools/tests/x86_emulator/simd.c index 47622c6a59..f26df32644 100644 --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -150,6 +150,9 @@ static inline bool _to_bool(byte_vec_t bv) # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0) # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0) +# else +# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0) +# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0) # endif # elif FLOAT_SIZE == 8 # if VEC_SIZE >= 32 @@ -175,6 +178,9 @@ static inline bool _to_bool(byte_vec_t bv) # define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0) # define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0) +# else +# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0) +# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0) # endif # endif #elif FLOAT_SIZE == 4 && defined(__SSE__) @@ -303,6 +309,9 @@ static inline bool _to_bool(byte_vec_t bv) # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) +# else +# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0)) +# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) @@ -324,6 +333,9 @@ static inline bool _to_bool(byte_vec_t bv) # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) +# else +# define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0)) +# define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) # endif @@ -769,6 +781,7 @@ int simd_test(void) { unsigned int i, j; vec_t x, y, z, src, inv, alt, sh; + vint_t interleave_lo, interleave_hi; for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i ) { @@ -782,6 +795,9 @@ int simd_test(void) if ( !(i & (i + 1)) ) --j; sh[i] = j; + + interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1); + interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2); } touch(src); @@ -1075,7 +1091,7 @@ int simd_test(void) x = src * alt; y = interleave_lo(x, alt < 0); touch(x); - z = widen1(x); + z = widen1(low_half(x)); touch(x); if ( !eq(z, y) ) return __LINE__; @@ -1107,7 +1123,7 @@ int simd_test(void) # ifdef widen1 touch(src); - x = widen1(src); + x = widen1(low_half(src)); touch(src); if ( !eq(x, y) ) return __LINE__; # endif diff --git a/tools/tests/x86_emulator/simd.h b/tools/tests/x86_emulator/simd.h index 8c5a419f46..1f43dffcff 100644 --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -70,6 +70,16 @@ typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t; typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; #endif +#if ELEM_SIZE == 1 +typedef vqi_t vint_t; +#elif ELEM_SIZE == 2 +typedef vhi_t vint_t; +#elif ELEM_SIZE == 4 +typedef vsi_t vint_t; +#elif ELEM_SIZE == 8 +typedef vdi_t vint_t; +#endif + #if VEC_SIZE >= 16 # if ELEM_COUNT >= 2 diff --git a/tools/tests/x86_emulator/x86-emulate.h b/tools/tests/x86_emulator/x86-emulate.h index cb7cc3cd18..65ecb3c167 100644 --- a/tools/tests/x86_emulator/x86-emulate.h +++ b/tools/tests/x86_emulator/x86-emulate.h @@ -136,6 +136,7 @@ static inline bool xcr0_mask(uint64_t mask) #define cpu_has_avx512dq (cp.feat.avx512dq && xcr0_mask(0xe6)) #define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6)) #define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6)) +#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6)) #define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1) diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c index 6ea189fe89..b801992324 100644 --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -468,9 +468,13 @@ static const struct ext0f38_table { [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 }, [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 }, [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 }, + [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x78] = { .simd_size = simd_other, .two_op = 1 }, [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 }, [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 }, + [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x8c] = { .simd_size = simd_packed_int }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, @@ -1829,6 +1833,7 @@ in_protmode( #define vcpu_has_sha() (ctxt->cpuid->feat.sha) #define vcpu_has_avx512bw() (ctxt->cpuid->feat.avx512bw) #define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl) +#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi) #define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid) #define vcpu_must_have(feat) \ @@ -6017,6 +6022,11 @@ x86_emulate( CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK), EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x76): /* vpermi2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x77): /* vpermi2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7e): /* vpermt2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7f): /* vpermt2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ fault_suppression = false; /* fall through */ case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ @@ -8540,6 +8550,16 @@ x86_emulate( generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + if ( !evex.w ) + host_and_vcpu_must_have(avx512_vbmi); + else + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.brs, EXC_UD); + fault_suppression = false; + goto avx512f_no_sae; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */ host_and_vcpu_must_have(avx512bw); diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h index 745801f3c0..bf5f37ec36 100644 --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -107,6 +107,7 @@ #define cpu_has_avx512vl boot_cpu_has(X86_FEATURE_AVX512VL) /* CPUID level 0x00000007:0.ecx */ +#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI) #define cpu_has_rdpid boot_cpu_has(X86_FEATURE_RDPID) /* CPUID level 0x80000007.edx */ diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h index 55231d4b3b..cf28118cab 100644 --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -224,7 +224,7 @@ XEN_CPUFEATURE(AVX512VL, 5*32+31) /*A AVX-512 Vector Length Extensions */ /* Intel-defined CPU features, CPUID level 0x00000007:0.ecx, word 6 */ XEN_CPUFEATURE(PREFETCHWT1, 6*32+ 0) /*A PREFETCHWT1 instruction */ -XEN_CPUFEATURE(AVX512VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */ +XEN_CPUFEATURE(AVX512_VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */ XEN_CPUFEATURE(UMIP, 6*32+ 2) /*S User Mode Instruction Prevention */ XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */ XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */ diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py index f8bd686852..67ec54b183 100755 --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -260,12 +260,17 @@ def crunch_numbers(state): AVX2: [AVX512F], # AVX512F is taken to mean hardware support for 512bit registers - # (which in practice depends on the EVEX prefix to encode), and the - # instructions themselves. All further AVX512 features are built on - # top of AVX512F + # (which in practice depends on the EVEX prefix to encode) as well + # as mask registers, and the instructions themselves. All further + # AVX512 features are built on top of AVX512F AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD, - AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW, - AVX512_4FMAPS, AVX512_VPOPCNTDQ], + AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS, + AVX512_VPOPCNTDQ], + + # AVX512 extensions acting solely on vectors of bytes/words are made + # dependents of AVX512BW (as to requiring wider than 16-bit mask + # registers), despite the SDM not formally making this connection. + AVX512BW: [AVX512_VBMI], # The features: # * Single Thread Indirect Branch Predictors -- generated by git-patchbot for /home/xen/git/xen.git#staging _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |