[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v6 10/42] x86emul: support AVX512{F, BW, _VBMI} full permute insns
Take the liberty and also correct the (public interface) name of the AVX512_VBMI feature flag, on the assumption that no external consumer has actually been using that flag so far. Furthermore make it have AVX512BW instead of AVX512F as a prerequisite, for requiring full 64-bit mask registers (the upper 48 bits of which can't be accessed other than through XSAVE/XRSTOR without AVX512BW support). Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v5: Re-base. v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -173,6 +173,10 @@ static const struct test avx512f_all[] = INSN(pcmpgtd, 66, 0f, 66, vl, d, vl), INSN(pcmpgtq, 66, 0f38, 37, vl, q, vl), INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl), + INSN(permi2, 66, 0f38, 76, vl, dq, vl), + INSN(permi2, 66, 0f38, 77, vl, sd, vl), + INSN(permt2, 66, 0f38, 7e, vl, dq, vl), + INSN(permt2, 66, 0f38, 7f, vl, sd, vl), INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl), INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl), INSN(pmins, 66, 0f38, 39, vl, dq, vl), @@ -294,6 +298,8 @@ static const struct test avx512bw_all[] INSN(pcmpgtb, 66, 0f, 64, vl, b, vl), INSN(pcmpgtw, 66, 0f, 65, vl, w, vl), INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl), + INSN(permi2w, 66, 0f38, 75, vl, w, vl), + INSN(permt2w, 66, 0f38, 7d, vl, w, vl), INSN(pmaddwd, 66, 0f, f5, vl, w, vl), INSN(pmaxsb, 66, 0f38, 3c, vl, b, vl), INSN(pmaxsw, 66, 0f, ee, vl, w, vl), @@ -378,6 +384,11 @@ static const struct test avx512dq_512[] INSN(inserti32x8, 66, 0f3a, 3a, el_8, d, vl), }; +static const struct test avx512_vbmi_all[] = { + INSN(permi2b, 66, 0f38, 75, vl, b, vl), + INSN(permt2b, 66, 0f38, 7d, vl, b, vl), +}; + static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 }; static const unsigned char vl_128[] = { VL_128 }; static const unsigned char vl_no128[] = { VL_512, VL_256 }; @@ -718,4 +729,5 @@ void evex_disp8_test(void *instr, struct RUN(avx512dq, 128); RUN(avx512dq, no128); RUN(avx512dq, 512); + RUN(avx512_vbmi, all); } --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -150,6 +150,9 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0) # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0) +# else +# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0) +# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0) # endif # elif FLOAT_SIZE == 8 # if VEC_SIZE >= 32 @@ -175,6 +178,9 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0) # define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0) +# else +# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0) +# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0) # endif # endif #elif FLOAT_SIZE == 4 && defined(__SSE__) @@ -303,6 +309,9 @@ static inline bool _to_bool(byte_vec_t b # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) +# else +# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0)) +# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) @@ -324,6 +333,9 @@ static inline bool _to_bool(byte_vec_t b # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) +# else +# define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0)) +# define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) # endif @@ -769,6 +781,7 @@ int simd_test(void) { unsigned int i, j; vec_t x, y, z, src, inv, alt, sh; + vint_t interleave_lo, interleave_hi; for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i ) { @@ -782,6 +795,9 @@ int simd_test(void) if ( !(i & (i + 1)) ) --j; sh[i] = j; + + interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1); + interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2); } touch(src); @@ -1075,7 +1091,7 @@ int simd_test(void) x = src * alt; y = interleave_lo(x, alt < 0); touch(x); - z = widen1(x); + z = widen1(low_half(x)); touch(x); if ( !eq(z, y) ) return __LINE__; @@ -1107,7 +1123,7 @@ int simd_test(void) # ifdef widen1 touch(src); - x = widen1(src); + x = widen1(low_half(src)); touch(src); if ( !eq(x, y) ) return __LINE__; # endif --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -70,6 +70,16 @@ typedef int __attribute__((vector_size(V typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; #endif +#if ELEM_SIZE == 1 +typedef vqi_t vint_t; +#elif ELEM_SIZE == 2 +typedef vhi_t vint_t; +#elif ELEM_SIZE == 4 +typedef vsi_t vint_t; +#elif ELEM_SIZE == 8 +typedef vdi_t vint_t; +#endif + #if VEC_SIZE >= 16 # if ELEM_COUNT >= 2 --- a/tools/tests/x86_emulator/x86-emulate.h +++ b/tools/tests/x86_emulator/x86-emulate.h @@ -136,6 +136,7 @@ static inline bool xcr0_mask(uint64_t ma #define cpu_has_avx512dq (cp.feat.avx512dq && xcr0_mask(0xe6)) #define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6)) #define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6)) +#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6)) #define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -468,9 +468,13 @@ static const struct ext0f38_table { [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 }, [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 }, [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 }, + [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x78] = { .simd_size = simd_other, .two_op = 1 }, [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 }, [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 }, + [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x8c] = { .simd_size = simd_packed_int }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, @@ -1861,6 +1865,7 @@ static bool vcpu_has( #define vcpu_has_sha() vcpu_has( 7, EBX, 29, ctxt, ops) #define vcpu_has_avx512bw() vcpu_has( 7, EBX, 30, ctxt, ops) #define vcpu_has_avx512vl() vcpu_has( 7, EBX, 31, ctxt, ops) +#define vcpu_has_avx512_vbmi() vcpu_has( 7, ECX, 1, ctxt, ops) #define vcpu_has_rdpid() vcpu_has( 7, ECX, 22, ctxt, ops) #define vcpu_has_clzero() vcpu_has(0x80000008, EBX, 0, ctxt, ops) @@ -6027,6 +6032,11 @@ x86_emulate( CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK), EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x76): /* vpermi2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x77): /* vpermi2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7e): /* vpermt2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7f): /* vpermt2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ fault_suppression = false; /* fall through */ case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ @@ -8539,6 +8549,16 @@ x86_emulate( generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + if ( !evex.w ) + host_and_vcpu_must_have(avx512_vbmi); + else + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.br, EXC_UD); + fault_suppression = false; + goto avx512f_no_sae; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */ host_and_vcpu_must_have(avx512bw); --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -107,6 +107,9 @@ #define cpu_has_avx512bw boot_cpu_has(X86_FEATURE_AVX512BW) #define cpu_has_avx512vl boot_cpu_has(X86_FEATURE_AVX512VL) +/* CPUID level 0x00000007:0.ecx */ +#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI) + /* CPUID level 0x80000007.edx */ #define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC) --- a/xen/include/public/arch-x86/cpufeatureset.h +++ b/xen/include/public/arch-x86/cpufeatureset.h @@ -224,7 +224,7 @@ XEN_CPUFEATURE(AVX512VL, 5*32+31) / /* Intel-defined CPU features, CPUID level 0x00000007:0.ecx, word 6 */ XEN_CPUFEATURE(PREFETCHWT1, 6*32+ 0) /*A PREFETCHWT1 instruction */ -XEN_CPUFEATURE(AVX512VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */ +XEN_CPUFEATURE(AVX512_VBMI, 6*32+ 1) /*A AVX-512 Vector Byte Manipulation Instrs */ XEN_CPUFEATURE(UMIP, 6*32+ 2) /*S User Mode Instruction Prevention */ XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */ XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */ --- a/xen/tools/gen-cpuid.py +++ b/xen/tools/gen-cpuid.py @@ -259,12 +259,17 @@ def crunch_numbers(state): AVX2: [AVX512F], # AVX512F is taken to mean hardware support for 512bit registers - # (which in practice depends on the EVEX prefix to encode), and the - # instructions themselves. All further AVX512 features are built on - # top of AVX512F + # (which in practice depends on the EVEX prefix to encode) as well + # as mask registers, and the instructions themselves. All further + # AVX512 features are built on top of AVX512F AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD, - AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW, - AVX512_4FMAPS, AVX512_VPOPCNTDQ], + AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS, + AVX512_VPOPCNTDQ], + + # AVX512 extensions acting solely on vectors of bytes/words are made + # dependents of AVX512BW (as to requiring wider than 16-bit mask + # registers), despite the SDM not formally making this connection. + AVX512BW: [AVX512_VBMI], # The features: # * Single Thread Indirect Branch Predictors _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |