[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Xen-devel] [PATCH v4 28/44] x86emul: support AVX512{F, BW} down conversion moves
Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are slightly different from what one would expect, due to them requiring EVEX.W to be zero. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v4: Also #UD when evex.z is set with a memory operand. v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -166,11 +166,26 @@ static const struct test avx512f_all[] = INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl), INSN(pmins, 66, 0f38, 39, vl, dq, vl), INSN(pminu, 66, 0f38, 3b, vl, dq, vl), + INSN(pmovdb, f3, 0f38, 31, vl_4, b, vl), + INSN(pmovdw, f3, 0f38, 33, vl_2, b, vl), + INSN(pmovqb, f3, 0f38, 32, vl_8, b, vl), + INSN(pmovqd, f3, 0f38, 35, vl_2, d_nb, vl), + INSN(pmovqw, f3, 0f38, 34, vl_4, b, vl), + INSN(pmovsdb, f3, 0f38, 21, vl_4, b, vl), + INSN(pmovsdw, f3, 0f38, 23, vl_2, b, vl), + INSN(pmovsqb, f3, 0f38, 22, vl_8, b, vl), + INSN(pmovsqd, f3, 0f38, 25, vl_2, d_nb, vl), + INSN(pmovsqw, f3, 0f38, 24, vl_4, b, vl), INSN(pmovsxbd, 66, 0f38, 21, vl_4, b, vl), INSN(pmovsxbq, 66, 0f38, 22, vl_8, b, vl), INSN(pmovsxwd, 66, 0f38, 23, vl_2, w, vl), INSN(pmovsxwq, 66, 0f38, 24, vl_4, w, vl), INSN(pmovsxdq, 66, 0f38, 25, vl_2, d_nb, vl), + INSN(pmovusdb, f3, 0f38, 11, vl_4, b, vl), + INSN(pmovusdw, f3, 0f38, 13, vl_2, b, vl), + INSN(pmovusqb, f3, 0f38, 12, vl_8, b, vl), + INSN(pmovusqd, f3, 0f38, 15, vl_2, d_nb, vl), + INSN(pmovusqw, f3, 0f38, 14, vl_4, b, vl), INSN(pmovzxbd, 66, 0f38, 31, vl_4, b, vl), INSN(pmovzxbq, 66, 0f38, 32, vl_8, b, vl), INSN(pmovzxwd, 66, 0f38, 33, vl_2, w, vl), @@ -273,7 +288,10 @@ static const struct test avx512bw_all[] INSN(pminsw, 66, 0f, ea, vl, w, vl), INSN(pminub, 66, 0f, da, vl, b, vl), INSN(pminuw, 66, 0f38, 3a, vl, w, vl), + INSN(pmovswb, f3, 0f38, 20, vl_2, b, vl), INSN(pmovsxbw, 66, 0f38, 20, vl_2, b, vl), + INSN(pmovuswb, f3, 0f38, 10, vl_2, b, vl), + INSN(pmovwb, f3, 0f38, 30, vl_2, b, vl), INSN(pmovzxbw, 66, 0f38, 30, vl_2, b, vl), INSN(pmulhuw, 66, 0f, e4, vl, w, vl), INSN(pmulhw, 
66, 0f, e5, vl, w, vl), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -271,6 +271,17 @@ static inline bool _to_bool(byte_vec_t b #endif #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \ defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__)) +# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \ + (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \ + (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */ +# define low_half(x) ({ \ + half_t t_; \ + asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \ + : [d] "=m" (t_) \ + : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \ + t_; \ +}) +# endif # if INT_SIZE == 4 || UINT_SIZE == 4 # define broadcast(x) ({ \ vec_t t_; \ @@ -285,6 +296,7 @@ static inline bool _to_bool(byte_vec_t b }) # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) +# define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0)) # elif INT_SIZE == 8 || UINT_SIZE == 8 # define broadcast(x) ({ \ vec_t t_; \ @@ -714,6 +726,27 @@ static inline bool _to_bool(byte_vec_t b # endif #endif +#if VEC_SIZE >= 16 + +# if !defined(low_half) && defined(HALF_SIZE) +static inline half_t low_half(vec_t x) +{ +# if HALF_SIZE < VEC_SIZE + half_t y; + unsigned int i; + + for ( i = 0; i < ELEM_COUNT / 2; ++i ) + y[i] = x[i]; + + return y; +# else + return x; +# endif +} +# endif + +#endif + #if defined(__AVX512F__) && defined(FLOAT_SIZE) # include "simd-fma.c" #endif @@ -1081,6 +1114,21 @@ int simd_test(void) #endif +#if defined(widen1) && defined(shrink1) + { + half_t aux1 = low_half(src), aux2; + + touch(aux1); + x = widen1(aux1); + touch(x); + aux2 = shrink1(x); + touch(aux2); + for ( i = 0; i < ELEM_COUNT / 2; ++i ) + if ( aux2[i] != src[i] ) + return __LINE__; + } +#endif + #ifdef dup_lo touch(src); x = dup_lo(src); --- 
a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -70,6 +70,23 @@ typedef int __attribute__((vector_size(V typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; #endif +#if VEC_SIZE >= 16 + +# if ELEM_COUNT >= 2 +# if VEC_SIZE > 32 +# define HALF_SIZE (VEC_SIZE / 2) +# else +# define HALF_SIZE 16 +# endif +typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t; +typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t; +typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t; +typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t; +typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t; +# endif + +#endif + #if VEC_SIZE == 16 # define B(n, s, a...) __builtin_ia32_ ## n ## 128 ## s(a) # define B_(n, s, a...) __builtin_ia32_ ## n ## s(a) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -3056,7 +3056,22 @@ x86_decode( d |= vSIB; state->simd_size = ext0f38_table[b].simd_size; if ( evex_encoded() ) - disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state); + { + /* + * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but + * their attributes don't match those of the vex_66 encoded + * insns with the same base opcodes. Rather than adding new + * columns to the table, handle this here for now. 
+ */ + if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 ) + disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state); + else + { + disp8scale = decode_disp8scale(ext0f38_table[b + 0x10].d8s, + state); + state->simd_size = simd_other; + } + } break; case ext_0f3a: @@ -8318,10 +8333,14 @@ x86_emulate( op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l); goto simd_0f_int; + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */ host_and_vcpu_must_have(avx512bw); - /* fall through */ + if ( evex.pfx != vex_f3 ) + { case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */ @@ -8332,7 +8351,28 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */ - generate_exception_if(evex.w && (b & 7) == 5, EXC_UD); + generate_exception_if(evex.w && (b & 7) == 5, EXC_UD); + } + else + { + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* 
vpmovsqb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */ + generate_exception_if(evex.w || (ea.type == OP_MEM && evex.z), EXC_UD); + d = DstMem | SrcReg | TwoOp; + } op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr); elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4; goto avx512f_no_sae; _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |