Xen project Mailing List

[Xen-devel] [PATCH v4 44/44] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns

To: "xen-devel" <xen-devel@xxxxxxxxxxxxxxxxxxxx>

From: "Jan Beulich" <JBeulich@xxxxxxxx>

Date: Tue, 25 Sep 2018 07:55:51 -0600

Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wei.liu2@xxxxxxxxxx>

Delivery-date: Tue, 25 Sep 2018 13:56:03 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Along the lines of prior patches, VCVT{,T}PS2UQQ as well as
VCVT{,T}S{S,D}2USI need "manual" overrides of disp8scale. The
twobyte_table[] entries get altered, with their prior values now put in
place in x86_decode_twobyte().

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -101,21 +101,29 @@ static const struct test avx512f_all[] =
     INSN(cvtdq2pd, f3, 0f, e6, vl_2, d, vl),
     INSN(cvtdq2ps, , 0f, 5b, vl, d, vl),
     INSN(cvtpd2dq, f2, 0f, e6, vl, q, vl),
+    INSN(cvtpd2udq, , 0f, 79, vl, q, vl),
     INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl),
     INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl),
     INSN(cvtps2dq, 66, 0f, 5b, vl, d, vl),
     INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl),
     INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl),
+    INSN(cvtps2udq, , 0f, 79, vl, d, vl),
     INSN(cvtsd2si, f2, 0f, 2d, el, q, el),
+    INSN(cvtsd2usi, f2, 0f, 79, el, q, el),
     INSN(cvtsd2ss, f2, 0f, 5a, el, q, el),
     INSN(cvtsi2sd, f2, 0f, 2a, el, dq64, el),
     INSN(cvtsi2ss, f3, 0f, 2a, el, dq64, el),
     INSN(cvtss2sd, f3, 0f, 5a, el, d, el),
     INSN(cvtss2si, f3, 0f, 2d, el, d, el),
+    INSN(cvtss2usi, f3, 0f, 79, el, d, el),
     INSN(cvttpd2dq, 66, 0f, e6, vl, q, vl),
+    INSN(cvttpd2udq, , 0f, 78, vl, q, vl),
     INSN(cvttps2dq, f3, 0f, 5b, vl, d, vl),
+    INSN(cvttps2udq, , 0f, 78, vl, d, vl),
     INSN(cvttsd2si, f2, 0f, 2c, el, q, el),
+    INSN(cvttsd2usi, f2, 0f, 78, el, q, el),
     INSN(cvttss2si, f3, 0f, 2c, el, d, el),
+    INSN(cvttss2usi, f3, 0f, 78, el, d, el),
     INSN(cvtudq2pd, f3, 0f, 7a, vl_2, d, vl),
     INSN(cvtudq2ps, f2, 0f, 7a, vl, d, vl),
     INSN(cvtusi2sd, f2, 0f, 7b, el, dq64, el),
@@ -405,11 +413,15 @@ static const struct test avx512dq_all[]
     INSN_PFP(andn, 0f, 55),
     INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl),
     INSN(cvtpd2qq, 66, 0f, 7b, vl, q, vl),
+    INSN(cvtpd2uqq, 66, 0f, 79, vl, q, vl),
     INSN(cvtps2qq, 66, 0f, 7b, vl_2, d, vl),
+    INSN(cvtps2uqq, 66, 0f, 79, vl_2, d, vl),
     INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl),
     INSN(cvtqq2ps, , 0f, 5b, vl, q, vl),
     INSN(cvttpd2qq, 66, 0f, 7a, vl, q, vl),
+    INSN(cvttpd2uqq, 66, 0f, 78, vl, q, vl),
     INSN(cvttps2qq, 66, 0f, 7a, vl_2, d, vl),
+    INSN(cvttps2uqq, 66, 0f, 78, vl_2, d, vl),
     INSN(cvtuqq2pd, f3, 0f, 7a, vl, q, vl),
     INSN(cvtuqq2ps, f2, 0f, 7a, vl, q, vl),
     INSN_PFP(or, 0f, 56),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -93,31 +93,65 @@ static inline bool _to_bool(byte_vec_t b
 # ifdef __x86_64__
 # define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
 # endif
+# ifdef __AVX512F__
+/*
+ * Sadly even gcc 9.x, at the time of writing, does not carry out at least
+ * uint -> FP conversions using VCVTUSI2S{S,D}, so we need to use builtins
+ * or inline assembly here. The full-vector parameter types of the builtins
+ * aren't very helpful for our purposes, so use inline assembly.
+ */
+# if FLOAT_SIZE == 4
+# define to_u_int(type, x) ({ \
+    unsigned type u_; \
+    float __attribute__((vector_size(16))) t_; \
+    asm ( "vcvtss2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+    asm ( "vcvtusi2ss%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+    (vec_t){ t_[0] }; \
+})
+# elif FLOAT_SIZE == 8
+# define to_u_int(type, x) ({ \
+    unsigned type u_; \
+    double __attribute__((vector_size(16))) t_; \
+    asm ( "vcvtsd2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+    asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+    (vec_t){ t_[0] }; \
+})
+# endif
+# define to_uint(x) to_u_int(int, x)
+# ifdef __x86_64__
+# define to_uwint(x) to_u_int(long, x)
+# endif
+# endif
 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
 # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
 #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if FLOAT_SIZE == 4
 # define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
+# define to_uint(x) BR(cvtudq2ps, _mask, BR(cvtps2udq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
 # ifdef __AVX512DQ__
-# define to_wint(x) ({ \
+# define to_w_int(x, s) ({ \
     vsf_half_t t_ = low_half(x); \
     vdi_t lo_, hi_; \
     touch(t_); \
-    lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+    lo_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
     t_ = high_half(x); \
     touch(t_); \
-    hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+    hi_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
     touch(lo_); touch(hi_); \
     insert_half(insert_half(undef(), \
-                            BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
-                BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
+                            BR(cvt ## s ## qq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
+                BR(cvt ## s ## qq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
 })
+# define to_wint(x) to_w_int(x, )
+# define to_uwint(x) to_w_int(x, u)
 # endif
 # elif FLOAT_SIZE == 8
 # define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
+# define to_uint(x) B(cvtudq2pd, _mask, BR(cvtpd2udq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
 # ifdef __AVX512DQ__
 # define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
+# define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
 # endif
 # endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
@@ -1208,6 +1242,20 @@ int simd_test(void)
     touch(src);
     if ( !eq(x, src) ) return __LINE__;
 # endif
+
+# ifdef to_uint
+    touch(src);
+    x = to_uint(src);
+    touch(src);
+    if ( !eq(x, src) ) return __LINE__;
+# endif
+
+# ifdef to_uwint
+    touch(src);
+    x = to_uwint(src);
+    touch(src);
+    if ( !eq(x, src) ) return __LINE__;
+# endif
 
 # ifdef sqrt
     x = src * src;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -323,8 +323,7 @@ static const struct twobyte_table {
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
-    [0x78] = { ImplicitOps|ModRM },
-    [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
+    [0x78 ... 0x79] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl },
     [0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_dq64 },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -2507,6 +2506,8 @@ x86_decode_twobyte(
         break;
 
     case 0x78:
+        state->desc = ImplicitOps;
+        state->simd_size = simd_none;
         switch ( vex.pfx )
         {
         case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -2519,7 +2520,7 @@ x86_decode_twobyte(
     case 0x10 ... 0x18:
     case 0x28 ... 0x2f:
     case 0x50 ... 0x77:
-    case 0x79 ... 0x7d:
+    case 0x7a ... 0x7d:
     case 0x7f:
     case 0xc2 ... 0xc3:
     case 0xc5 ... 0xc6:
@@ -2541,6 +2542,12 @@ x86_decode_twobyte(
         op_bytes = mode_64bit() ? 8 : 4;
         break;
 
+    case 0x79:
+        state->desc = DstReg | SrcMem;
+        state->simd_size = simd_packed_int;
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        break;
+
     case 0x7e:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
@@ -3062,6 +3069,18 @@ x86_decode(
                 modrm_mod = 3;
                 break;
 
+            case 0x78:
+            case 0x79:
+                if ( !evex.pfx )
+                    break;
+                /* vcvt{,t}ps2uqq need special casing */
+                if ( evex.pfx == vex_66 )
+                {
+                    if ( !evex.w && !evex.br )
+                        --disp8scale;
+                    break;
+                }
+                /* vcvt{,t}s{s,d}2usi need special casing: fall through */
             case 0x2c: /* vcvtts{s,d}2si need special casing */
             case 0x2d: /* vcvts{s,d}2si need special casing */
                 if ( evex_encoded() )
@@ -6274,6 +6293,8 @@ x86_emulate(
 
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */
     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */
         generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
                                (ea.type != OP_REG && evex.br)),
                               EXC_UD);
@@ -6633,7 +6654,11 @@ x86_emulate(
         if ( evex.w )
             host_and_vcpu_must_have(avx512dq);
         else
+        {
+    case X86EMUL_OPC_EVEX(0x0f, 0x78): /* vcvttp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX(0x0f, 0x79): /* vcvtp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */
             host_and_vcpu_must_have(avx512f);
+        }
         if ( ea.type == OP_MEM || !evex.br )
             avx512_vlen_check(false);
         d |= TwoOp;
@@ -7311,6 +7336,10 @@ x86_emulate(
             host_and_vcpu_must_have(avx512f);
         else if ( evex.w )
         {
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x78): /* vcvttps2uqq {x,y}mm/mem,[xyz]mm{k} */
+                                          /* vcvttpd2uqq [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x79): /* vcvtps2uqq {x,y}mm/mem,[xyz]mm{k} */
+                                          /* vcvtpd2uqq [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x7a): /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */
                                           /* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x7b): /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.