|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Xen-devel] [PATCH v4 44/44] x86emul: support AVX512{F, DQ} FP-to-uint conversion insns
Along the lines of prior patches, VCVT{,T}PS2UQQ as well as
VCVT{,T}S{S,D}2USI need "manual" overrides of disp8scale.
The twobyte_table[] entries for opcodes 0x78 and 0x79 get altered, with their
prior values now put in place in x86_decode_twobyte().
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v4: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -101,21 +101,29 @@ static const struct test avx512f_all[] =
INSN(cvtdq2pd, f3, 0f, e6, vl_2, d, vl),
INSN(cvtdq2ps, , 0f, 5b, vl, d, vl),
INSN(cvtpd2dq, f2, 0f, e6, vl, q, vl),
+ INSN(cvtpd2udq, , 0f, 79, vl, q, vl),
INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl),
INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl),
INSN(cvtps2dq, 66, 0f, 5b, vl, d, vl),
INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl),
INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl),
+ INSN(cvtps2udq, , 0f, 79, vl, d, vl),
INSN(cvtsd2si, f2, 0f, 2d, el, q, el),
+ INSN(cvtsd2usi, f2, 0f, 79, el, q, el),
INSN(cvtsd2ss, f2, 0f, 5a, el, q, el),
INSN(cvtsi2sd, f2, 0f, 2a, el, dq64, el),
INSN(cvtsi2ss, f3, 0f, 2a, el, dq64, el),
INSN(cvtss2sd, f3, 0f, 5a, el, d, el),
INSN(cvtss2si, f3, 0f, 2d, el, d, el),
+ INSN(cvtss2usi, f3, 0f, 79, el, d, el),
INSN(cvttpd2dq, 66, 0f, e6, vl, q, vl),
+ INSN(cvttpd2udq, , 0f, 78, vl, q, vl),
INSN(cvttps2dq, f3, 0f, 5b, vl, d, vl),
+ INSN(cvttps2udq, , 0f, 78, vl, d, vl),
INSN(cvttsd2si, f2, 0f, 2c, el, q, el),
+ INSN(cvttsd2usi, f2, 0f, 78, el, q, el),
INSN(cvttss2si, f3, 0f, 2c, el, d, el),
+ INSN(cvttss2usi, f3, 0f, 78, el, d, el),
INSN(cvtudq2pd, f3, 0f, 7a, vl_2, d, vl),
INSN(cvtudq2ps, f2, 0f, 7a, vl, d, vl),
INSN(cvtusi2sd, f2, 0f, 7b, el, dq64, el),
@@ -405,11 +413,15 @@ static const struct test avx512dq_all[]
INSN_PFP(andn, 0f, 55),
INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl),
INSN(cvtpd2qq, 66, 0f, 7b, vl, q, vl),
+ INSN(cvtpd2uqq, 66, 0f, 79, vl, q, vl),
INSN(cvtps2qq, 66, 0f, 7b, vl_2, d, vl),
+ INSN(cvtps2uqq, 66, 0f, 79, vl_2, d, vl),
INSN(cvtqq2pd, f3, 0f, e6, vl, q, vl),
INSN(cvtqq2ps, , 0f, 5b, vl, q, vl),
INSN(cvttpd2qq, 66, 0f, 7a, vl, q, vl),
+ INSN(cvttpd2uqq, 66, 0f, 78, vl, q, vl),
INSN(cvttps2qq, 66, 0f, 7a, vl_2, d, vl),
+ INSN(cvttps2uqq, 66, 0f, 78, vl_2, d, vl),
INSN(cvtuqq2pd, f3, 0f, 7a, vl, q, vl),
INSN(cvtuqq2ps, f2, 0f, 7a, vl, q, vl),
INSN_PFP(or, 0f, 56),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -93,31 +93,65 @@ static inline bool _to_bool(byte_vec_t b
# ifdef __x86_64__
# define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
# endif
+# ifdef __AVX512F__
+/*
+ * Sadly even gcc 9.x, at the time of writing, does not carry out at least
+ * uint -> FP conversions using VCVTUSI2S{S,D}, so we need to use builtins
+ * or inline assembly here. The full-vector parameter types of the builtins
+ * aren't very helpful for our purposes, so use inline assembly.
+ */
+# if FLOAT_SIZE == 4
+# define to_u_int(type, x) ({ \
+ unsigned type u_; \
+ float __attribute__((vector_size(16))) t_; \
+ asm ( "vcvtss2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+ asm ( "vcvtusi2ss%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+ (vec_t){ t_[0] }; \
+})
+# elif FLOAT_SIZE == 8
+# define to_u_int(type, x) ({ \
+ unsigned type u_; \
+ double __attribute__((vector_size(16))) t_; \
+ asm ( "vcvtsd2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+ asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+ (vec_t){ t_[0] }; \
+})
+# endif
+# define to_uint(x) to_u_int(int, x)
+# ifdef __x86_64__
+# define to_uwint(x) to_u_int(long, x)
+# endif
+# endif
#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
#elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
# if FLOAT_SIZE == 4
# define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
+# define to_uint(x) BR(cvtudq2ps, _mask, BR(cvtps2udq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
# ifdef __AVX512DQ__
-# define to_wint(x) ({ \
+# define to_w_int(x, s) ({ \
vsf_half_t t_ = low_half(x); \
vdi_t lo_, hi_; \
touch(t_); \
- lo_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+ lo_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
t_ = high_half(x); \
touch(t_); \
- hi_ = BR(cvtps2qq, _mask, t_, (vdi_t)undef(), ~0); \
+ hi_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
touch(lo_); touch(hi_); \
insert_half(insert_half(undef(), \
- BR(cvtqq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
- BR(cvtqq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
+ BR(cvt ## s ## qq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
+ BR(cvt ## s ## qq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
})
+# define to_wint(x) to_w_int(x, )
+# define to_uwint(x) to_w_int(x, u)
# endif
# elif FLOAT_SIZE == 8
# define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
+# define to_uint(x) B(cvtudq2pd, _mask, BR(cvtpd2udq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
# ifdef __AVX512DQ__
# define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
+# define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
# endif
# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
@@ -1208,6 +1242,20 @@ int simd_test(void)
touch(src);
if ( !eq(x, src) ) return __LINE__;
# endif
+
+# ifdef to_uint
+ touch(src);
+ x = to_uint(src);
+ touch(src);
+ if ( !eq(x, src) ) return __LINE__;
+# endif
+
+# ifdef to_uwint
+ touch(src);
+ x = to_uwint(src);
+ touch(src);
+ if ( !eq(x, src) ) return __LINE__;
+# endif
# ifdef sqrt
x = src * src;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -323,8 +323,7 @@ static const struct twobyte_table {
[0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x77] = { DstImplicit|SrcNone },
- [0x78] = { ImplicitOps|ModRM },
- [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
+ [0x78 ... 0x79] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_vl },
[0x7a] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
[0x7b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_dq64 },
[0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -2507,6 +2506,8 @@ x86_decode_twobyte(
break;
case 0x78:
+ state->desc = ImplicitOps;
+ state->simd_size = simd_none;
switch ( vex.pfx )
{
case vex_66: /* extrq $imm8, $imm8, xmm */
@@ -2519,7 +2520,7 @@ x86_decode_twobyte(
case 0x10 ... 0x18:
case 0x28 ... 0x2f:
case 0x50 ... 0x77:
- case 0x79 ... 0x7d:
+ case 0x7a ... 0x7d:
case 0x7f:
case 0xc2 ... 0xc3:
case 0xc5 ... 0xc6:
@@ -2541,6 +2542,12 @@ x86_decode_twobyte(
op_bytes = mode_64bit() ? 8 : 4;
break;
+ case 0x79:
+ state->desc = DstReg | SrcMem;
+ state->simd_size = simd_packed_int;
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ break;
+
case 0x7e:
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
@@ -3062,6 +3069,18 @@ x86_decode(
modrm_mod = 3;
break;
+ case 0x78:
+ case 0x79:
+ if ( !evex.pfx )
+ break;
+ /* vcvt{,t}ps2uqq need special casing */
+ if ( evex.pfx == vex_66 )
+ {
+ if ( !evex.w && !evex.br )
+ --disp8scale;
+ break;
+ }
+ /* vcvt{,t}s{s,d}2usi need special casing: fall through */
case 0x2c: /* vcvtts{s,d}2si need special casing */
case 0x2d: /* vcvts{s,d}2si need special casing */
if ( evex_encoded() )
@@ -6274,6 +6293,8 @@ x86_emulate(
CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */
CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
+ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */
+ CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */
generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
(ea.type != OP_REG && evex.br)),
EXC_UD);
@@ -6633,7 +6654,11 @@ x86_emulate(
if ( evex.w )
host_and_vcpu_must_have(avx512dq);
else
+ {
+ case X86EMUL_OPC_EVEX(0x0f, 0x78): /* vcvttp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX(0x0f, 0x79): /* vcvtp{s,d}2udq [xyz]mm/mem,[xyz]mm{k} */
host_and_vcpu_must_have(avx512f);
+ }
if ( ea.type == OP_MEM || !evex.br )
avx512_vlen_check(false);
d |= TwoOp;
@@ -7311,6 +7336,10 @@ x86_emulate(
host_and_vcpu_must_have(avx512f);
else if ( evex.w )
{
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x78): /* vcvttps2uqq {x,y}mm/mem,[xyz]mm{k} */
+ /* vcvttpd2uqq [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x79): /* vcvtps2uqq {x,y}mm/mem,[xyz]mm{k} */
+ /* vcvtpd2uqq [xyz]mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0x7a): /* vcvttps2qq {x,y}mm/mem,[xyz]mm{k} */
/* vcvttpd2qq [xyz]mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0x7b): /* vcvtps2qq {x,y}mm/mem,[xyz]mm{k} */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |