[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v6 25/42] x86emul: support remaining AVX512F legacy-equivalent insns



Plus their AVX512BW counterparts.

Take the opportunity to also eliminate a pair of open-coded instances
of scalar_1op().

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v6: Re-base over changes earlier in the series.
v5: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -193,6 +193,8 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN(pabsd,        66, 0f38, 1e,    vl,      d, vl),
+    INSN(pabsq,        66, 0f38, 1f,    vl,      q, vl),
     INSN(paddd,        66,   0f, fe,    vl,      d, vl),
     INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
@@ -276,6 +278,10 @@ static const struct test avx512f_all[] =
     INSN(punpckldq,    66,   0f, 62,    vl,      d, vl),
     INSN(punpcklqdq,   66,   0f, 6c,    vl,      q, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
+    INSN(rndscalepd,   66, 0f3a, 09,    vl,      q, vl),
+    INSN(rndscaleps,   66, 0f3a, 08,    vl,      d, vl),
+    INSN(rndscalesd,   66, 0f3a, 0b,    el,      q, el),
+    INSN(rndscaless,   66, 0f3a, 0a,    el,      d, el),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
@@ -336,6 +342,8 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
+    INSN(pabsb,       66, 0f38, 1c,    vl,    b, vl),
+    INSN(pabsw,       66, 0f38, 1d,    vl,    w, vl),
     INSN(packssdw,    66,   0f, 6b,    vl, d_nb, vl),
     INSN(packsswb,    66,   0f, 63,    vl,    w, vl),
     INSN(packusdw,    66, 0f38, 2b,    vl, d_nb, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -211,8 +211,10 @@ static inline vec_t movlhps(vec_t x, vec
 #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
 # if FLOAT_SIZE == 4
 #  define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
+#  define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]")
 # elif FLOAT_SIZE == 8
 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
+#  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
 # endif
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
@@ -263,6 +265,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
 #  define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
 #  define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+#  define trunc(x) BR(rndscaleps_, _mask, x, 0b1011, undef(), ~0)
 #  define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
@@ -316,6 +319,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
 #  define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
+#  define trunc(x) BR(rndscalepd_, _mask, x, 0b1011, undef(), ~0)
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
 #   define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
@@ -548,6 +552,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  endif
 # endif
 # if INT_SIZE == 4
+#  define abs(x) B(pabsd, _mask, x, undef(), ~0)
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
 #  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
 #  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
@@ -558,6 +563,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), 
(vdi_t)undef(), ~0))
 #  define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), 
(vdi_t)undef(), ~0))
 # elif INT_SIZE == 8
+#  define abs(x) ((vec_t)B(pabsq, _mask, (vdi_t)(x), (vdi_t)undef(), ~0))
 #  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), 
(vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), 
(vdi_t)undef(), ~0))
 # elif UINT_SIZE == 8
@@ -625,6 +631,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), 
(vhi_t)undef(), ~0))
 # endif
 # if INT_SIZE == 1
+#  define abs(x) ((vec_t)B(pabsb, _mask, (vqi_t)(x), (vqi_t)undef(), ~0))
 #  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
 #  define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), 
(vhi_t)undef(), ~0))
@@ -637,6 +644,7 @@ static inline vec_t movlhps(vec_t x, vec
 #  define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), 
(vsi_t)undef(), ~0))
 #  define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), 
(vdi_t)undef(), ~0))
 # elif INT_SIZE == 2
+#  define abs(x) B(pabsw, _mask, x, undef(), ~0)
 #  define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
 #  define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
 #  define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
@@ -948,19 +956,11 @@ static inline vec_t movlhps(vec_t x, vec
 #if VEC_SIZE == FLOAT_SIZE
 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ 
? x_ : y_; })})
 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ 
? x_ : y_; })})
-# ifdef __SSE4_1__
+# if defined(__SSE4_1__) && !defined(__AVX512F__)
 #  if FLOAT_SIZE == 4
-#   define trunc(x) ({ \
-    float __attribute__((vector_size(16))) r_; \
-    asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
-    (vec_t){ r_[0] }; \
-})
+#   define trunc(x) scalar_1op(x, "roundss $0b1011, %[in], %[out]")
 #  elif FLOAT_SIZE == 8
-#   define trunc(x) ({ \
-    double __attribute__((vector_size(16))) r_; \
-    asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
-    (vec_t){ r_[0] }; \
-})
+#   define trunc(x) scalar_1op(x, "roundsd $0b1011, %[in], %[out]")
 #  endif
 # endif
 #endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -184,6 +184,8 @@ DECL_OCTET(half);
 # define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
 # define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
 # define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
+# define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
+# define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
 # define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 # define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 # define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -245,6 +247,7 @@ OVR_INT(broadcast);
 OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_VFP(cvtdq2);
+OVR_INT(abs);
 OVR_FP(add);
 OVR_INT(add);
 OVR_BW(adds);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -446,7 +446,7 @@ static const struct ext0f38_table {
     [0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
     [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
-    [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x1c ... 0x1f] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = 
d8s_vl },
     [0x20] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x21] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
     [0x22] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
@@ -531,8 +531,8 @@ static const struct ext0f3a_table {
     [0x02] = { .simd_size = simd_packed_int },
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = 
d8s_vl },
     [0x06] = { .simd_size = simd_packed_fp },
-    [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
-    [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
+    [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = 
d8s_vl },
+    [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc, .d8s = d8s_dq },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 0 },
@@ -6874,6 +6874,8 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf9): /* vpsubw 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfc): /* vpaddb 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfd): /* vpaddw 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1c): /* vpabsb [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1d): /* vpabsw [xyz]mm/mem,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512bw);
         generate_exception_if(evex.br, EXC_UD);
         elem_bytes = 1 << (b & 1);
@@ -8257,6 +8259,8 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd 
[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1e): /* vpabsd [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1f): /* vpabsq [xyz]mm/mem,[xyz]mm{k} */
         generate_exception_if(evex.w != (b & 1), EXC_UD);
         goto avx512f_no_sae;
 
@@ -9286,6 +9290,17 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f3a_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0a): /* vrndscaless 
$imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0b): /* vrndscalesd 
$imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(ea.type != OP_REG && evex.br, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x08): /* vrndscaleps 
$imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x09): /* vrndscalepd 
$imm8,[xyz]mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(evex.w != (b & 1), EXC_UD);
+        avx512_vlen_check(b & 2);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.