[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 24/34] x86emul: support AVX512{F, BW, DQ} integer broadcast insns



Note that the pbroadcastw table entry in evex-disp8.c is slightly
different from what one would expect, due to it requiring EVEX.W to be
zero.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -150,6 +150,9 @@ static const struct test avx512f_all[] =
     INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+//       pbroadcast,   66, 0f38, 7c,          dq64
+    INSN(pbroadcastd,  66, 0f38, 58,    el,      d, el),
+    INSN(pbroadcastq,  66, 0f38, 59,    el,      q, el),
     INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
     INSN(pcmpeqd,      66,   0f, 76,    vl,      d, vl),
     INSN(pcmpeqq,      66, 0f38, 29,    vl,      q, vl),
@@ -208,6 +211,7 @@ static const struct test avx512f_128[] =
 
 static const struct test avx512f_no128[] = {
     INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
+    INSN(broadcasti32x4, 66, 0f38, 5a, el_4,  d, vl),
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
@@ -217,6 +221,7 @@ static const struct test avx512f_no128[]
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+    INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
     INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
@@ -236,6 +241,10 @@ static const struct test avx512bw_all[]
     INSN(paddw,       66,   0f, fd,    vl,   w, vl),
     INSN(pavgb,       66,   0f, e0,    vl,   b, vl),
     INSN(pavgw,       66,   0f, e3,    vl,   w, vl),
+    INSN(pbroadcastb, 66, 0f38, 78,    el,   b, el),
+//       pbroadcastb, 66, 0f38, 7a,          b
+    INSN(pbroadcastw, 66, 0f38, 79,    el_2, b, vl),
+//       pbroadcastw, 66, 0f38, 7b,          b
     INSN(pcmp,        66, 0f3a, 3f,    vl,  bw, vl),
     INSN(pcmpeqb,     66,   0f, 74,    vl,   b, vl),
     INSN(pcmpeqw,     66,   0f, 75,    vl,   w, vl),
@@ -287,6 +296,7 @@ static const struct test avx512bw_128[]
 static const struct test avx512dq_all[] = {
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
+    INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
     INSN_PFP(or,               0f, 56),
     INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
@@ -300,6 +310,7 @@ static const struct test avx512dq_128[]
 static const struct test avx512dq_no128[] = {
     INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+    INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
     INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
@@ -308,6 +319,7 @@ static const struct test avx512dq_no128[
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+    INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
     INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -272,9 +272,33 @@ static inline bool _to_bool(byte_vec_t b
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if INT_SIZE == 4 || UINT_SIZE == 4
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastd %1, %0" \
+          : "=v" (t_) : "m" (*(int[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(long long[1]){ x }) ); \
+    t_; \
+})
+#  ifdef __x86_64__
+#   define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \
+    t_; \
+})
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
@@ -971,10 +995,14 @@ int simd_test(void)
     if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
-#if defined(broadcast)
+#ifdef broadcast
     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#ifdef broadcast2
+    if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -452,9 +452,13 @@ static const struct ext0f38_table {
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
-    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
-    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+    [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
+    [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
+    [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+    [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x78] = { .simd_size = simd_other, .two_op = 1 },
+    [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
+    [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -2615,6 +2619,11 @@ x86_decode_0f38(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
+    case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */
+    case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */
+        break;
+
     case 0xf0: /* movbe / crc32 */
         state->desc |= repne_prefix() ? ByteOp : Mov;
         if ( rep_prefix() )
@@ -8182,6 +8191,8 @@ x86_emulate(
         goto avx512f_no_sae;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+        op_bytes = elem_bytes;
         generate_exception_if(evex.w || evex.br, EXC_UD);
     avx512_broadcast:
         /*
@@ -8200,17 +8211,27 @@ x86_emulate(
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
                                             /* vbroadcastf64x4 m256,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+                                            /* vbroadcasti64x4 m256,zmm{k} */
         generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
                                             /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(!evex.lr, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+                                            /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+        if ( b == 0x59 )
+            op_bytes = 8;
+        generate_exception_if(evex.br, EXC_UD);
         if ( !evex.w )
             host_and_vcpu_must_have(avx512dq);
         goto avx512_broadcast;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
                                             /* vbroadcastf64x2 m128,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcasti64x2 m128,{y,z}mm{k} */
         generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br,
                               EXC_UD);
         if ( evex.w )
@@ -8404,6 +8425,45 @@ x86_emulate(
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+        op_bytes = elem_bytes = 1 << (b & 1);
+        /* See the comment at the avx512_broadcast label. */
+        op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */
+        generate_exception_if((ea.type != OP_REG || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0xf8;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (src.val));
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.