[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Xen-devel] [PATCH v3 24/34] x86emul: support AVX512{F, BW, DQ} integer broadcast insns
Note that the pbroadcastw table entry in evex-disp8.c is slightly different from what one would expect, due to it requiring EVEX.W to be zero. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -150,6 +150,9 @@ static const struct test avx512f_all[] = INSN(paddq, 66, 0f, d4, vl, q, vl), INSN(pand, 66, 0f, db, vl, dq, vl), INSN(pandn, 66, 0f, df, vl, dq, vl), +// pbroadcast, 66, 0f38, 7c, dq64 + INSN(pbroadcastd, 66, 0f38, 58, el, d, el), + INSN(pbroadcastq, 66, 0f38, 59, el, q, el), INSN(pcmp, 66, 0f3a, 1f, vl, dq, vl), INSN(pcmpeqd, 66, 0f, 76, vl, d, vl), INSN(pcmpeqq, 66, 0f38, 29, vl, q, vl), @@ -208,6 +211,7 @@ static const struct test avx512f_128[] = static const struct test avx512f_no128[] = { INSN(broadcastf32x4, 66, 0f38, 1a, el_4, d, vl), + INSN(broadcasti32x4, 66, 0f38, 5a, el_4, d, vl), INSN(broadcastsd, 66, 0f38, 19, el, q, el), INSN(extractf32x4, 66, 0f3a, 19, el_4, d, vl), INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl), @@ -217,6 +221,7 @@ static const struct test avx512f_no128[] static const struct test avx512f_512[] = { INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl), + INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl), INSN(extractf64x4, 66, 0f3a, 1b, el_4, q, vl), INSN(extracti64x4, 66, 0f3a, 3b, el_4, q, vl), INSN(insertf64x4, 66, 0f3a, 1a, el_4, q, vl), @@ -236,6 +241,10 @@ static const struct test avx512bw_all[] INSN(paddw, 66, 0f, fd, vl, w, vl), INSN(pavgb, 66, 0f, e0, vl, b, vl), INSN(pavgw, 66, 0f, e3, vl, w, vl), + INSN(pbroadcastb, 66, 0f38, 78, el, b, el), +// pbroadcastb, 66, 0f38, 7a, b + INSN(pbroadcastw, 66, 0f38, 79, el_2, b, vl), +// pbroadcastw, 66, 0f38, 7b, b INSN(pcmp, 66, 0f3a, 3f, vl, bw, vl), INSN(pcmpeqb, 66, 0f, 74, vl, b, vl), INSN(pcmpeqw, 66, 0f, 75, vl, w, vl), @@ -287,6 +296,7 @@ static const struct test avx512bw_128[] static const struct test avx512dq_all[] = { INSN_PFP(and, 0f, 54), INSN_PFP(andn, 0f, 55), + 
INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl), INSN_PFP(or, 0f, 56), INSN(pmullq, 66, 0f38, 40, vl, q, vl), INSN_PFP(xor, 0f, 57), @@ -300,6 +310,7 @@ static const struct test avx512dq_128[] static const struct test avx512dq_no128[] = { INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl), INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl), + INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl), INSN(extractf64x2, 66, 0f3a, 19, el_2, q, vl), INSN(extracti64x2, 66, 0f3a, 39, el_2, q, vl), INSN(insertf64x2, 66, 0f3a, 18, el_2, q, vl), @@ -308,6 +319,7 @@ static const struct test avx512dq_no128[ static const struct test avx512dq_512[] = { INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl), + INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl), INSN(extractf32x8, 66, 0f3a, 1b, el_8, d, vl), INSN(extracti32x8, 66, 0f3a, 3b, el_8, d, vl), INSN(insertf32x8, 66, 0f3a, 1a, el_8, d, vl), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -272,9 +272,33 @@ static inline bool _to_bool(byte_vec_t b #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \ defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__)) # if INT_SIZE == 4 || UINT_SIZE == 4 +# define broadcast(x) ({ \ + vec_t t_; \ + asm ( "%{evex%} vpbroadcastd %1, %0" \ + : "=v" (t_) : "m" (*(int[1]){ x }) ); \ + t_; \ +}) +# define broadcast2(x) ({ \ + vec_t t_; \ + asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \ + t_; \ +}) # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) # elif INT_SIZE == 8 || UINT_SIZE == 8 +# define broadcast(x) ({ \ + vec_t t_; \ + asm ( "%{evex%} vpbroadcastq %1, %0" \ + : "=v" (t_) : "m" (*(long long[1]){ x }) ); \ + t_; \ +}) +# ifdef __x86_64__ +# define broadcast2(x) ({ \ + vec_t t_; \ + asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \ + t_; \ +}) +# endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) # endif # if 
INT_SIZE == 4 @@ -971,10 +995,14 @@ int simd_test(void) if ( !eq(swap2(src), inv) ) return __LINE__; #endif -#if defined(broadcast) +#ifdef broadcast if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__; #endif +#ifdef broadcast2 + if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__; +#endif + #if defined(interleave_lo) && defined(interleave_hi) touch(src); x = interleave_lo(inv, src); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -452,9 +452,13 @@ static const struct ext0f38_table { [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, - [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 }, - [0x5a] = { .simd_size = simd_128, .two_op = 1 }, - [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 }, + [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 }, + [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 }, + [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 }, + [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 }, + [0x78] = { .simd_size = simd_other, .two_op = 1 }, + [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 }, + [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 }, [0x8c] = { .simd_size = simd_packed_int }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, @@ -2615,6 +2619,11 @@ x86_decode_0f38( ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); break; + case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */ + case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */ + case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */ + break; + case 0xf0: /* movbe / crc32 */ state->desc |= repne_prefix() ? 
ByteOp : Mov; if ( rep_prefix() ) @@ -8182,6 +8191,8 @@ x86_emulate( goto avx512f_no_sae; case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */ + op_bytes = elem_bytes; generate_exception_if(evex.w || evex.br, EXC_UD); avx512_broadcast: /* @@ -8200,17 +8211,27 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */ /* vbroadcastf64x4 m256,zmm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */ + /* vbroadcasti64x4 m256,zmm{k} */ generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD); /* fall through */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */ /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */ - generate_exception_if(!evex.lr || evex.br, EXC_UD); + generate_exception_if(!evex.lr, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */ + /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */ + if ( b == 0x59 ) + op_bytes = 8; + generate_exception_if(evex.br, EXC_UD); if ( !evex.w ) host_and_vcpu_must_have(avx512dq); goto avx512_broadcast; case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */ /* vbroadcastf64x2 m128,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */ + /* vbroadcasti64x2 m128,{y,z}mm{k} */ generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br, EXC_UD); if ( evex.w ) @@ -8404,6 +8425,45 @@ x86_emulate( generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */ + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.w || evex.br, EXC_UD); + op_bytes = elem_bytes = 1 << (b & 1); + /* See the comment at the avx512_broadcast label. 
*/ + op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask); + goto avx512f_no_sae; + + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */ + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.w, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */ + generate_exception_if((ea.type != OP_REG || evex.br || + evex.reg != 0xf || !evex.RX), + EXC_UD); + host_and_vcpu_must_have(avx512f); + avx512_vlen_check(false); + get_fpu(X86EMUL_FPU_zmm); + + opc = init_evex(stub); + opc[0] = b; + /* Convert GPR source to %rAX. */ + evex.b = 1; + if ( !mode_64bit() ) + evex.w = 0; + opc[1] = modrm & 0xf8; + insn_bytes = EVEX_PFX_BYTES + 2; + opc[2] = 0xc3; + + copy_EVEX(opc, evex); + invoke_stub("", "", "+m" (src.val) : "a" (src.val)); + + put_stub(stub); + ASSERT(!state->simd_size); + break; + case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */ generate_exception_if(ea.type != OP_MEM, EXC_UD); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |