[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v7 18/49] x86emul: support AVX512{F, BW, _VBMI} permute insns
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v7: Re-base. v5: Re-base over changes earlier in the series. v4: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -178,6 +178,10 @@ static const struct test avx512f_all[] = INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl), INSN(permi2, 66, 0f38, 76, vl, dq, vl), INSN(permi2, 66, 0f38, 77, vl, sd, vl), + INSN(permilpd, 66, 0f38, 0d, vl, q, vl), + INSN(permilpd, 66, 0f3a, 05, vl, q, vl), + INSN(permilps, 66, 0f38, 0c, vl, d, vl), + INSN(permilps, 66, 0f3a, 04, vl, d, vl), INSN(permt2, 66, 0f38, 7e, vl, dq, vl), INSN(permt2, 66, 0f38, 7f, vl, sd, vl), INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl), @@ -278,6 +282,10 @@ static const struct test avx512f_no128[] INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl), INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl), INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl), + INSN(perm, 66, 0f38, 36, vl, dq, vl), + INSN(perm, 66, 0f38, 16, vl, sd, vl), + INSN(permpd, 66, 0f3a, 01, vl, q, vl), + INSN(permq, 66, 0f3a, 00, vl, q, vl), INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl), INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl), INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl), @@ -316,6 +324,7 @@ static const struct test avx512bw_all[] INSN(pcmpgtb, 66, 0f, 64, vl, b, vl), INSN(pcmpgtw, 66, 0f, 65, vl, w, vl), INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl), + INSN(permw, 66, 0f38, 8d, vl, w, vl), INSN(permi2w, 66, 0f38, 75, vl, w, vl), INSN(permt2w, 66, 0f38, 7d, vl, w, vl), INSN(pmaddwd, 66, 0f, f5, vl, w, vl), @@ -412,6 +421,7 @@ static const struct test avx512dq_512[] }; static const struct test avx512_vbmi_all[] = { + INSN(permb, 66, 0f38, 8d, vl, b, vl), INSN(permi2b, 66, 0f38, 75, vl, b, vl), INSN(permt2b, 66, 0f38, 7d, vl, b, vl), }; --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -186,6 +186,7 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0) # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0) +# define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0) # else # define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0) # define insert_pair(x, y, p) \ @@ -200,6 +201,10 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \ }) +# define swap2(x) B(vpermilps, _mask, \ + B(shuf_f32x4_, _mask, x, x, \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \ + 0b00011011, undef(), ~0) # endif # elif FLOAT_SIZE == 8 # if VEC_SIZE >= 32 @@ -233,6 +238,7 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0) # define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0) +# define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0) # else # define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0) # define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0) @@ -240,6 +246,10 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \ }) +# define swap2(x) B(vpermilpd, _mask, \ + B(shuf_f64x2_, _mask, x, x, \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \ + 0b01010101, undef(), ~0) # endif # endif #elif FLOAT_SIZE == 4 && defined(__SSE__) @@ -405,6 +415,7 @@ static inline bool _to_bool(byte_vec_t b B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \ 0b00011011, (vsi_t)undef(), ~0)) +# define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) @@ -442,8 +453,17 @@ static inline bool _to_bool(byte_vec_t b (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \ 0b01001110, (vsi_t)undef(), ~0)) +# define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) +# if VEC_SIZE == 32 +# define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0)) +# elif VEC_SIZE == 64 +# define swap3(x) ({ \ + vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \ + B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \ +}) +# endif # endif # if INT_SIZE == 4 # define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0) @@ -489,6 +509,9 @@ static inline bool _to_bool(byte_vec_t b # define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0)) # define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0)) # define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0)) +# ifdef __AVX512VBMI__ +# define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0)) +# endif # elif INT_SIZE == 2 || UINT_SIZE == 2 # define broadcast(x) ({ \ vec_t t_; \ @@ -517,6 +540,7 @@ static inline bool _to_bool(byte_vec_t b (0b01010101010101010101010101010101 & ALL_TRUE))) # define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0)) # define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0)) +# define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0)) # endif # if INT_SIZE == 1 # define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)) @@ -1325,6 +1349,12 @@ int simd_test(void) if ( !eq(swap2(src), inv) ) return __LINE__; #endif +#ifdef swap3 + touch(src); + if ( !eq(swap3(src), inv) ) return __LINE__; + touch(src); +#endif + #ifdef broadcast if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__; #endif --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -275,6 +275,8 @@ OVR(movlps); OVR_VFP(movnt); OVR_VFP(movu); OVR_FP(mul); +OVR_VFP(perm); +OVR_VFP(permil); OVR_VFP(shuf); OVR_INT(sll); OVR_DQ(sllv); @@ -331,6 +333,8 @@ OVR(movntdq); OVR(movntdqa); OVR(movshdup); OVR(movsldup); +OVR(permd); +OVR(permq); OVR(pmovsxbd); OVR(pmovsxbq); OVR(pmovsxdq); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -434,7 +434,8 @@ static const struct ext0f38_table { } ext0f38_table[256] = { [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x01 ... 0x0b] = { .simd_size = simd_packed_int }, - [0x0c ... 0x0f] = { .simd_size = simd_packed_fp }, + [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, + [0x0e ... 0x0f] = { .simd_size = simd_packed_fp }, [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x13] = { .simd_size = simd_other, .two_op = 1 }, [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, @@ -477,6 +478,7 @@ static const struct ext0f38_table { [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x8c] = { .simd_size = simd_packed_int }, + [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, @@ -522,10 +524,10 @@ static const struct ext0f3a_table { uint8_t four_op:1; disp8scale_t d8s:4; } ext0f3a_table[256] = { - [0x00] = { .simd_size = simd_packed_int, .two_op = 1 }, - [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl }, + [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0x02] = { .simd_size = simd_packed_int }, - [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0x06] = { .simd_size = simd_packed_fp }, [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 }, [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc }, @@ -8091,6 +8093,9 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */ generate_exception_if(evex.brs, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x0c): /* vpermilps [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x0d): /* vpermilpd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ fault_suppression = false; if ( b == 0xe2 ) goto avx512f_no_sae; @@ -8436,6 +8441,12 @@ x86_emulate( generate_exception_if(!vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + generate_exception_if(!evex.lr, EXC_UD); + fault_suppression = false; + goto avx512f_no_sae; + case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */ @@ -8641,6 +8652,7 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ if ( !evex.w ) host_and_vcpu_must_have(avx512_vbmi); else @@ -9066,6 +9078,12 @@ x86_emulate( generate_exception_if(!vex.l || !vex.w, EXC_UD); goto simd_0f_imm8_avx2; + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x00): /* vpermq $imm8,{y,z}mm/mem,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x01): /* vpermpd $imm8,{y,z}mm/mem,{y,z}mm{k} */ + generate_exception_if(!evex.lr || !evex.w, EXC_UD); + fault_suppression = false; + goto avx512f_imm8_no_sae; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */ @@ -9085,6 +9103,12 @@ x86_emulate( generate_exception_if(vex.w, EXC_UD); goto simd_0f_imm8_avx; + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x04): /* vpermilps $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,[xyz]mm/mem,[xyz]mm{k} */ + generate_exception_if(evex.w != (b & 1), EXC_UD); + fault_suppression = false; + goto avx512f_imm8_no_sae; + case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */ case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */ case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |