[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v7 12/49] x86emul: support AVX512{F, BW} integer shuffle insns
Also include vshuff{32x4,64x2} as being very similar to vshufi{32x4,64x2}. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v7: Disable fault suppression for VPSHUF{D,{H,L}W}. Re-base. v6: Re-base over changes earlier in the series. v5: Re-base over changes earlier in the series. v4: Move OVR() addition into __AVX512VL__ conditional. Correct comments. v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -214,6 +214,7 @@ static const struct test avx512f_all[] = INSN(prolv, 66, 0f38, 15, vl, dq, vl), INSNX(pror, 66, 0f, 72, 0, vl, dq, vl), INSN(prorv, 66, 0f38, 14, vl, dq, vl), + INSN(pshufd, 66, 0f, 70, vl, d, vl), INSN(pslld, 66, 0f, f2, el_4, d, vl), INSNX(pslld, 66, 0f, 72, 6, vl, d, vl), INSN(psllq, 66, 0f, f3, el_2, q, vl), @@ -264,6 +265,10 @@ static const struct test avx512f_no128[] INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl), INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl), INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl), + INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl), + INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl), + INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl), + INSN(shufi64x2, 66, 0f3a, 43, vl, q, vl), }; static const struct test avx512f_512[] = { @@ -318,6 +323,9 @@ static const struct test avx512bw_all[] INSN(pmulhw, 66, 0f, e5, vl, w, vl), INSN(pmullw, 66, 0f, d5, vl, w, vl), INSN(psadbw, 66, 0f, f6, vl, b, vl), + INSN(pshufb, 66, 0f38, 00, vl, b, vl), + INSN(pshufhw, f3, 0f, 70, vl, w, vl), + INSN(pshuflw, f2, 0f, 70, vl, w, vl), INSNX(pslldq, 66, 0f, 73, 7, vl, b, vl), INSN(psllvw, 66, 0f38, 12, vl, w, vl), INSN(psllw, 66, 0f, f1, el_8, w, vl), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -153,6 +153,10 @@ static inline bool _to_bool(byte_vec_t b # else # define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0) # define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0) +# define swap(x) ({ \ + vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ + B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \ +}) # endif # elif FLOAT_SIZE == 8 # if VEC_SIZE >= 32 @@ -181,6 +185,10 @@ static inline bool _to_bool(byte_vec_t b # else # define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0) # define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0) +# define swap(x) ({ \ + vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ + B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \ +}) # endif # endif #elif FLOAT_SIZE == 4 && defined(__SSE__) @@ -309,9 +317,14 @@ static inline bool _to_bool(byte_vec_t b # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0)) +# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0)) # else # define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0)) # define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0)) +# define swap(x) ((vec_t)B(pshufd, _mask, \ + B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \ + 0b00011011, (vsi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) @@ -333,9 +346,14 @@ static inline bool _to_bool(byte_vec_t b # if VEC_SIZE == 16 # define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) # define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0)) +# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0)) # else # define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0)) # define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0)) +# define swap(x) ((vec_t)B(pshufd, _mask, \ + (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \ + 0b01001110, (vsi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) # endif --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -119,6 +119,12 @@ typedef long long __attribute__((vector_ #ifdef __AVX512F__ +/* Sadly there are a few exceptions to the general naming rules. */ +# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask +# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask +# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask +# define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask + # if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64 # pragma GCC target ( "avx512vl" ) # endif @@ -262,6 +268,7 @@ OVR(pmovzxwq); OVR(pmulld); OVR(pmuldq); OVR(pmuludq); +OVR(pshufd); OVR(punpckhdq); OVR(punpckhqdq); OVR(punpckldq); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -318,7 +318,7 @@ static const struct twobyte_table { [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl }, [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 }, [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl }, - [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other }, + [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl }, [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl }, [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl }, [0x77] = { DstImplicit|SrcNone }, @@ -432,7 +432,8 @@ static const struct ext0f38_table { uint8_t vsib:1; disp8scale_t d8s:4; } ext0f38_table[256] = { - [0x00 ... 0x0b] = { .simd_size = simd_packed_int }, + [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, + [0x01 ... 0x0b] = { .simd_size = simd_packed_int }, [0x0c ... 0x0f] = { .simd_size = simd_packed_fp }, [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x13] = { .simd_size = simd_other, .two_op = 1 }, @@ -543,6 +544,7 @@ static const struct ext0f3a_table { [0x20] = { .simd_size = simd_none, .d8s = 0 }, [0x21] = { .simd_size = simd_other, .d8s = 2 }, [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 }, + [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 }, [0x38] = { .simd_size = simd_128, .d8s = 4 }, @@ -552,6 +554,7 @@ static const struct ext0f3a_table { [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x40 ... 0x41] = { .simd_size = simd_packed_fp }, [0x42] = { .simd_size = simd_packed_int }, + [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x44] = { .simd_size = simd_packed_int }, [0x46] = { .simd_size = simd_packed_int }, [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 }, @@ -6689,6 +6692,7 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ fault_suppression = false; /* fall through */ case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ @@ -6944,6 +6948,21 @@ x86_emulate( insn_bytes = PFX_BYTES + 3; break; + case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */ + if ( evex.pfx == vex_66 ) + generate_exception_if(evex.w, EXC_UD); + else + { + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.brs, EXC_UD); + } + d = (d & ~SrcMask) | SrcMem | TwoOp; + op_bytes = 16 << evex.lr; + fault_suppression = false; + goto avx512f_imm8_no_sae; + CASE_SIMD_PACKED_INT(0x0f, 0x71): /* Grp12 */ case X86EMUL_OPC_VEX_66(0x0f, 0x71): CASE_SIMD_PACKED_INT(0x0f, 0x72): /* Grp13 */ @@ -9138,7 +9157,13 @@ x86_emulate( /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */ if ( evex.w ) host_and_vcpu_must_have(avx512dq); - generate_exception_if(!evex.lr || evex.brs, EXC_UD); + generate_exception_if(evex.brs, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + /* vshuff64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + /* vshufi64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + generate_exception_if(!evex.lr, EXC_UD); fault_suppression = false; goto avx512f_imm8_no_sae; _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |