[Xen-devel] [PATCH v5 02/47] x86emul: support basic AVX512 moves
Note: SDM Vol 2 rev 067 is not really consistent about EVEX.L'L for LIG
insns - the only place where this is made explicit is a table in the
section titled "Vector Length Orthogonality": While they tolerate 0, 1,
and 2, a value of 3 uniformly leads to #UD.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v5: Use IMPOSSIBLE() to guard against division by zero. Correct style.
    Re-base.
v4: Introduce d8s_dq64 to deal with 32-bit mode VMOVD with EVEX.W set.
    Adjust a comment.
v3: Restrict k-reg reading to insns with memory operand. Shrink scope of
    "disp8scale".
v2: Move "full" into more narrow scope.
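[Editorial note, not part of the patch: for readers unfamiliar with EVEX
compressed displacements, here is a minimal C sketch of the Disp8*N rule
that the new disp8scale handling implements; the helper name and its
parameters are made up for illustration.]

    #include <stdint.h>
    #include <stdbool.h>

    /* Illustration only: for a full-vector load/store N is the vector
     * length in bytes, while embedded broadcast shrinks N to the element
     * size.  decode_disp8scale() below expresses log2(N) as a shift count.
     */
    static int32_t evex_disp8_to_offset(int8_t disp8, unsigned int vl_bytes,
                                        unsigned int elem_bytes, bool bcst)
    {
        unsigned int n = bcst ? elem_bytes : vl_bytes;

        return (int32_t)disp8 * (int32_t)n;
    }

    /* Example: disp8 == 1 on a 512-bit vmovdqu32 means an offset of 64,
     * matching the "vmovdqu32 64(%edx),%zmm2" test below; the emulator
     * gets there via "ea.mem.off += insn_fetch_type(int8_t) << disp8scale".
     */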
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1985,6 +1985,53 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm1,32(%edx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovq_to_mem, "%{evex%} vmovq %%xmm1, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(evex_vmovq_to_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem) ||
+             *((uint64_t *)res + 4) ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovq 32(%edx),%xmm0...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+                       put_insn(evex_vmovq_from_mem, "%{evex%} vmovq 32(%0), %%xmm0")
+                       :: "d" (NULL) );
+
+        set_insn(evex_vmovq_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_from_mem) )
+            goto fail;
+        asm ( "vmovq %1, %%xmm1\n\t"
+              "vpcmpeqq %%zmm0, %%zmm1, %%k0\n"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
@@ -2085,6 +2132,118 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovdqu32 %zmm2,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqd %%ymm2, %%ymm2, %%ymm2\n\t"
+                       "kmovw %1,%%k1\n"
+                       put_insn(vmovdqu32_to_mem,
+                                "vmovdqu32 %%zmm2, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu32_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu32_to_mem) )
+            goto fail;
+
+        res[16] = ~0; res[18] = ~0; res[20] = ~0; res[22] = ~0;
+        res[24] = 0; res[26] = 0; res[28] = 0; res[30] = 0;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu32 64(%edx),%zmm2{%k2}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_from_mem);
+
+        asm volatile ( "knotw %%k1, %%k2\n"
+                       put_insn(vmovdqu32_from_mem,
+                                "vmovdqu32 64(%0), %%zmm2%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu32_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu32_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqd %1, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 %zmm3,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqw %%ymm3, %%ymm3, %%ymm3\n\t"
+                       "kmovd %1,%%k1\n"
+                       put_insn(vmovdqu16_to_mem,
+                                "vmovdqu16 %%zmm3, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu16_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu16_to_mem) )
+            goto fail;
+
+        for ( i = 16; i < 24; ++i )
+            res[i] |= 0x0000ffff;
+        for ( ; i < 32; ++i )
+            res[i] &= 0xffff0000;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 64(%edx),%zmm3{%k2}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_from_mem);
+
+        asm volatile ( "knotd %%k1, %%k2\n"
+                       put_insn(vmovdqu16_from_mem,
+                                "vmovdqu16 64(%0), %%zmm3%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu16_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu16_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqw %1, %%zmm3, %%k0\n\t"
+              "kmovd %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
     memset(res, 0x77, 64);
     memset(res + 10, 0x66, 8);
@@ -2186,6 +2345,71 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovsd %xmm5,16(%ecx){%k3}...");
+    memset(res, 0x88, 128);
+    memset(res + 20, 0x77, 8);
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovsd_masked_to_mem);
+
+        asm volatile ( "vbroadcastsd %0, %%ymm5\n\t"
+                       "kxorw %%k3, %%k3, %%k3\n"
+                       put_insn(vmovsd_masked_to_mem,
+                                "vmovsd %%xmm5, 16(%1)%{%%k3%}")
+                       :: "m" (res[20]), "c" (NULL) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = 0;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) )
+            goto fail;
+
+        asm volatile ( "kmovw %0, %%k3\n" :: "m" (res[20]) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) ||
+             memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+    {
+        printf("skipped\n");
+        memset(res + 4, 0x77, 8);
+    }
+
+    printf("%-40s", "Testing vmovaps (%edx),%zmm7{%k3}{z}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovaps_masked_from_mem);
+
+        asm volatile ( "vpcmpeqd %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vbroadcastss %%xmm7, %%zmm7\n"
+                       put_insn(vmovaps_masked_from_mem,
+                                "vmovaps (%0), %%zmm7%{%%k3%}%{z%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovaps_masked_from_mem);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovaps_masked_from_mem) )
+            goto fail;
+        asm ( "vcmpeqps %1, %%zmm7, %%k0\n\t"
+              "vxorps %%xmm0, %%xmm0, %%xmm0\n\t"
+              "vcmpeqps %%zmm0, %%zmm7, %%k1\n\t"
+              "kxorw %%k1, %%k0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[16]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2341,6 +2565,55 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm3,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_mem);
+
+        asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+                       put_insn(evex_vmovd_to_mem,
+                                "%{evex%} vmovd %%xmm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd 32(%ecx),%xmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovd_from_mem,
+                                "%{evex%} vmovd 32(%0), %%xmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovd_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_from_mem) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm4, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,%ebx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2507,6 +2780,57 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm2,%ebx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(evex_vmovd_to_reg,
+                                "%{evex%} vmovd %%xmm2, %%ebx")
+                       :: );
+
+        set_insn(evex_vmovd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd %ebx,%xmm1...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovd_from_reg,
+                                "%{evex%} vmovd %%ebx, %%xmm1")
+                       :: );
+
+        set_insn(evex_vmovd_from_reg);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_from_reg) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm1, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #ifdef __x86_64__
     printf("%-40s", "Testing movq %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
@@ -2584,6 +2908,36 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm11,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm11, %%xmm11\n"
+#if 0 /* This may not work, as the assembler might pick opcode D6. */
+                       put_insn(evex_vmovq_to_mem2,
+                                "{evex} vmovq %%xmm11, 32(%0)")
+#else
+                       put_insn(evex_vmovq_to_mem2,
+                                ".byte 0x62, 0xf1, 0xfd, 0x08, 0x7e, 0x49, 0x04")
+#endif
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovq_to_mem2);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem2) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,%rbx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2643,6 +2997,28 @@ int main(int argc, char **argv)
     }
     else
         printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %xmm22,%rbx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqq %%xmm2, %%xmm2\n\t"
+                       "vmovq %%xmm2, %%xmm22\n"
+                       put_insn(evex_vmovq_to_reg, "vmovq %%xmm22, %%rbx")
+                       :: );
+
+        set_insn(evex_vmovq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_reg) ||
+             regs.rbx + 1 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
 #endif
 
     printf("%-40s", "Testing maskmovq %mm4,%mm4...");
@@ -2812,6 +3188,32 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovntdqa 64(%ecx),%zmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovntdqa);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovntdqa, "vmovntdqa 64(%0), %%zmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovntdqa);
+        memset(res, 0x55, 192);
+        memset(res + 16, 0xff, 64);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovntdqa) )
+            goto fail;
+        asm ( "vpbroadcastd %1, %%zmm2\n\t"
+              "vpcmpeqd %%zmm4, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "0" (~0) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -222,6 +222,7 @@ int emul_test_get_fpu(
         if ( cpu_has_avx )
             break;
     case X86EMUL_FPU_opmask:
+    case X86EMUL_FPU_zmm:
         if ( cpu_has_avx512f )
             break;
     default:
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -132,6 +132,7 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512f   (cp.feat.avx512f  && xcr0_mask(0xe6))
 #define cpu_has_avx512dq  (cp.feat.avx512dq && xcr0_mask(0xe6))
 #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
+#define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
 
 #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -243,9 +243,27 @@ enum simd_opsize {
 };
 typedef uint8_t simd_opsize_t;
 
+enum disp8scale {
+    /* Values 0 ... 4 are explicit sizes. */
+    d8s_bw = 5,
+    d8s_dq,
+    /* EVEX.W ignored outside of 64-bit mode */
+    d8s_dq64,
+    /*
+     * All further values must strictly be last and in the order
+     * given so that arithmetic on the values works.
+     */
+    d8s_vl,
+    d8s_vl_by_2,
+    d8s_vl_by_4,
+    d8s_vl_by_8,
+};
+typedef uint8_t disp8scale_t;
+
 static const struct twobyte_table {
     opcode_desc_t desc;
-    simd_opsize_t size;
+    simd_opsize_t size:4;
+    disp8scale_t d8s:4;
 } twobyte_table[256] = {
     [0x00] = { ModRM },
     [0x01] = { ImplicitOps|ModRM },
@@ -260,8 +278,8 @@ static const struct twobyte_table {
     [0x0d] = { ImplicitOps|ModRM },
     [0x0e] = { ImplicitOps },
     [0x0f] = { ModRM|SrcImmByte },
-    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
-    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -270,10 +288,10 @@ static const struct twobyte_table {
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
-    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
-    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
     [0x30 ... 0x35] = { ImplicitOps },
@@ -292,8 +310,8 @@ static const struct twobyte_table {
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
-    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
+    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -301,8 +319,8 @@ static const struct twobyte_table {
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
-    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x80 ... 0x8f] = { DstImplicit|SrcImm },
     [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
     [0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -344,14 +362,14 @@ static const struct twobyte_table {
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -406,6 +424,7 @@ static const struct ext0f38_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t vsib:1;
+    disp8scale_t d8s:4;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
@@ -418,7 +437,7 @@ static const struct ext0f38_table {
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
-    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
@@ -656,6 +675,22 @@ union evex {
     };
 };
 
+#define EVEX_PFX_BYTES 4
+#define init_evex(stub) ({ \
+    uint8_t *buf_ = get_stub(stub); \
+    buf_[0] = 0x62; \
+    buf_ + EVEX_PFX_BYTES; \
+})
+
+#define copy_EVEX(ptr, evex) ({ \
+    if ( !mode_64bit() ) \
+        (evex).reg |= 8; \
+    (ptr)[1 - EVEX_PFX_BYTES] = (evex).raw[0]; \
+    (ptr)[2 - EVEX_PFX_BYTES] = (evex).raw[1]; \
+    (ptr)[3 - EVEX_PFX_BYTES] = (evex).raw[2]; \
+    container_of((ptr) + 1 - EVEX_PFX_BYTES, typeof(evex), raw[0]); \
+})
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -768,6 +803,7 @@ typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
+    uint64_t __attribute__ ((aligned(64))) zmm[8];
 } mmval_t;
 
 /*
@@ -1201,6 +1237,11 @@ static int _get_fpu(
 
     switch ( type )
     {
+    case X86EMUL_FPU_zmm:
+        if ( !(xcr0 & X86_XCR0_ZMM) || !(xcr0 & X86_XCR0_HI_ZMM) ||
+             !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        /* fall through */
     case X86EMUL_FPU_ymm:
         if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_YMM) )
             return X86EMUL_UNHANDLEABLE;
@@ -1787,6 +1828,7 @@ static bool vcpu_has(
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
+#define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2160,6 +2202,65 @@ static unsigned long *decode_vex_gpr(
     return decode_gpr(regs, ~vex_reg & (mode_64bit() ? 0xf : 7));
 }
 
+static unsigned int decode_disp8scale(enum disp8scale scale,
+                                      const struct x86_emulate_state *state)
+{
+    switch ( scale )
+    {
+    case d8s_bw:
+        return state->evex.w;
+
+    default:
+        if ( scale < d8s_vl )
+            return scale;
+        if ( state->evex.br )
+        {
+    case d8s_dq:
+            return 2 + state->evex.w;
+        }
+        break;
+
+    case d8s_dq64:
+        return 2 + (state->op_bytes == 8);
+    }
+
+    switch ( state->simd_size )
+    {
+    case simd_any_fp:
+    case simd_single_fp:
+        if ( !(state->evex.pfx & VEX_PREFIX_SCALAR_MASK) )
+            break;
+        /* fall through */
+    case simd_scalar_opc:
+    case simd_scalar_vexw:
+        return 2 + state->evex.w;
+
+    case simd_128:
+        /* These should have an explicit size specified. */
+        ASSERT_UNREACHABLE();
+        return 4;
+
+    default:
+        break;
+    }
+
+    return 4 + state->evex.lr - (scale - d8s_vl);
+}
+
+#define avx512_vlen_check(lig) do { \
+    switch ( evex.lr ) \
+    { \
+    default: \
+        generate_exception(EXC_UD); \
+    case 2: \
+        break; \
+    case 0: case 1: \
+        if ( !(lig) ) \
+            host_and_vcpu_must_have(avx512vl); \
+        break; \
+    } \
+} while ( false )
+
 static bool is_aligned(enum x86_segment seg, unsigned long offs,
                        unsigned int size, struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops)
@@ -2406,6 +2507,7 @@ x86_decode_twobyte(
         if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
         {
     case X86EMUL_OPC_VEX_F3(0, 0x7e):  /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
            state->desc = DstImplicit | SrcMem | TwoOp;
            state->simd_size = simd_other;
            /* Avoid the state->desc clobbering of TwoOp below. */
@@ -2476,7 +2578,7 @@ x86_decode_twobyte(
     }
 
     /*
-     * Scalar forms of most VEX-encoded TwoOp instructions have
+     * Scalar forms of most VEX-/EVEX-encoded TwoOp instructions have
      * three operands. Those which do really have two operands
      * should have exited earlier.
     */
@@ -2841,6 +2943,8 @@ x86_decode(
 
     if ( d & ModRM )
     {
+        unsigned int disp8scale = 0;
+
         d &= ~ModRM;
 #undef ModRM /* Only its aliases are valid to use from here on. */
         modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
@@ -2883,6 +2987,9 @@ x86_decode(
             break;
 
         case ext_0f:
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(twobyte_table[b].d8s, state);
+
             switch ( b )
             {
             case 0x20: /* mov cr,reg */
@@ -2896,6 +3003,11 @@ x86_decode(
                  */
                 modrm_mod = 3;
                 break;
+
+            case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
+                if ( disp8scale == 2 && evex.pfx == vex_f3 )
+                    disp8scale = 3;
+                break;
             }
             break;
 
@@ -2907,6 +3019,8 @@ x86_decode(
             if ( ext0f38_table[b].vsib )
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;

        case ext_8f09:
@@ -2975,7 +3089,7 @@ x86_decode(
             ea.mem.off = insn_fetch_type(int16_t);
             break;
         case 1:
-            ea.mem.off += insn_fetch_type(int8_t);
+            ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
             break;
         case 2:
             ea.mem.off += insn_fetch_type(int16_t);
@@ -3034,7 +3148,7 @@ x86_decode(
             pc_rel = mode_64bit();
             break;
         case 1:
-            ea.mem.off += insn_fetch_type(int8_t);
+            ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
             break;
         case 2:
             ea.mem.off += insn_fetch_type(int32_t);
@@ -3235,10 +3349,11 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc, cr4_rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0, insn_bytes = 0;
+    unsigned int first_byte = 0, elem_bytes, insn_bytes = 0;
+    uint64_t op_mask = ~0ULL;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
                       !is_branch_step(ctxt, ops);
-    bool sfence = false;
+    bool sfence = false, fault_suppression = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4 = 0;
@@ -3286,6 +3401,7 @@ x86_emulate(
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
+    elem_bytes = 4 << evex.w;
 
     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
 
@@ -3360,6 +3476,28 @@ x86_emulate(
         break;
     }
 
+    /* With a memory operand, fetch the mask register in use (if any). */
+    if ( ea.type == OP_MEM && evex.opmsk )
+    {
+        uint8_t *stb = get_stub(stub);
+
+        /* KMOV{W,Q} %k<n>, (%rax) */
+        stb[0] = 0xc4;
+        stb[1] = 0xe1;
+        stb[2] = cpu_has_avx512bw ? 0xf8 : 0x78;
+        stb[3] = 0x91;
+        stb[4] = evex.opmsk << 3;
+        insn_bytes = 5;
+        stb[5] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+
+        insn_bytes = 0;
+        put_stub(stub);
+
+        fault_suppression = true;
+    }
+
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
     {
@@ -5716,6 +5854,41 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 2;
         break;

+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2b): /* vmovntp{s,d} [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk, EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x10): /* vmovup{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x11): /* vmovup{s,d} [xyz]mm,[xyz]mm/mem{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x28): /* vmovap{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x29): /* vmovap{s,d} [xyz]mm,[xyz]mm/mem{k} */
+        /* vmovs{s,d} to/from memory have only two operands. */
+        if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
+            d |= TwoOp;
+        generate_exception_if(evex.br, EXC_UD);
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+    simd_zmm:
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        break;
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0x13):     /* movlp{s,d} xmm,m64 */
@@ -6355,6 +6528,41 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+        generate_exception_if((evex.lr || evex.opmsk || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0x38;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+        dst.val = src.val;
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        d |= TwoOp;
+        op_bytes = 8;
+        goto simd_zmm;
+
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -6375,6 +6583,30 @@ x86_emulate(
             goto simd_0f_avx;
         goto simd_0f_sse2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe7): /* vmovntdq [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6f): /* vmovdqa{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x6f): /* vmovdqu{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7f): /* vmovdqa{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    vmovdqa:
+        generate_exception_if(evex.br, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512bw);
+        elem_bytes = 1 << evex.w;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
         generate_exception_if(vex.l, EXC_UD);
         d |= TwoOp;
@@ -7739,6 +7971,15 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2a): /* vmovntdqa mem,[xyz]mm */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        /* Ignore the non-temporal hint for now, using vmovdqa32 instead. */
+        asm volatile ( "mfence" ::: "memory" );
+        b = 0x6f;
+        evex.opcx = vex_0f;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
@@ -8792,17 +9033,27 @@ x86_emulate(
     else if ( state->simd_size )
     {
         generate_exception_if(!op_bytes, EXC_UD);
-        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+        generate_exception_if((vex.opcx && (d & TwoOp) &&
+                               (vex.reg != 0xf || (evex_encoded() && !evex.RX))),
                               EXC_UD);
 
         if ( !opc )
             BUG();
-        opc[insn_bytes - PFX_BYTES] = 0xc3;
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+        {
+            opc[insn_bytes - EVEX_PFX_BYTES] = 0xc3;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            opc[insn_bytes - PFX_BYTES] = 0xc3;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
 
         if ( ea.type == OP_MEM )
         {
             uint32_t mxcsr = 0;
+            uint64_t full = 0;
 
             if ( op_bytes < 16 ||
                  (vex.opcx
@@ -8824,6 +9075,45 @@ x86_emulate(
                  !is_aligned(ea.mem.seg, ea.mem.off, op_bytes, ctxt, ops),
                  EXC_GP, 0);
+
+            IMPOSSIBLE(elem_bytes <= 0);
+            if ( evex.br )
+            {
+                ASSERT((d & DstMask) != DstMem);
+                op_bytes = elem_bytes;
+            }
+            if ( evex.opmsk )
+            {
+                ASSERT(!(op_bytes % elem_bytes));
+                full = ~0ULL >> (64 - op_bytes / elem_bytes);
+                op_mask &= full;
+            }
+            if ( fault_suppression )
+            {
+                if ( !op_mask )
+                    goto simd_no_mem;
+                if ( !evex.br )
+                {
+                    first_byte = __builtin_ctzll(op_mask);
+                    op_mask >>= first_byte;
+                    full >>= first_byte;
+                    first_byte *= elem_bytes;
+                    op_bytes = (64 - __builtin_clzll(op_mask)) * elem_bytes;
+                }
+            }
+            /*
+             * Independent of fault suppression we may need to read (parts of)
+             * the memory operand for the purpose of merging without splitting
+             * the write below into multiple ones. Note that the EVEX.Z check
+             * here isn't strictly needed, due to there not currently being
+             * any instructions allowing zeroing-merging on memory writes (and
+             * we raise #UD during DstMem processing far above in this case),
+             * yet conceptually the read is then unnecessary.
+             */
+            if ( evex.opmsk && !evex.z && (d & DstMask) == DstMem &&
+                 op_mask != full )
+                d = (d & ~SrcMask) | SrcMem;
+
             switch ( d & SrcMask )
             {
             case SrcMem:
@@ -8865,7 +9155,10 @@ x86_emulate(
             }
         }
         else
+        {
+        simd_no_mem:
             dst.type = OP_NONE;
+        }
 
         /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -171,6 +171,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_xmm,    /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm,    /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
     X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
+    X86EMUL_FPU_zmm,    /* AVX512 instruction set (%zmm0-%zmm7/31) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -105,6 +105,7 @@
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
+#define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
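[Editorial note, not part of the patch: a minimal sketch of how the
fault-suppression path narrows a masked memory access to the bytes that
enabled mask bits actually cover; the helper name is made up, and it
mirrors the first_byte/op_bytes computation added near the end of
x86_emulate(). The caller must guarantee op_mask is non-zero.]

    #include <stdint.h>

    /* Narrow an access to [first_byte, first_byte + op_bytes), spanning the
     * lowest through highest enabled elements of op_mask.
     */
    static void narrow_masked_access(uint64_t op_mask, unsigned int elem_bytes,
                                     unsigned int *first_byte,
                                     unsigned int *op_bytes)
    {
        unsigned int first_elem = __builtin_ctzll(op_mask);

        op_mask >>= first_elem;
        *first_byte = first_elem * elem_bytes;
        *op_bytes = (64 - __builtin_clzll(op_mask)) * elem_bytes;
    }

    /* E.g. vmovdqu32 with a %k mask of 0x0050 (elements 4 and 6 of 16)
     * touches only bytes 16..27: first_byte = 16, op_bytes = 12.
     */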
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel