x86emul: support {,V}MOVNTDQA ... as the only post-SSE2 move insn. Signed-off-by: Jan Beulich --- v3: Re-base. v2: Re-base. --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -2380,6 +2380,74 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing movntdqa 16(%edx),%xmm4..."); + if ( stack_exec && cpu_has_sse4_1 ) + { + decl_insn(movntdqa); + + asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n" + put_insn(movntdqa, "movntdqa 16(%0), %%xmm4") + :: "d" (NULL) ); + + set_insn(movntdqa); + memset(res, 0x55, 64); + memset(res + 4, 0xff, 16); + regs.edx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(movntdqa) ) + goto fail; + asm ( "pcmpeqb %%xmm2, %%xmm2\n\t" + "pcmpeqb %%xmm4, %%xmm2\n\t" + "pmovmskb %%xmm2, %0" : "=r" (rc) ); + if ( rc != 0xffff ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing vmovntdqa (%ecx),%ymm4..."); + if ( stack_exec && cpu_has_avx2 ) + { + decl_insn(vmovntdqa); + +#if 0 /* Don't use AVX2 instructions for now */ + asm volatile ( "vpxor %%ymm4, %%ymm4, %%ymm4\n" + put_insn(vmovntdqa, "vmovntdqa (%0), %%ymm4") + :: "c" (NULL) ); +#else + asm volatile ( "vpxor %xmm4, %xmm4, %xmm4\n" + put_insn(vmovntdqa, + ".byte 0xc4, 0xe2, 0x7d, 0x2a, 0x21") ); +#endif + + set_insn(vmovntdqa); + memset(res, 0x55, 96); + memset(res + 8, 0xff, 32); + regs.ecx = (unsigned long)(res + 8); + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vmovntdqa) ) + goto fail; +#if 0 /* Don't use AVX2 instructions for now */ + asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t" + "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t" + "vpmovmskb %%ymm0, %0" : "=r" (rc) ); +#else + asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t" + "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t" + "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t" + "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t" + "vpmovmskb %%xmm0, %0\n\t" + "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) ); + rc |= i << 16; +#endif + if ( ~rc ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing stmxcsr (%edx)..."); if ( cpu_has_sse ) { --- a/tools/tests/x86_emulator/x86_emulate.h +++ b/tools/tests/x86_emulator/x86_emulate.h @@ -94,6 +94,12 @@ static inline uint64_t xgetbv(uint32_t x (res.c & (1U << 0)) != 0; \ }) +#define cpu_has_sse4_1 ({ \ + struct cpuid_leaf res; \ + emul_test_cpuid(1, 0, &res, NULL); \ + (res.c & (1U << 19)) != 0; \ +}) + #define cpu_has_popcnt ({ \ struct cpuid_leaf res; \ emul_test_cpuid(1, 0, &res, NULL); \ --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -1393,6 +1393,7 @@ static bool vcpu_has( #define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops) #define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops) #define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops) +#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops) #define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops) #define vcpu_has_movbe() vcpu_has( 1, ECX, 22, ctxt, ops) #define vcpu_has_popcnt() vcpu_has( 1, ECX, 23, ctxt, ops) @@ -5912,6 +5913,7 @@ x86_emulate( case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/m128 */ case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */ case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem */ + movdqa: d |= TwoOp; op_bytes = 16 << vex.l; if ( vex.opcx != vex_none ) @@ -6806,6 +6808,23 @@ x86_emulate( sfence = true; break; + case X86EMUL_OPC_66(0x0f38, 0x2a): /* movntdqa m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */ + generate_exception_if(ea.type != OP_MEM, EXC_UD); + /* Ignore the non-temporal hint for now, using movdqa instead. */ + asm volatile ( "mfence" ::: "memory" ); + b = 0x6f; + if ( vex.opcx == vex_none ) + vcpu_must_have(sse4_1); + else + { + vex.opcx = vex_0f; + if ( vex.l ) + vcpu_must_have(avx2); + } + state->simd_size = simd_packed_int; + goto movdqa; + case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */ case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */ vcpu_must_have(movbe);