x86emul: support {,V}LDDQU

Also take the opportunity to adjust the vmovdqu test case the new one
here has been cloned from: to zero a ymm register we don't need to jump
through hoops, as 128-bit AVX insns zero the upper portion of the
destination register. Also fix the disabled AVX2 code, which used the
wrong YMM register.

Signed-off-by: Jan Beulich

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -968,12 +968,7 @@ int main(int argc, char **argv)
     {
         decl_insn(vmovdqu_from_mem);
 
-#if 0 /* Don't use AVX2 instructions for now */
-        asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
-#else
-        asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
-                       "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
-#endif
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
                        put_insn(vmovdqu_from_mem, "vmovdqu (%0), %%ymm4")
                        :: "d" (NULL) );
 
@@ -987,7 +982,7 @@ int main(int argc, char **argv)
 #if 0 /* Don't use AVX2 instructions for now */
         asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
               "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
 #else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
@@ -1404,6 +1399,67 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
+    if ( stack_exec && cpu_has_sse3 )
+    {
+        decl_insn(lddqu);
+
+        asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+                       put_insn(lddqu, "lddqu 4(%0), %%xmm4")
+                       :: "d" (NULL) );
+
+        set_insn(lddqu);
+        memset(res, 0x55, 64);
+        memset(res + 1, 0xff, 16);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(lddqu) )
+            goto fail;
+        asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+              "pcmpeqb %%xmm4, %%xmm2\n\t"
+              "pmovmskb %%xmm2, %0" : "=r" (rc) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vlddqu (%ecx),%ymm4...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vlddqu);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(vlddqu, "vlddqu (%0), %%ymm4")
+                       :: "c" (NULL) );
+
+        set_insn(vlddqu);
+        memset(res + 1, 0xff, 32);
+        regs.ecx = (unsigned long)(res + 1);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
+            goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+        asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+              "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+              "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+              "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+              "vpmovmskb %%xmm0, %0\n\t"
+              "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+        rc |= i << 16;
+#endif
+        if ( ~rc )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -71,6 +71,12 @@ static int cpuid(
     (edx & (1U << 26)) != 0; \
 })
 
+#define cpu_has_sse3 ({ \
+    unsigned int eax = 1, ecx = 0; \
+    emul_test_cpuid(&eax, &eax, &ecx, &eax, NULL); \
+    (ecx & (1U << 0)) != 0; \
+})
+
 #define cpu_has_xsave ({ \
     unsigned int eax = 1, ecx = 0; \
     emul_test_cpuid(&eax, &eax, &ecx, &eax, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4993,6 +4993,9 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
                                          /* vmovntdq ymm,m256 */
+    case X86EMUL_OPC_F2(0x0f, 0xf0):     /* lddqu xmm,m128 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu xmm,m128 */
+                                         /* vlddqu ymm,m256 */
         fail_if(ea.type != OP_MEM);
         /* fall through */
     case X86EMUL_OPC(0x0f, 0x6e):        /* movd r/m32,mm */
@@ -5040,6 +5043,11 @@ x86_emulate(
         {
             switch ( vex.pfx )
             {
+            case vex_f2:
+                /* Converting lddqu to movdqa (see also below). */
+                vcpu_must_have(sse3);
+                buf[3] = 0x6f;
+                /* fall through */
             case vex_66:
             case vex_f3:
                 host_and_vcpu_must_have(sse2);
@@ -5056,8 +5064,6 @@ x86_emulate(
                 get_fpu(X86EMUL_FPU_mmx, &fic);
                 ea.bytes = 8;
                 break;
-            default:
-                goto cannot_emulate;
             }
         }
         else
@@ -5079,6 +5085,7 @@ x86_emulate(
             ea.bytes = 8;
             /* fall through */
         case 0x6f:
+        case 0xf0:
             load = true;
         }
         break;
@@ -5094,7 +5101,7 @@ x86_emulate(
     {
         uint32_t mxcsr = 0;
 
-        if ( ea.bytes < 16 || vex.pfx == vex_f3 )
+        if ( ea.bytes < 16 || vex.pfx >= vex_f3 )
             mxcsr = MXCSR_MM;
         else if ( vcpu_has_misalignsse() )
             asm ( "stmxcsr %0" : "=m" (mxcsr) );
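
For reference, the architectural behavior the vmovdqu test adjustment
relies on: any VEX.128-encoded instruction zeroes bits 255:128 of its
destination YMM register, so a single vpxor suffices where the old code
combined vpcmpgtb with vinsertf128. A minimal standalone check of this
(not part of the patch; assumes an AVX-capable host and a GNU
toolchain):

    #include <stdio.h>

    int main(void)
    {
        unsigned char buf[32];
        unsigned int i;

        /* Set all 256 bits of %ymm4, then clear it with a 128-bit AVX insn. */
        asm volatile ( "vpcmpeqb %%xmm4, %%xmm4, %%xmm4\n\t"
                       "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n\t" /* all ones */
                       "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" /* VEX.128 zeroes 255:128 too */
                       "vmovdqu %%ymm4, %0" : "=m" (buf) :: "xmm4" );

        for ( i = 0; i < 32; ++i )
            if ( buf[i] )
            {
                printf("byte %u not zeroed\n", i);
                return 1;
            }
        printf("ymm4 fully zeroed by 128-bit vpxor\n");
        return 0;
    }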
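
On the emulator-side conversion: lddqu/vlddqu produce the same load
result as movdqu/vmovdqu, differing only in that no fault is raised for
a misaligned memory operand. Since the emulator fetches the guest
memory operand through its hooks before the stub executes, rewriting
the opcode byte in the stub buffer (buf[3] = 0x6f above) loses nothing.
A small standalone illustration of the alignment tolerance, mirroring
the new test case (not part of the patch; assumes an SSE3-capable
host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint8_t mem[64], out[16];
        unsigned int i;

        memset(mem, 0x55, sizeof(mem));
        memset(mem + 1, 0xff, 16);      /* 16-byte field at an odd address */

        /* lddqu tolerates the misalignment just as movdqu would. */
        asm ( "lddqu 1(%1), %%xmm0\n\t"
              "movdqu %%xmm0, %0"
              : "=m" (out) : "r" (mem) : "xmm0" );

        for ( i = 0; i < 16; ++i )
            if ( out[i] != 0xff )
            {
                printf("byte %u mismatch\n", i);
                return 1;
            }
        printf("unaligned lddqu loaded all 16 bytes\n");
        return 0;
    }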