x86emul: support SSE4.2 insns ... and their AVX equivalents. Signed-off-by: Jan Beulich --- v3: New. --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -2542,6 +2542,149 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing pcmpestri $0x1a,(%ecx),%xmm2..."); + if ( stack_exec && cpu_has_sse4_2 ) + { + decl_insn(pcmpestri); + + memcpy(res, "abcdefgh\0\1\2\3\4\5\6\7", 16); + asm volatile ( "movq %0, %%xmm2\n" + put_insn(pcmpestri, "pcmpestri $0b00011010, (%1), %%xmm2") + :: "m" (res[0]), "c" (NULL) ); + + set_insn(pcmpestri); + regs.eax = regs.edx = 12; + regs.ecx = (unsigned long)res; + regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_IF | X86_EFLAGS_OF; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(pcmpestri) || + regs.ecx != 9 || + (regs.eflags & X86_EFLAGS_ARITH_MASK) != + (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing pcmpestrm $0x5a,(%ecx),%xmm2..."); + if ( stack_exec && cpu_has_sse4_2 ) + { + decl_insn(pcmpestrm); + + asm volatile ( "movq %0, %%xmm2\n" + put_insn(pcmpestrm, "pcmpestrm $0b01011010, (%1), %%xmm2") + :: "m" (res[0]), "c" (NULL) ); + + set_insn(pcmpestrm); + regs.ecx = (unsigned long)res; + regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_IF | X86_EFLAGS_OF; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(pcmpestrm) ) + goto fail; + asm ( "pmovmskb %%xmm0, %0" : "=r" (rc) ); + if ( rc != 0x0e00 || + (regs.eflags & X86_EFLAGS_ARITH_MASK) != + (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing pcmpistri $0x1a,(%ecx),%xmm2..."); + if ( stack_exec && cpu_has_sse4_2 ) + { + decl_insn(pcmpistri); + + asm volatile ( "movq %0, %%xmm2\n" + put_insn(pcmpistri, "pcmpistri $0b00011010, (%1), %%xmm2") + 
:: "m" (res[0]), "c" (NULL) ); + + set_insn(pcmpistri); + regs.eflags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_IF | X86_EFLAGS_OF; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(pcmpistri) || + regs.ecx != 16 || + (regs.eflags & X86_EFLAGS_ARITH_MASK) != + (X86_EFLAGS_ZF | X86_EFLAGS_SF) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing pcmpistrm $0x4a,(%ecx),%xmm2..."); + if ( stack_exec && cpu_has_sse4_2 ) + { + decl_insn(pcmpistrm); + + asm volatile ( "movq %0, %%xmm2\n" + put_insn(pcmpistrm, "pcmpistrm $0b01001010, (%1), %%xmm2") + :: "m" (res[0]), "c" (NULL) ); + + set_insn(pcmpistrm); + regs.ecx = (unsigned long)res; + regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_IF; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(pcmpistrm) ) + goto fail; + asm ( "pmovmskb %%xmm0, %0" : "=r" (rc) ); + if ( rc != 0xffff || + (regs.eflags & X86_EFLAGS_ARITH_MASK) != + (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing vpcmpestri $0x7a,(%esi),%xmm2..."); + if ( stack_exec && cpu_has_avx ) + { + decl_insn(vpcmpestri); + +#ifdef __x86_64__ + /* + * gas up to at least 2.27 doesn't honor explicit "rex.w" for + * VEX/EVEX encoded instructions, and also doesn't provide any + * other means to control VEX.W. 
+ */ + asm volatile ( "movq %0, %%xmm2\n" + put_insn(vpcmpestri, + ".byte 0xC4, 0xE3, 0xF9, 0x61, 0x16, 0x7A") + :: "m" (res[0]) ); +#else + asm volatile ( "movq %0, %%xmm2\n" + put_insn(vpcmpestri, + "vpcmpestri $0b01111010, (%1), %%xmm2") + :: "m" (res[0]), "S" (NULL) ); +#endif + + set_insn(vpcmpestri); +#ifdef __x86_64__ + regs.rax = ~0U + 1UL; + regs.rcx = ~0UL; +#else + regs.eax = 0x7fffffff; +#endif + regs.esi = (unsigned long)res; + regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_IF | X86_EFLAGS_OF; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(vpcmpestri) || + regs.ecx != 11 || + (regs.eflags & X86_EFLAGS_ARITH_MASK) != + (X86_EFLAGS_ZF | X86_EFLAGS_CF) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing stmxcsr (%edx)..."); if ( cpu_has_sse ) { --- a/tools/tests/x86_emulator/x86_emulate.h +++ b/tools/tests/x86_emulator/x86_emulate.h @@ -100,6 +100,12 @@ static inline uint64_t xgetbv(uint32_t x (res.c & (1U << 19)) != 0; \ }) +#define cpu_has_sse4_2 ({ \ + struct cpuid_leaf res; \ + emul_test_cpuid(1, 0, &res, NULL); \ + (res.c & (1U << 20)) != 0; \ +}) + #define cpu_has_popcnt ({ \ struct cpuid_leaf res; \ emul_test_cpuid(1, 0, &res, NULL); \ --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -359,7 +359,7 @@ static const struct { [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x2b] = { .simd_size = simd_packed_int }, [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 }, - [0x38 ... 0x3f] = { .simd_size = simd_packed_int }, + [0x37 ... 0x3f] = { .simd_size = simd_packed_int }, [0x40] = { .simd_size = simd_packed_int }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xf0] = { .two_op = 1 }, @@ -389,6 +389,7 @@ static const struct { [0x42] = { .simd_size = simd_packed_int }, [0x4a ... 
0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xf0] = {}, }; @@ -5991,6 +5992,7 @@ x86_emulate( case X86EMUL_OPC_VEX_66(0x0f38, 0x28): /* vpmuldq {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x29): /* vpcmpeqq {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x2b): /* vpackusdw {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x37): /* vpcmpgtq {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x38): /* vpminsb {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x39): /* vpminsd {x,y}mm/mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x3a): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */ @@ -7138,6 +7140,10 @@ x86_emulate( } goto movdqa; + case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */ + host_and_vcpu_must_have(sse4_2); + goto simd_0f38_common; + case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */ case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */ vcpu_must_have(movbe); @@ -7426,6 +7432,63 @@ x86_emulate( generate_exception_if(vex.w, EXC_UD); goto simd_0f_int_imm8; + case X86EMUL_OPC_66(0x0f3a, 0x60): /* pcmpestrm $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x60): /* vpcmpestrm $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_66(0x0f3a, 0x61): /* pcmpestri $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x61): /* vpcmpestri $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_66(0x0f3a, 0x62): /* pcmpistrm $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x62): /* vpcmpistrm $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_66(0x0f3a, 0x63): /* pcmpistri $imm8,xmm/m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f3a, 0x63): /* vpcmpistri $imm8,xmm/m128,xmm */ + if ( vex.opcx == vex_none ) + { + host_and_vcpu_must_have(sse4_2); + get_fpu(X86EMUL_FPU_xmm, &fic); + } + else + { + generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD); + host_and_vcpu_must_have(avx); 
+ get_fpu(X86EMUL_FPU_ymm, &fic); + } + + opc = init_prefixes(stub); + if ( vex.opcx == vex_none ) + opc[0] = 0x3a; + opc[vex.opcx == vex_none] = b; + opc[1 + (vex.opcx == vex_none)] = modrm; + if ( ea.type == OP_MEM ) + { + /* Convert memory operand to (%rDI). */ + rex_prefix &= ~REX_B; + vex.b = 1; + opc[1 + (vex.opcx == vex_none)] &= 0x3f; + opc[1 + (vex.opcx == vex_none)] |= 0x07; + + rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 16, ctxt); + if ( rc != X86EMUL_OKAY ) + goto done; + } + opc[2 + (vex.opcx == vex_none)] = imm1; + fic.insn_bytes = PFX_BYTES + 3 + (vex.opcx == vex_none); + opc[3 + (vex.opcx == vex_none)] = 0xc3; + + copy_REX_VEX(opc, rex_prefix, vex); +#ifdef __x86_64__ + if ( rex_prefix & REX_W ) + emulate_stub("=c" (dst.val), "m" (*mmvalp), "D" (mmvalp), + "a" (_regs.rax), "d" (_regs.rdx)); + else +#endif + emulate_stub("=c" (dst.val), "m" (*mmvalp), "D" (mmvalp), + "a" (_regs._eax), "d" (_regs._edx)); + + state->simd_size = simd_none; + if ( b & 1 ) + _regs.r(cx) = (uint32_t)dst.val; + dst.type = OP_NONE; + break; + case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */ vcpu_must_have(bmi2); generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);