x86: AVX instruction emulation fixes - we used the C4/C5 (first prefix) byte instead of the apparent ModR/M one as the second prefix byte - early decoding normalized vex.reg, thus corrupting it for the main consumer (copy_REX_VEX()), resulting in #UD on the two-operand instructions we emulate Also add respective test cases to the testing utility plus - fix get_fpu() (the fall-through order was inverted) - add cpu_has_avx2, even if it's currently unused (as in the new test cases I decided to refrain from using AVX2 instructions in order to be able to actually run all the tests on the hardware I have) - slightly tweak cpu_has_avx to more consistently express the outputs we don't care about (sinking them all into the same variable) Signed-off-by: Jan Beulich --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -94,13 +94,25 @@ static inline uint64_t xgetbv(uint32_t x } #define cpu_has_avx ({ \ - unsigned int eax = 1, ecx = 0, edx; \ - cpuid(&eax, &edx, &ecx, &edx, NULL); \ + unsigned int eax = 1, ecx = 0; \ + cpuid(&eax, &eax, &ecx, &eax, NULL); \ if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \ ecx = 0; \ (ecx & (1U << 28)) != 0; \ }) +#define cpu_has_avx2 ({ \ + unsigned int eax = 1, ebx, ecx = 0; \ + cpuid(&eax, &ebx, &ecx, &eax, NULL); \ + if ( !(ecx & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \ + ebx = 0; \ + else { \ + eax = 7, ecx = 0; \ + cpuid(&eax, &ebx, &ecx, &eax, NULL); \ + } \ + (ebx & (1U << 5)) != 0; \ +}) + int get_fpu( void (*exception_callback)(void *, struct cpu_user_regs *), void *exception_callback_arg, @@ -111,14 +123,14 @@ int get_fpu( { case X86EMUL_FPU_fpu: break; - case X86EMUL_FPU_ymm: - if ( cpu_has_avx ) + case X86EMUL_FPU_mmx: + if ( cpu_has_mmx ) break; case X86EMUL_FPU_xmm: if ( cpu_has_sse ) break; - case X86EMUL_FPU_mmx: - if ( cpu_has_mmx ) + case X86EMUL_FPU_ymm: + if ( cpu_has_avx ) break; default: return X86EMUL_UNHANDLEABLE; @@ -629,6 +641,73 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing vmovdqu %ymm2,(%ecx)..."); + if ( stack_exec && cpu_has_avx ) + { + extern const unsigned char vmovdqu_to_mem[]; + + asm volatile ( "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n" + ".pushsection .test, \"a\", @progbits\n" + "vmovdqu_to_mem: vmovdqu %%ymm2, (%0)\n" + ".popsection" :: "c" (NULL) ); + + memcpy(instr, vmovdqu_to_mem, 15); + memset(res, 0x55, 128); + memset(res + 16, 0xff, 16); + memset(res + 20, 0x00, 16); + regs.eip = (unsigned long)&instr[0]; + regs.ecx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 16, 64) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing vmovdqu (%edx),%ymm4..."); + if ( stack_exec && cpu_has_avx ) + { + extern const unsigned char vmovdqu_from_mem[]; + +#if 0 /* Don't use AVX2 instructions for now */ + asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n" +#else + asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t" + "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n" +#endif + ".pushsection .test, \"a\", @progbits\n" + "vmovdqu_from_mem: vmovdqu (%0), %%ymm4\n" + ".popsection" :: "d" (NULL) ); + + memcpy(instr, vmovdqu_from_mem, 15); + memset(res + 4, 0xff, 16); + regs.eip = (unsigned long)&instr[0]; + regs.ecx = 0; + regs.edx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY ) + goto fail; +#if 0 /* Don't use AVX2 instructions for now */ + asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t" + "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t" + "vpmovmskb %%ymm1, %0" : "=r" (rc) ); +#else + asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t" + "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t" + "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t" + "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t" + "vpmovmskb %%xmm0, %0\n\t" + "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) ); + rc |= i << 16; +#endif + if ( rc != 0xffffffff ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing movsd %xmm5,(%ecx)..."); memset(res, 0x77, 64); memset(res + 10, 0x66, 8); @@ -683,6 +762,59 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing vmovsd %xmm5,(%ecx)..."); + memset(res, 0x88, 64); + memset(res + 10, 0x77, 8); + if ( stack_exec && cpu_has_avx ) + { + extern const unsigned char vmovsd_to_mem[]; + + asm volatile ( "vbroadcastsd %0, %%ymm5\n" + ".pushsection .test, \"a\", @progbits\n" + "vmovsd_to_mem: vmovsd %%xmm5, (%1)\n" + ".popsection" :: "m" (res[10]), "c" (NULL) ); + + memcpy(instr, vmovsd_to_mem, 15); + regs.eip = (unsigned long)&instr[0]; + regs.ecx = (unsigned long)(res + 2); + regs.edx = 0; + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || memcmp(res, res + 8, 32) ) + goto fail; + printf("okay\n"); + } + else + { + printf("skipped\n"); + memset(res + 2, 0x77, 8); + } + + printf("%-40s", "Testing vmovaps (%edx),%ymm7..."); + if ( stack_exec && cpu_has_avx ) + { + extern const unsigned char vmovaps_from_mem[]; + + asm volatile ( "vxorps %%ymm7, %%ymm7, %%ymm7\n" + ".pushsection .test, \"a\", @progbits\n" + "vmovaps_from_mem: vmovaps (%0), %%ymm7\n" + ".popsection" :: "d" (NULL) ); + + memcpy(instr, vmovaps_from_mem, 15); + regs.eip = (unsigned long)&instr[0]; + regs.ecx = 0; + regs.edx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY ) + goto fail; + asm ( "vcmpeqps %1, %%ymm7, %%ymm0\n\t" + "vmovmskps %%ymm0, %0" : "=r" (rc) : "m" (res[8]) ); + if ( rc != 0xff ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + for ( j = 1; j <= 2; j++ ) { #if defined(__i386__) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -1454,10 +1454,10 @@ x86_emulate( /* VEX */ generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1); - vex.raw[0] = b; + vex.raw[0] = modrm; if ( b & 1 ) { - vex.raw[1] = b; + vex.raw[1] = modrm; vex.opcx = vex_0f; vex.x = 1; vex.b = 1; @@ -1479,10 +1479,7 @@ x86_emulate( } } } - vex.reg ^= 0xf; - if ( !mode_64bit() ) - vex.reg &= 0x7; - else if ( !vex.r ) + if ( mode_64bit() && !vex.r ) rex_prefix |= REX_R; fail_if(vex.opcx != vex_0f); @@ -3899,8 +3896,9 @@ x86_emulate( else { fail_if((vex.opcx != vex_0f) || - (vex.reg && ((ea.type == OP_MEM) || - !(vex.pfx & VEX_PREFIX_SCALAR_MASK)))); + ((vex.reg != 0xf) && + ((ea.type == OP_MEM) || + !(vex.pfx & VEX_PREFIX_SCALAR_MASK)))); vcpu_must_have_avx(); get_fpu(X86EMUL_FPU_ymm, &fic); ea.bytes = 16 << vex.l; @@ -4168,7 +4166,7 @@ x86_emulate( } else { - fail_if((vex.opcx != vex_0f) || vex.reg || + fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) || ((vex.pfx != vex_66) && (vex.pfx != vex_f3))); vcpu_must_have_avx(); get_fpu(X86EMUL_FPU_ymm, &fic);