x86emul: support FMA insns Signed-off-by: Jan Beulich --- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -12,7 +12,7 @@ run: $(TARGET) ./$(TARGET) SIMD := sse sse2 sse4 avx -FMA := fma4 +FMA := fma4 fma TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA) blowfish-cflags := "" @@ -33,6 +33,9 @@ avx-flts := 4 8 fma4-vecs := $(avx-vecs) fma4-ints := fma4-flts := $(avx-flts) +fma-vecs := $(avx-vecs) +fma-ints := +fma-flts := $(avx-flts) # When converting SSE to AVX, have the compiler avoid XMM0 to widen # coverage of the VEX.vvvv checks in the emulator. We must not do this, --- a/tools/tests/x86_emulator/simd-fma.c +++ b/tools/tests/x86_emulator/simd-fma.c @@ -21,24 +21,24 @@ ENTRY(fma_test); #if VEC_SIZE == 16 # if FLOAT_SIZE == 4 # define addsub(x, y) __builtin_ia32_addsubps(x, y) -# if defined(__FMA4__) +# if defined(__FMA4__) || defined(__FMA__) # define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z) # endif # elif FLOAT_SIZE == 8 # define addsub(x, y) __builtin_ia32_addsubpd(x, y) -# if defined(__FMA4__) +# if defined(__FMA4__) || defined(__FMA__) # define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z) # endif # endif #elif VEC_SIZE == 32 # if FLOAT_SIZE == 4 # define addsub(x, y) __builtin_ia32_addsubps256(x, y) -# if defined(__FMA4__) +# if defined(__FMA4__) || defined(__FMA__) # define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z) # endif # elif FLOAT_SIZE == 8 # define addsub(x, y) __builtin_ia32_addsubpd256(x, y) -# if defined(__FMA4__) +# if defined(__FMA4__) || defined(__FMA__) # define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z) # endif # endif --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -12,6 +12,7 @@ #include "sse4-avx.h" #include "avx.h" #include "fma4.h" +#include "fma.h" #define verbose false /* Switch to true for far more logging. 
*/ @@ -53,6 +54,11 @@ static bool simd_check_fma4(void) return cpu_has_fma4; } +static bool simd_check_fma(void) +{ + return cpu_has_fma; +} + static void simd_set_regs(struct cpu_user_regs *regs) { if ( cpu_has_mmx ) @@ -155,6 +161,12 @@ static const struct { SIMD(FMA4 scalar double, fma4, f8), SIMD(FMA4 128bit double, fma4, 16f8), SIMD(FMA4 256bit double, fma4, 32f8), + SIMD(FMA scalar single, fma, f4), + SIMD(FMA 128bit single, fma, 16f4), + SIMD(FMA 256bit single, fma, 32f4), + SIMD(FMA scalar double, fma, f8), + SIMD(FMA 128bit double, fma, 16f8), + SIMD(FMA 256bit double, fma, 32f8), #undef SIMD_ #undef SIMD }; --- a/tools/tests/x86_emulator/x86_emulate.h +++ b/tools/tests/x86_emulator/x86_emulate.h @@ -94,6 +94,14 @@ static inline uint64_t xgetbv(uint32_t x (res.c & (1U << 0)) != 0; \ }) +#define cpu_has_fma ({ \ + struct cpuid_leaf res; \ + emul_test_cpuid(1, 0, &res, NULL); \ + if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \ + res.c = 0; \ + (res.c & (1U << 12)) != 0; \ +}) + #define cpu_has_sse4_1 ({ \ struct cpuid_leaf res; \ emul_test_cpuid(1, 0, &res, NULL); \ --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -385,6 +385,9 @@ static const struct { [0x37 ... 0x3f] = { .simd_size = simd_packed_int }, [0x40] = { .simd_size = simd_packed_int }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0x96 ... 0x9f] = { .simd_size = simd_packed_fp }, + [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp }, + [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp }, [0xc8 ... 0xcd] = { .simd_size = simd_other }, [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 }, [0xdc ... 
0xdf] = { .simd_size = simd_packed_int }, @@ -1605,6 +1608,7 @@ static bool vcpu_has( #define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops) #define vcpu_has_pclmulqdq() vcpu_has( 1, ECX, 1, ctxt, ops) #define vcpu_has_ssse3() vcpu_has( 1, ECX, 9, ctxt, ops) +#define vcpu_has_fma() vcpu_has( 1, ECX, 12, ctxt, ops) #define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops) #define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops) #define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops) @@ -7352,6 +7356,39 @@ x86_emulate( generate_exception_if(vex.l, EXC_UD); goto simd_0f_avx; + case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 
0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */ + case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm */ + host_and_vcpu_must_have(fma); + goto simd_0f_ymm; + case X86EMUL_OPC(0x0f38, 0xc8): /* sha1nexte xmm/m128,xmm */ case X86EMUL_OPC(0x0f38, 0xc9): /* sha1msg1 xmm/m128,xmm */ case X86EMUL_OPC(0x0f38, 0xca): /* sha1msg2 xmm/m128,xmm */ --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -50,6 +50,7 @@ #define cpu_has_vmx boot_cpu_has(X86_FEATURE_VMX) #define cpu_has_eist boot_cpu_has(X86_FEATURE_EIST) #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) +#define cpu_has_fma boot_cpu_has(X86_FEATURE_FMA) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_pdcm boot_cpu_has(X86_FEATURE_PDCM) #define 
cpu_has_pcid boot_cpu_has(X86_FEATURE_PCID)