[Xen-devel] [PATCH v3 07/25] x86emul: support AVX2 gather insns
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: Re-base.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,8 @@ run: $(TARGET)
 
 SIMD := sse sse2 sse4 avx avx2
 FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) $(FMA)
+SG := avx2-sg
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -39,6 +40,10 @@ fma-flts := $(avx-flts)
 avx2-vecs := $(avx-vecs)
 avx2-ints := 1 2 4 8
 avx2-flts := 4 8
+avx2-sg-vecs := $(avx2-vecs)
+avx2-sg-idxs := 4 8
+avx2-sg-ints := 4 8
+avx2-sg-flts := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -55,8 +60,18 @@ $(1)-cflags := \
     $(foreach flt,$($(1)-flts), \
       "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
 endef
+define simd-sg-defs
+$(1)-cflags := \
+    $(foreach vec,$($(1)-vecs), \
+      $(foreach idx,$($(1)-idxs), \
+       $(foreach int,$($(1)-ints), \
+         "-D_$(vec)x$(idx)i$(int) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DINT_SIZE=$(int)") \
+       $(foreach flt,$($(1)-flts), \
+         "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -78,7 +93,10 @@ $(addsuffix .c,$(SIMD)):
 $(addsuffix .c,$(FMA)):
 	ln -sf simd-fma.c $@
 
-$(addsuffix .o,$(SIMD) $(FMA)): simd.h
+$(addsuffix .c,$(SG)):
+	ln -sf simd-sg.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
 $(TARGET): x86-emulate.o test_x86_emulator.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -0,0 +1,209 @@
+#ifdef INT_SIZE
+# define ELEM_SIZE INT_SIZE
+#else
+# define ELEM_SIZE FLOAT_SIZE
+#endif
+
+#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
+                                        : VEC_MAX * ELEM_SIZE / IDX_SIZE)
+#if VEC_SIZE < 16
+# undef VEC_SIZE
+# define VEC_SIZE 16
+#endif
+
+#include "simd.h"
+
+ENTRY(sg_test);
+
+#undef MODE
+#if IDX_SIZE == 4
+# define MODE SI
+#elif IDX_SIZE == 8
+# define MODE DI
+#endif
+
+#define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
+                                         : VEC_MAX * IDX_SIZE / ELEM_SIZE)
+#if IVEC_SIZE < 16
+# undef IVEC_SIZE
+# define IVEC_SIZE 16
+#endif
+
+typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
+typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
+
+#define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
+                    VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
+
+#if VEC_SIZE == 16
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+#else
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+#endif
+
+#if defined(__AVX2__)
+# if VEC_MAX == 16
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv2df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# elif VEC_MAX == 32
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv4df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# endif
+#endif
+
+#define GLUE_(x, y) x ## y
+#define GLUE(x, y) GLUE_(x, y)
+
+#define PUT2(n)  (n), (n) + 1
+#define PUT4(n)  PUT2(n), PUT2((n) + 2)
+#define PUT8(n)  PUT4(n), PUT4((n) + 4)
+#define PUT16(n) PUT8(n), PUT8((n) + 8)
+#define PUT32(n) PUT16(n), PUT16((n) + 16)
+
+const typeof((vec_t){}[0]) array[] = {
+    GLUE(PUT, VEC_MAX)(1),
+    GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
+};
+
+int sg_test(void)
+{
+    unsigned int i;
+    vec_t x, y, full = (vec_t){} == 0;
+    idx_t idx, inv;
+
+    for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
+    {
+        idx[i] = i + 1;
+        inv[i] = ITEM_COUNT - i;
+    }
+
+    touch(idx);
+    touch(inv);
+
+    x = gather(full, array, (idx_t){}, full, 1);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i + 2 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx * ELEM_SIZE, full, 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i * 2 + 3 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, inv, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
+#if ITEM_COUNT == ELEM_COUNT
+    if ( !to_bool(y == x - 1) )
+        return __LINE__;
+#else
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != x[i] - 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+#endif
+
+#if ELEM_SIZE > 1
+    x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+# if ELEM_SIZE == IDX_SIZE
+    y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+# endif
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,6 +12,7 @@
 #include "fma4.h"
 #include "fma.h"
 #include "avx2.h"
+#include "avx2-sg.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -60,6 +61,7 @@ static bool simd_check_avx2(void)
 {
     return cpu_has_avx2;
 }
+#define simd_check_avx2_sg simd_check_avx2
 
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
@@ -173,6 +175,22 @@ static const struct {
     SIMD(AVX2 u32x8,            avx2,      32u4),
     SIMD(AVX2 s64x4,            avx2,      32i8),
     SIMD(AVX2 u64x4,            avx2,      32u8),
+    SIMD(AVX2 S/G f32[4x32],    avx2_sg,   16x4f4),
+    SIMD(AVX2 S/G f64[2x32],    avx2_sg,   16x4f8),
+    SIMD(AVX2 S/G f32[2x64],    avx2_sg,   16x8f4),
+    SIMD(AVX2 S/G f64[2x64],    avx2_sg,   16x8f8),
+    SIMD(AVX2 S/G f32[8x32],    avx2_sg,   32x4f4),
+    SIMD(AVX2 S/G f64[4x32],    avx2_sg,   32x4f8),
+    SIMD(AVX2 S/G f32[4x64],    avx2_sg,   32x8f4),
+    SIMD(AVX2 S/G f64[4x64],    avx2_sg,   32x8f8),
+    SIMD(AVX2 S/G i32[4x32],    avx2_sg,   16x4i4),
+    SIMD(AVX2 S/G i64[2x32],    avx2_sg,   16x4i8),
+    SIMD(AVX2 S/G i32[2x64],    avx2_sg,   16x8i4),
+    SIMD(AVX2 S/G i64[2x64],    avx2_sg,   16x8i8),
+    SIMD(AVX2 S/G i32[8x32],    avx2_sg,   32x4i4),
+    SIMD(AVX2 S/G i64[4x32],    avx2_sg,   32x4i8),
+    SIMD(AVX2 S/G i32[4x64],    avx2_sg,   32x8i4),
+    SIMD(AVX2 S/G i64[4x64],    avx2_sg,   32x8i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -391,6 +391,7 @@ static const struct {
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
     [0x8c] = { .simd_size = simd_other },
     [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
     [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
     [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -598,6 +599,7 @@ struct x86_emulate_state {
         ext_8f0a,
     } ext;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+    uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
     bool lock_prefix;
     bool not_64bit; /* Instruction not available in 64bit. */
@@ -2411,7 +2413,7 @@ x86_decode(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
-    uint8_t b, d, sib, sib_index, sib_base;
+    uint8_t b, d;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
     enum x86_segment override_seg = x86_seg_none;
     bool pc_rel = false;
@@ -2745,6 +2747,7 @@ x86_decode(
 
         if ( modrm_mod == 3 )
         {
+            generate_exception_if(d & vSIB, EXC_UD);
            modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
         }
@@ -2805,13 +2808,17 @@ x86_decode(
             ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
-                sib = insn_fetch_type(uint8_t);
-                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
-                sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
-                if ( sib_index != 4 && !(d & vSIB) )
-                    ea.mem.off = *decode_register(sib_index, state->regs,
-                                                  false);
-                ea.mem.off <<= (sib >> 6) & 3;
+                uint8_t sib = insn_fetch_type(uint8_t);
+                uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
+
+                state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
+                state->sib_scale = (sib >> 6) & 3;
+                if ( state->sib_index != 4 && !(d & vSIB) )
+                {
+                    ea.mem.off = *decode_register(state->sib_index,
+                                                  state->regs, false);
+                    ea.mem.off <<= state->sib_scale;
+                }
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
@@ -7472,6 +7479,110 @@ x86_emulate(
         break;
     }
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
+    {
+        unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
+        typeof(vex) *pvex;
+        union {
+            int32_t dw[8];
+            int64_t qw[4];
+        } index, mask;
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if(modrm_reg == state->sib_index ||
+                              modrm_reg == mask_reg ||
+                              state->sib_index == mask_reg, EXC_UD);
+        generate_exception_if(!cpu_has_avx, EXC_UD);
+        vcpu_must_have(avx2);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /* Read destination, index, and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to sib_index as source. */
+        pvex->r = !mode_64bit() || !(state->sib_index & 8);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+
+        /* Switch to mask_reg as source. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "=m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        /* Clear untouched parts of the destination and mask values. */
+        n = 1 << (2 + vex.l - ((b & 1) | vex.w));
+        op_bytes = 4 << vex.w;
+        memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
+        memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
+
+        for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
+        {
+            if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
+            {
+                signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+                rc = ops->read(ea.mem.seg,
+                               ea.mem.off + (idx << state->sib_scale),
+                               (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+                if ( rc != X86EMUL_OKAY )
+                    break;
+
+#ifdef __XEN__
+                if ( i + 1 < n && local_events_need_delivery() )
+                    rc = X86EMUL_RETRY;
+#endif
+            }
+
+            if ( vex.w )
+                mask.qw[i] = 0;
+            else
+                mask.dw[i] = 0;
+        }
+
+        /* Write destination and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x6f; /* vmovdqa */
+        /* Use modrm_reg as destination and (%rax) as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to mask_reg as destination. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "+m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -10,6 +10,7 @@
  */
 
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */
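
The emulation loop added to x86_emulate() above follows the architectural
gather semantics element by element: a lane is loaded from
base + (index << scale) only when the sign bit of its mask element is set,
and that mask element is cleared once the load has completed, which is what
leaves a restartable state after a fault or an X86EMUL_RETRY. For
illustration, here is a standalone C model of those semantics for 32-bit
elements and dword indices; all names in it are made up for the sketch, only
the behaviour mirrors the insns being emulated:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Scalar model of a dword gather (vpgatherdd-like): for each lane whose
 * mask sign bit is set, load from base + (sign-extended index << shift)
 * and clear that mask lane.  Lanes with a clear mask bit keep their
 * previous destination value.  scale_shift is the SIB scale as a shift
 * amount (0-3), matching state->sib_scale in the patch.
 */
static void gather_model(uint32_t dest[], const int32_t index[],
                         uint32_t mask[], unsigned int n,
                         const uint8_t *base, unsigned int scale_shift)
{
    for ( unsigned int i = 0; i < n; ++i )
    {
        if ( (int32_t)mask[i] < 0 ) /* sign bit set => lane selected */
        {
            long idx = index[i]; /* indices are signed */

            dest[i] = *(const uint32_t *)(base + (idx << scale_shift));
            /*
             * Clear the mask lane as soon as its element has been
             * loaded: an interruption between lanes then leaves only
             * the still-pending lanes selected.
             */
            mask[i] = 0;
        }
    }
}

int main(void)
{
    uint32_t data[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    uint32_t dest[4] = { 0, 0, 0, 0 };
    int32_t index[4] = { 3, 0, 2, 1 };
    uint32_t mask[4] = { 0x80000000, 0, 0x80000000, 0x80000000 };

    gather_model(dest, index, mask, 4, (const uint8_t *)data, 2);

    /* Expect dest = { 13, 0, 12, 11 } and mask = { 0, 0, 0, 0 }. */
    for ( unsigned int i = 0; i < 4; ++i )
        printf("dest[%u] = %" PRIu32 ", mask[%u] = %" PRIu32 "\n",
               i, dest[i], i, mask[i]);
    return 0;
}

Clearing each mask lane as its load completes is also why the patch writes
the mask register back even when the loop exits early: on X86EMUL_RETRY the
instruction is re-executed and only the still-set lanes are re-read.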