[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[xen master] x86emul: AVX512-FP16 testing



commit e291c4c3e1ec1a693e6b8de5562d6a9607ad2722
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Mon Jun 5 16:54:30 2023 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Mon Jun 5 16:54:30 2023 +0200

    x86emul: AVX512-FP16 testing
    
    Naming of some of the builtins isn't fully consistent with that of pre-
    existing ones, so there's a need for a new BR2() wrapper macro.
    
    With the tests providing some proof of proper functioning of the
    emulator code also enable use of the feature by guests, as there's no
    other infrastructure involved in enabling this ISA extension.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Acked-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Acked-by: Henry Wang <Henry.Wang@xxxxxxx> # CHANGELOG
---
 CHANGELOG.md                                 |   1 +
 tools/tests/x86_emulator/Makefile            |   7 +-
 tools/tests/x86_emulator/simd-fma.c          | 165 +++++++++++++++++++++++++++
 tools/tests/x86_emulator/simd.c              |  90 ++++++++++++++-
 tools/tests/x86_emulator/simd.h              |  16 +++
 tools/tests/x86_emulator/test_x86_emulator.c |  15 +++
 xen/include/public/arch-x86/cpufeatureset.h  |   2 +-
 7 files changed, 292 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5bfd3aa5c0..ceedaf333a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
    - Bus-lock detection, used by Xen to mitigate (by rate-limiting) the system
      wide impact of a guest misusing atomic instructions.
  - xl/libxl can customize SMBIOS strings for HVM guests.
+ - Add support for AVX512-FP16 on x86.
 
 ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12
 
diff --git a/tools/tests/x86_emulator/Makefile b/tools/tests/x86_emulator/Makefile
index a6f5bfa53e..4079412d2e 100644
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi avx512fp16
 FMA := fma4 fma
 SG := avx2-sg avx512f-sg avx512vl-sg
 AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
@@ -91,6 +91,9 @@ avx512vbmi-vecs := $(avx512bw-vecs)
 avx512vbmi-ints := $(avx512bw-ints)
 avx512vbmi-flts := $(avx512bw-flts)
 avx512vbmi2-vecs := $(avx512bw-vecs)
+avx512fp16-vecs := $(avx512bw-vecs)
+avx512fp16-ints :=
+avx512fp16-flts := 2
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1 2
@@ -248,7 +251,7 @@ $(addsuffix .c,$(GF)):
 
 $(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)): simd.h
 
-xop.h avx512f.h: simd-fma.c
+xop.h avx512f.h avx512fp16.h: simd-fma.c
 
 endif # 32-bit override
 
diff --git a/tools/tests/x86_emulator/simd-fma.c b/tools/tests/x86_emulator/simd-fma.c
index d2ccefac9b..56c4d1ae8c 100644
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -28,6 +28,8 @@ ENTRY(fma_test);
 #  define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
 # elif FLOAT_SIZE == 8
 #  define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
+# elif FLOAT_SIZE == 2
+#  define fmaddsub(x, y, z) BR(vfmaddsubph, _mask, x, y, z, ~0)
 # endif
 #elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
@@ -70,6 +72,75 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#ifdef __AVX512FP16__
+# define I (1.if16)
+# if VEC_SIZE > FLOAT_SIZE
+#  define CELEM_COUNT (ELEM_COUNT / 2)
+static const unsigned int conj_mask = 0x80000000;
+#  define conj(z) ({ \
+    vec_t r_; \
+    asm ( "vpxord %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (z), "m" (conj_mask), "i" (CELEM_COUNT) ); \
+    r_; \
+})
+#  define _cmul_vv(a, b, c)  BR2(vf##c##mulcph, , a, b)
+#  define _cmul_vs(a, b, c) ({ \
+    vec_t r_; \
+    _Complex _Float16 b_ = (b); \
+    asm ( "vf"#c"mulcph %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (a), "m" (b_), "i" (CELEM_COUNT) ); \
+    r_; \
+})
+#  define cmadd_vv(a, b, c) BR2(vfmaddcph, , a, b, c)
+#  define cmadd_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    vec_t r_; \
+    asm ( "vfmaddcph %2%{1to%c3%}, %1, %0" \
+          : "=v" (r_) \
+          : "v" (a), "m" (b_), "i" (CELEM_COUNT), "0" (c) ); \
+    r_; \
+})
+# else
+#  define CELEM_COUNT 1
+typedef _Float16 __attribute__((vector_size(4))) cvec_t;
+#  define conj(z) ({ \
+    cvec_t r_; \
+    asm ( "xor $0x80000000, %0" : "=rm" (r_) : "0" (z) ); \
+    r_; \
+})
+#  define _cmul_vv(a, b, c) ({ \
+    cvec_t r_; \
+    /* "=&x" to force destination to be different from both sources */ \
+    asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b) ); \
+    r_; \
+})
+#  define _cmul_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    cvec_t r_; \
+    /* "=&x" to force destination to be different from both sources */ \
+    asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b_) ); \
+    r_; \
+})
+#  define cmadd_vv(a, b, c) ({ \
+    cvec_t r_ = (c); \
+    asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b) ); \
+    r_; \
+})
+#  define cmadd_vs(a, b, c) ({ \
+    _Complex _Float16 b_ = (b); \
+    cvec_t r_ = (c); \
+    asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b_) ); \
+    r_; \
+})
+# endif
+# define cmul_vv(a, b) _cmul_vv(a, b, )
+# define cmulc_vv(a, b) _cmul_vv(a, b, c)
+# define cmul_vs(a, b) _cmul_vs(a, b, )
+# define cmulc_vs(a, b) _cmul_vs(a, b, c)
+#endif
+
 int fma_test(void)
 {
     unsigned int i;
@@ -156,5 +227,99 @@ int fma_test(void)
     touch(inv);
 #endif
 
+#ifdef CELEM_COUNT
+
+# if VEC_SIZE > FLOAT_SIZE
+#  define cvec_t vec_t
+#  define ceq eq
+# else
+  {
+    /* Cannot re-use the function-scope variables (for being too small). */
+    cvec_t x, y, z, src = { 1, 2 }, inv = { 2, 1 }, one = { 1, 1 };
+#  define ceq(x, y) ({ \
+    unsigned int r_; \
+    asm ( "vcmpph $0, %1, %2, %0"  : "=k" (r_) : "x" (x), "x" (y) ); \
+    (r_ & 3) == 3; \
+})
+# endif
+
+    /* (a * i)² == -a² */
+    x = cmul_vs(src, I);
+    y = cmul_vv(x, x);
+    x = -src;
+    touch(src);
+    z = cmul_vv(x, src);
+    if ( !ceq(y, z) ) return __LINE__;
+
+    /* conj(a * b) == conj(a) * conj(b) */
+    touch(src);
+    x = conj(src);
+    touch(inv);
+    y = cmulc_vv(x, inv);
+    touch(src);
+    touch(inv);
+    z = conj(cmul_vv(src, inv));
+    if ( !ceq(y, z) ) return __LINE__;
+
+    /* a * conj(a) == |a|² */
+    touch(src);
+    y = src;
+    touch(src);
+    x = cmulc_vv(y, src);
+    y *= y;
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        if ( x[i] != y[i] + y[i + 1] ) return __LINE__;
+        if ( x[i + 1] ) return __LINE__;
+    }
+
+    /* a * b == b * a + 0 */
+    touch(src);
+    touch(inv);
+    x = cmul_vv(src, inv);
+    touch(src);
+    touch(inv);
+    y = cmadd_vv(inv, src, (cvec_t){});
+    if ( !ceq(x, y) ) return __LINE__;
+
+    /* a * 1 + b == b * 1 + a */
+    touch(src);
+    touch(inv);
+    x = cmadd_vs(src, 1, inv);
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        z[i] = 1;
+        z[i + 1] = 0;
+    }
+    touch(z);
+    y = cmadd_vv(inv, z, src);
+    if ( !ceq(x, y) ) return __LINE__;
+
+    /* (a + b) * c == a * c + b * c */
+    touch(one);
+    touch(inv);
+    x = cmul_vv(src + one, inv);
+    touch(inv);
+    y = cmul_vv(one, inv);
+    touch(inv);
+    z = cmadd_vv(src, inv, y);
+    if ( !ceq(x, z) ) return __LINE__;
+
+    /* a * i + conj(a) == (Re(a) - Im(a)) * (1 + i) */
+    x = cmadd_vs(src, I, conj(src));
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        typeof(x[0]) val = src[i] - src[i + 1];
+
+        if ( x[i] != val ) return __LINE__;
+        if ( x[i + 1] != val ) return __LINE__;
+    }
+
+# if VEC_SIZE == FLOAT_SIZE
+  }
+# endif
+
+#endif /* CELEM_COUNT */
+
     return 0;
 }
diff --git a/tools/tests/x86_emulator/simd.c b/tools/tests/x86_emulator/simd.c
index c805f1cc1e..263cea662d 100644
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -20,6 +20,14 @@ ENTRY(simd_test);
     asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
     r_ == 1; \
 })
+# elif VEC_SIZE == 2
+#  define eq(x, y) ({ \
+    _Float16 x_ = (x)[0]; \
+    _Float16 __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned int r_; \
+    asm ( "vcmpsh $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
 # elif FLOAT_SIZE == 4
 /*
  * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
@@ -31,6 +39,8 @@ ENTRY(simd_test);
 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
 # elif FLOAT_SIZE == 8
 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif FLOAT_SIZE == 2
+#  define eq(x, y) (B(cmpph, _mask, x, y, 0, -1) == ALL_TRUE)
 # elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
 #  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
 # elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
@@ -116,6 +126,14 @@ static inline bool _to_bool(byte_vec_t bv)
     asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
     (vec_t){ t_[0] }; \
 })
+#  elif FLOAT_SIZE == 2
+#   define to_u_int(type, x) ({ \
+    unsigned type u_; \
+    _Float16 __attribute__((vector_size(16))) t_; \
+    asm ( "vcvtsh2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
+    asm ( "vcvtusi2sh%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
+    (vec_t){ t_[0] }; \
+})
 #  endif
 #  define to_uint(x) to_u_int(int, x)
 #  ifdef __x86_64__
@@ -153,6 +171,43 @@ static inline bool _to_bool(byte_vec_t bv)
 #   define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
 #   define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
 #  endif
+# elif FLOAT_SIZE == 2
+#  define to_int(x) BR2(vcvtw2ph, _mask, BR2(vcvtph2w, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
+#  define to_uint(x) BR2(vcvtuw2ph, _mask, BR2(vcvtph2uw, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
+#  if VEC_SIZE == 16
+#   define low_half(x) (x)
+#   define high_half(x) ((vec_t)B_(movhlps, , (vsf_t)undef(), (vsf_t)(x)))
+#   define insert_half(x, y, p) ((vec_t)((p) ? B_(movlhps, , (vsf_t)(x), (vsf_t)(y)) \
+                                             : B_(shufps, , (vsf_t)(y), (vsf_t)(x), 0b11100100)))
+#  elif VEC_SIZE == 32
+#   define _half(x, lh) ((vhf_half_t)B(extracti32x4_, _mask, (vsi_t)(x), lh, (vsi_half_t){}, ~0))
+#   define low_half(x)  _half(x, 0)
+#   define high_half(x) _half(x, 1)
+#   define insert_half(x, y, p) \
+    ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_half_t)(y), p, (vsi_t)undef(), ~0))
+#  elif VEC_SIZE == 64
+#   define _half(x, lh) \
+    ((vhf_half_t)__builtin_ia32_extracti64x4_mask((vdi_t)(x), lh, (vdi_half_t){}, ~0))
+#   define low_half(x)  _half(x, 0)
+#   define high_half(x) _half(x, 1)
+#   define insert_half(x, y, p) \
+    ((vec_t)__builtin_ia32_inserti64x4_mask((vdi_t)(x), (vdi_half_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
+#  define to_w_int(x, s) ({ \
+    vhf_half_t t_ = low_half(x); \
+    vsi_t lo_, hi_; \
+    touch(t_); \
+    lo_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
+    t_ = high_half(x); \
+    touch(t_); \
+    hi_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
+    touch(lo_); touch(hi_); \
+    insert_half(insert_half(undef(), \
+                            BR2(vcvt ## s ## dq2ph, _mask, lo_, (vhf_half_t){}, ~0), 0), \
+                BR2(vcvt ## s ## dq2ph, _mask, hi_, (vhf_half_t){}, ~0), 1); \
+})
+#  define to_wint(x) to_w_int(x, )
+#  define to_uwint(x) to_w_int(x, u)
 # endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
@@ -240,10 +295,18 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
 #  define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
 #  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
+# elif FLOAT_SIZE == 2
+#  define getexp(x) scalar_1op(x, "vgetexpsh %[in], %[out], %[out]")
+#  define getmant(x) scalar_1op(x, "vgetmantsh $0, %[in], %[out], %[out]")
+#  define recip(x) scalar_1op(x, "vrcpsh %[in], %[out], %[out]")
+#  define rsqrt(x) scalar_1op(x, "vrsqrtsh %[in], %[out], %[out]")
+#  define scale(x, y) scalar_2op(x, y, "vscalefsh %[in2], %[in1], %[out]")
+#  define sqrt(x) scalar_1op(x, "vsqrtsh %[in], %[out], %[out]")
+#  define trunc(x) scalar_1op(x, "vrndscalesh $0b1011, %[in], %[out], %[out]")
 # endif
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
-# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+# if (ELEM_COUNT == 8 && ELEM_SIZE >= 4) /* vextractf{32,64}x4 */ || \
      (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
      (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
 #  define _half(x, lh) ({ \
@@ -398,6 +461,21 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
                          VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
                        0b01010101, undef(), ~0)
 #  endif
+# elif FLOAT_SIZE == 2
+#  define frac(x) BR2(reduceph, _mask, x, 0b00001011, undef(), ~0)
+#  define getexp(x) BR(getexpph, _mask, x, undef(), ~0)
+#  define getmant(x) BR(getmantph, _mask, x, 0, undef(), ~0)
+#  define max(x, y) BR2(maxph, _mask, x, y, undef(), ~0)
+#  define min(x, y) BR2(minph, _mask, x, y, undef(), ~0)
+#  define scale(x, y) BR2(scalefph, _mask, x, y, undef(), ~0)
+#  define recip(x) B(rcpph, _mask, x, undef(), ~0)
+#  define rsqrt(x) B(rsqrtph, _mask, x, undef(), ~0)
+#  define shrink1(x) BR2(vcvtps2phx, _mask, (vsf_t)(x), (vhf_half_t){}, ~0)
+#  define shrink2(x) BR2(vcvtpd2ph, _mask, (vdf_t)(x), (vhf_quarter_t){}, ~0)
+#  define sqrt(x) BR2(sqrtph, _mask, x, undef(), ~0)
+#  define trunc(x) BR2(rndscaleph, _mask, x, 0b1011, undef(), ~0)
+#  define widen1(x) ((vec_t)BR2(vcvtph2psx, _mask, x, (vsf_t)undef(), ~0))
+#  define widen2(x) ((vec_t)BR2(vcvtph2pd, _mask, x, (vdf_t)undef(), ~0))
 # endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
@@ -920,6 +998,16 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
 #  define dup_lo(x) B(movddup, _mask, x, undef(), ~0)
 # endif
 #endif
+#if FLOAT_SIZE == 2 && ELEM_COUNT > 1
+# define dup_hi(x) ((vec_t)B(pshufhw, _mask, \
+                             B(pshuflw, _mask, (vhi_t)(x), 0b11110101, \
+                               (vhi_t)undef(), ~0), \
+                             0b11110101, (vhi_t)undef(), ~0))
+# define dup_lo(x) ((vec_t)B(pshufhw, _mask, \
+                             B(pshuflw, _mask, (vhi_t)(x), 0b10100000, \
+                               (vhi_t)undef(), ~0), \
+                             0b10100000, (vhi_t)undef(), ~0))
+#endif
 #if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
diff --git a/tools/tests/x86_emulator/simd.h b/tools/tests/x86_emulator/simd.h
index 936952ab52..74a2169d7b 100644
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -53,6 +53,9 @@ float
 # elif FLOAT_SIZE == 8
 #  define MODE DF
 #  define ELEM_SFX "d"
+# elif FLOAT_SIZE == 2
+#  define MODE HF
+#  define ELEM_SFX "h"
 # endif
 #endif
 #ifndef VEC_SIZE
@@ -67,7 +70,10 @@ typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t
 /* Various builtins want plain char / int / long long vector types ... */
 typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
 typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+#if VEC_SIZE >= 4
 typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+typedef float __attribute__((vector_size(VEC_SIZE))) vsf_t;
+#endif
 #if VEC_SIZE >= 8
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
@@ -96,6 +102,9 @@ typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
 typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
 typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+#ifdef __AVX512FP16__
+typedef _Float16 __attribute__((vector_size(HALF_SIZE))) vhf_half_t;
+#endif
 typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
 # endif
 
@@ -110,6 +119,9 @@ typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
 typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
 typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
 typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+#ifdef __AVX512FP16__
+typedef _Float16 __attribute__((vector_size(QUARTER_SIZE))) vhf_quarter_t;
+#endif
 # endif
 
 # if ELEM_COUNT >= 8
@@ -163,6 +175,7 @@ DECL_OCTET(half);
 #elif VEC_SIZE == 64
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
 # define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
+# define BR2(n, s, a...) __builtin_ia32_ ## n ## 512 ## s ## _round(a, 4)
 #endif
 #ifndef B_
 # define B_ B
@@ -171,6 +184,9 @@ DECL_OCTET(half);
 # define BR B
 # define BR_ B_
 #endif
+#ifndef BR2
+# define BR2 BR
+#endif
 #ifndef BR_
 # define BR_ BR
 #endif
diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
index bbb0d67a3b..0757bd48b8 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -43,6 +43,7 @@ asm ( ".pushsection .test, \"ax\", @progbits; .popsection" );
 #include "avx512er.h"
 #include "avx512vbmi.h"
 #include "avx512vbmi2-vpclmulqdq.h"
+#include "avx512fp16.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -249,6 +250,16 @@ static bool simd_check_avx512bw_gf_vl(void)
     return cpu_has_gfni && cpu_has_avx512vl;
 }
 
+static bool simd_check_avx512fp16(void)
+{
+    return cpu_has_avx512_fp16;
+}
+
+static bool simd_check_avx512fp16_vl(void)
+{
+    return cpu_has_avx512_fp16 && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -513,6 +524,10 @@ static const struct {
     AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
     AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
     AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(AVX512_FP16 f16 scal,avx512fp16,     f2),
+    SIMD(AVX512_FP16 f16x32, avx512fp16,    64f2),
+    AVX512VL(_FP16+VL f16x8, avx512fp16,    16f2),
+    AVX512VL(_FP16+VL f16x16,avx512fp16,    32f2),
     SIMD(SHA,                sse4_sha,        16),
     SIMD(AVX+SHA,             avx_sha,        16),
     AVX512VL(VL+SHA,      avx512f_sha,        16),
diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
index 4edf9aba7f..ea779c2987 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -267,7 +267,7 @@ XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
 XEN_CPUFEATURE(SERIALIZE,     9*32+14) /*A  SERIALIZE insn */
 XEN_CPUFEATURE(TSXLDTRK,      9*32+16) /*a  TSX load tracking suspend/resume insns */
 XEN_CPUFEATURE(CET_IBT,       9*32+20) /*   CET - Indirect Branch Tracking */
-XEN_CPUFEATURE(AVX512_FP16,   9*32+23) /*   AVX512 FP16 instructions */
+XEN_CPUFEATURE(AVX512_FP16,   9*32+23) /*A  AVX512 FP16 instructions */
 XEN_CPUFEATURE(IBRSB,         9*32+26) /*A  IBRS and IBPB support (used by Intel) */
 XEN_CPUFEATURE(STIBP,         9*32+27) /*A  STIBP */
 XEN_CPUFEATURE(L1D_FLUSH,     9*32+28) /*S  MSR_FLUSH_CMD and L1D flush. */
--
generated by git-patchbot for /home/xen/git/xen.git#master



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.