[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v8 14/50] x86emul: basic AVX512DQ testing



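Test a number of the insns which have already been implemented. While
here, also correct the loop bound in low_eighth(), which has to iterate
over ELEM_COUNT / 8 (not ELEM_COUNT / 4) elements.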

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v6: Re-base.
v5: Re-base over changes earlier in the series.
v4: Wrap OVR(pmullq) in __AVX512VL__ conditional.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -69,9 +69,12 @@ avx512f-flts := 4 8
 avx512bw-vecs := $(avx512f-vecs)
 avx512bw-ints := 1 2
 avx512bw-flts :=
+avx512dq-vecs := $(avx512f-vecs)
+avx512dq-ints := $(avx512f-ints)
+avx512dq-flts := $(avx512f-flts)
 
 avx512f-opmask-vecs := 2
-avx512dq-opmask-vecs := 1
+avx512dq-opmask-vecs := 1 2
 avx512bw-opmask-vecs := 4 8
 
 # Suppress building by default of the harness if the compiler can't deal
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -121,6 +121,34 @@ typedef int __attribute__((vector_size(E
 typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
 # endif
 
+# define DECL_PAIR(w) \
+typedef w ## _t pair_t; \
+typedef vsi_ ## w ## _t vsi_pair_t; \
+typedef vdi_ ## w ## _t vdi_pair_t
+# define DECL_QUARTET(w) \
+typedef w ## _t quartet_t; \
+typedef vsi_ ## w ## _t vsi_quartet_t; \
+typedef vdi_ ## w ## _t vdi_quartet_t
+# define DECL_OCTET(w) \
+typedef w ## _t octet_t; \
+typedef vsi_ ## w ## _t vsi_octet_t; \
+typedef vdi_ ## w ## _t vdi_octet_t
+
+# if ELEM_COUNT == 4
+DECL_PAIR(half);
+# elif ELEM_COUNT == 8
+DECL_PAIR(quarter);
+DECL_QUARTET(half);
+# elif ELEM_COUNT == 16
+DECL_PAIR(eighth);
+DECL_QUARTET(quarter);
+DECL_OCTET(half);
+# endif
+
+# undef DECL_OCTET
+# undef DECL_QUARTET
+# undef DECL_PAIR
+
 #endif
 
 #if VEC_SIZE == 16
@@ -146,6 +174,14 @@ typedef long long __attribute__((vector_
 #ifdef __AVX512F__
 
 /* Sadly there are a few exceptions to the general naming rules. */
+# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
+# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
+# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
+# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
+# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
+# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
+# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
 # define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 # define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 # define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -331,6 +367,20 @@ OVR(punpcklwd);
 #  endif
 # endif
 
+# ifdef __AVX512DQ__
+OVR_VFP(and);
+OVR_VFP(andn);
+OVR_VFP(or);
+OVR(pextrd);
+OVR(pextrq);
+OVR(pinsrd);
+OVR(pinsrq);
+#  ifdef __AVX512VL__
+OVR(pmullq);
+#  endif
+OVR_VFP(xor);
+# endif
+
 # undef OVR_VFP
 # undef OVR_SFP
 # undef OVR_INT
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -139,6 +139,27 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+     (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+     (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#  define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+# endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+     (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#  define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if FLOAT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -146,6 +167,17 @@ static inline bool _to_bool(byte_vec_t b
           : "=v" (t_) : "m" (*(float[1]){ x }) ); \
     t_; \
 })
+#  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#   define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#  endif
+#  if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#   define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
+#   define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
+#  endif
 #  define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
@@ -155,6 +187,13 @@ static inline bool _to_bool(byte_vec_t b
 #   define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #   define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
 #  else
+#   define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
+#   define insert_pair(x, y, p) \
+    B(insertf32x4_, _mask, x, \
+      /* Cast needed below to work around gcc 7.x quirk. */ \
+      (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
+      (p) >> 1, x, 3 << ((p) * 2))
+#   define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
 #   define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #   define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 #   define swap(x) ({ \
@@ -178,6 +217,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#   define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
+#   define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
+#  endif
+#  if VEC_SIZE == 64
+#   define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
+#   define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
+#  endif
 #  define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 #  define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
@@ -306,6 +353,16 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+       (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -318,11 +375,30 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  ifdef __AVX512DQ__
+#   define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#  endif
+#  if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#   define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_pair(x, y, p) \
+    (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+              /* First cast needed below to work around gcc 7.x quirk. */ \
+              (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
+                      : (vsi_pair_t)(y), \
+              (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, \
@@ -347,6 +423,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#   define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
+#   define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
+#  if VEC_SIZE == 64
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
@@ -898,7 +982,7 @@ static inline eighth_t low_eighth(vec_t
     eighth_t y;
     unsigned int i;
 
-    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+    for ( i = 0; i < ELEM_COUNT / 8; ++i )
         y[i] = x[i];
 
     return y;
@@ -910,6 +994,50 @@ static inline eighth_t low_eighth(vec_t
 
 #endif
 
+#ifdef broadcast_pair
+# if ELEM_COUNT == 4
+#  define broadcast_half broadcast_pair
+# elif ELEM_COUNT == 8
+#  define broadcast_quarter broadcast_pair
+# elif ELEM_COUNT == 16
+#  define broadcast_eighth broadcast_pair
+# endif
+#endif
+
+#ifdef insert_pair
+# if ELEM_COUNT == 4
+#  define insert_half insert_pair
+# elif ELEM_COUNT == 8
+#  define insert_quarter insert_pair
+# elif ELEM_COUNT == 16
+#  define insert_eighth insert_pair
+# endif
+#endif
+
+#ifdef broadcast_quartet
+# if ELEM_COUNT == 8
+#  define broadcast_half broadcast_quartet
+# elif ELEM_COUNT == 16
+#  define broadcast_quarter broadcast_quartet
+# endif
+#endif
+
+#ifdef insert_quartet
+# if ELEM_COUNT == 8
+#  define insert_half insert_quartet
+# elif ELEM_COUNT == 16
+#  define insert_quarter insert_quartet
+# endif
+#endif
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16
+# define broadcast_half broadcast_octet
+#endif
+
+#if defined(insert_octet) && ELEM_COUNT == 16
+# define insert_half insert_octet
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1205,6 +1333,60 @@ int simd_test(void)
     if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#if defined(broadcast_half) && defined(insert_half)
+    {
+        half_t aux = low_half(src);
+
+        touch(aux);
+        x = broadcast_half(aux);
+        touch(aux);
+        y = insert_half(src, aux, 1);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_quarter) && defined(insert_quarter)
+    {
+        quarter_t aux = low_quarter(src);
+
+        touch(aux);
+        x = broadcast_quarter(aux);
+        touch(aux);
+        y = insert_quarter(src, aux, 1);
+        touch(aux);
+        y = insert_quarter(y, aux, 2);
+        touch(aux);
+        y = insert_quarter(y, aux, 3);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+    /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+    __GNUC__ >= 8
+    {
+        eighth_t aux = low_eighth(src);
+
+        touch(aux);
+        x = broadcast_eighth(aux);
+        touch(aux);
+        y = insert_eighth(src, aux, 1);
+        touch(aux);
+        y = insert_eighth(y, aux, 2);
+        touch(aux);
+        y = insert_eighth(y, aux, 3);
+        touch(aux);
+        y = insert_eighth(y, aux, 4);
+        touch(aux);
+        y = insert_eighth(y, aux, 5);
+        touch(aux);
+        y = insert_eighth(y, aux, 6);
+        touch(aux);
+        y = insert_eighth(y, aux, 7);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -23,6 +23,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
 #include "avx512bw.h"
+#include "avx512dq.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -100,6 +101,11 @@ static bool simd_check_avx512dq(void)
 }
 #define simd_check_avx512dq_opmask simd_check_avx512dq
 
+static bool simd_check_avx512dq_vl(void)
+{
+    return cpu_has_avx512dq && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512bw(void)
 {
     return cpu_has_avx512bw;
@@ -267,9 +273,10 @@ static const struct {
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
     SIMD(OPMASK/w,     avx512f_opmask,         2),
-    SIMD(OPMASK/b,    avx512dq_opmask,         1),
-    SIMD(OPMASK/d,    avx512bw_opmask,         4),
-    SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(OPMASK+DQ/b, avx512dq_opmask,         1),
+    SIMD(OPMASK+DQ/w, avx512dq_opmask,         2),
+    SIMD(OPMASK+BW/d, avx512bw_opmask,         4),
+    SIMD(OPMASK+BW/q, avx512bw_opmask,         8),
     SIMD(AVX512F f32 scalar,  avx512f,        f4),
     SIMD(AVX512F f32x16,      avx512f,      64f4),
     SIMD(AVX512F f64 scalar,  avx512f,        f8),
@@ -302,6 +309,24 @@ static const struct {
     AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
     AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
     AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
+    SIMD(AVX512DQ f32x16,    avx512dq,      64f4),
+    SIMD(AVX512DQ f64x8,     avx512dq,      64f8),
+    SIMD(AVX512DQ s32x16,    avx512dq,      64i4),
+    SIMD(AVX512DQ u32x16,    avx512dq,      64u4),
+    SIMD(AVX512DQ s64x8,     avx512dq,      64i8),
+    SIMD(AVX512DQ u64x8,     avx512dq,      64u8),
+    AVX512VL(DQ+VL f32x4,    avx512dq,      16f4),
+    AVX512VL(DQ+VL f64x2,    avx512dq,      16f8),
+    AVX512VL(DQ+VL f32x8,    avx512dq,      32f4),
+    AVX512VL(DQ+VL f64x4,    avx512dq,      32f8),
+    AVX512VL(DQ+VL s32x4,    avx512dq,      16i4),
+    AVX512VL(DQ+VL u32x4,    avx512dq,      16u4),
+    AVX512VL(DQ+VL s32x8,    avx512dq,      32i4),
+    AVX512VL(DQ+VL u32x8,    avx512dq,      32u4),
+    AVX512VL(DQ+VL s64x2,    avx512dq,      16i8),
+    AVX512VL(DQ+VL u64x2,    avx512dq,      16u8),
+    AVX512VL(DQ+VL s64x4,    avx512dq,      32i8),
+    AVX512VL(DQ+VL u64x4,    avx512dq,      32u8),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.