[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v7 14/49] x86emul: basic AVX512BW testing



Test a number of the AVX512BW insns which have already been implemented.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v6: Re-base over changes earlier in the series.
v4: Add __AVX512VL__ conditional around majority of OVR() additions.
    Correct eq() for 1- and 2-byte cases.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -69,6 +69,9 @@ xop-flts := $(avx-flts)
 avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
+avx512bw-vecs := $(avx512f-vecs)
+avx512bw-ints := 1 2
+avx512bw-flts :=
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -31,6 +31,10 @@ ENTRY(simd_test);
 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
 # elif FLOAT_SIZE == 8
 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
+#  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
+#  define eq(x, y) (B(pcmpeqw, _mask, (vhi_t)(x), (vhi_t)(y), -1) == ALL_TRUE)
 # elif INT_SIZE == 4 || UINT_SIZE == 4
 #  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
 # elif INT_SIZE == 8 || UINT_SIZE == 8
@@ -374,6 +378,87 @@ static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), 
(vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), 
(vdi_t)undef(), ~0))
 # endif
+#elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
+      defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastb %1, %0" \
+          : "=v" (t_) : "m" (*(char[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), 
(vqi_t)(y), (vqi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), 
(vqi_t)(y), (vqi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), 
(vqi_t)undef(), ~0))
+#  elif defined(__AVX512VBMI__)
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), 
interleave_hi, (vqi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, 
(vqi_t)(x), (vqi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
+                              
(0b0101010101010101010101010101010101010101010101010101010101010101LL & 
ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), 
(vqi_quarter_t){}, ~0))
+#  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, 
~0))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastw %1, %0" \
+          : "=v" (t_) : "m" (*(short[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), 
(vhi_t)(y), (vhi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), 
(vhi_t)(y), (vhi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(pshufhw, _mask, \
+                                      B(pshuflw, _mask, (vhi_t)(x), 
0b00011011, (vhi_t)undef(), ~0), \
+                                      0b00011011, (vhi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), 
interleave_hi, (vhi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, 
(vhi_t)(x), (vhi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
+                              (0b01010101010101010101010101010101 & ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), 
(vhi_quarter_t){}, ~0))
+# endif
+# if INT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), 
(vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), 
(vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), 
(vdi_t)undef(), ~0))
+# elif UINT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), 
(vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), 
(vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), 
(vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), 
(vdi_t)undef(), ~0))
+# elif INT_SIZE == 2
+#  define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
+#  define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
+#  define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 2
+#  define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), 
(vhi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), 
(vhi_t)undef(), ~0))
+#  define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), 
(vhi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), 
(vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), 
(vdi_t)undef(), ~0))
+# endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), 
(vqi_t)(y)))
@@ -565,7 +650,7 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSSE3__)
+#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
 # elif INT_SIZE == 2
@@ -789,6 +874,40 @@ static inline half_t low_half(vec_t x)
 }
 # endif
 
+# if !defined(low_quarter) && defined(QUARTER_SIZE)
+/* Low ELEM_COUNT / 4 elements of x; x itself if quarter_t isn't narrower. */
+static inline quarter_t low_quarter(vec_t x)
+{
+#  if QUARTER_SIZE >= VEC_SIZE
+    return x;
+#  else
+    quarter_t res; /* Elements past the low quarter are left unset. */
+    unsigned int idx = ELEM_COUNT / 4;
+
+    while ( idx-- )
+        res[idx] = x[idx];
+    return res;
+#  endif
+}
+# endif
+
+# if !defined(low_eighth) && defined(EIGHTH_SIZE)
+/* Low ELEM_COUNT / 8 elements of x; x itself if eighth_t isn't narrower. */
+static inline eighth_t low_eighth(vec_t x)
+{
+#  if EIGHTH_SIZE < VEC_SIZE
+    eighth_t y; /* Elements past the low eighth are left unset. */
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 8; ++i ) /* An eighth, not a quarter. */
+        y[i] = x[i];
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
 #endif
 
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
@@ -1117,7 +1236,7 @@ int simd_test(void)
     y = interleave_lo(alt < 0, alt < 0);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen2(x);
+    z = widen2(low_quarter(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 
@@ -1126,7 +1245,7 @@ int simd_test(void)
     y = interleave_lo(y, y);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen3(x);
+    z = widen3(low_eighth(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 #  endif
@@ -1148,14 +1267,14 @@ int simd_test(void)
 
 # ifdef widen2
     touch(src);
-    x = widen2(src);
+    x = widen2(low_quarter(src));
     touch(src);
     if ( !eq(x, z) ) return __LINE__;
 # endif
 
 # ifdef widen3
     touch(src);
-    x = widen3(src);
+    x = widen3(low_eighth(src));
     touch(src);
     if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
 # endif
@@ -1175,6 +1294,36 @@ int simd_test(void)
             if ( aux2[i] != src[i] )
                 return __LINE__;
     }
+#endif
+
+#if defined(widen2) && defined(shrink2)
+    {
+        quarter_t aux1 = low_quarter(src), aux2;
+
+        touch(aux1);
+        x = widen2(aux1);
+        touch(x);
+        aux2 = shrink2(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 4; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
+#if defined(widen3) && defined(shrink3)
+    {
+        eighth_t aux1 = low_eighth(src), aux2;
+
+        touch(aux1);
+        x = widen3(aux1);
+        touch(x);
+        aux2 = shrink3(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 8; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
 #endif
 
 #ifdef dup_lo
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -95,6 +95,32 @@ typedef int __attribute__((vector_size(H
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
 # endif
 
+# if ELEM_COUNT >= 4
+#  if VEC_SIZE > 64
+#   define QUARTER_SIZE (VEC_SIZE / 4)
+#  else
+#   define QUARTER_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) 
quarter_t;
+typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
+typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
+typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
+typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+# endif
+
+# if ELEM_COUNT >= 8
+#  if VEC_SIZE > 128
+#   define EIGHTH_SIZE (VEC_SIZE / 8)
+#  else
+#   define EIGHTH_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) 
eighth_t;
+typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
+typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
+typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
+typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
+# endif
+
 #endif
 
 #if VEC_SIZE == 16
@@ -182,6 +208,9 @@ OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_FP(add);
 OVR_INT(add);
+OVR_BW(adds);
+OVR_BW(addus);
+OVR_BW(avg);
 OVR_FP(div);
 OVR(extractps);
 OVR_FMA(fmadd, FP);
@@ -214,6 +243,8 @@ OVR_INT(srl);
 OVR_DQ(srlv);
 OVR_FP(sub);
 OVR_INT(sub);
+OVR_BW(subs);
+OVR_BW(subus);
 OVR_SFP(ucomi);
 OVR_VFP(unpckh);
 OVR_VFP(unpckl);
@@ -275,6 +306,31 @@ OVR(punpckldq);
 OVR(punpcklqdq);
 # endif
 
+# ifdef __AVX512BW__
+OVR(pextrb);
+OVR(pextrw);
+OVR(pinsrb);
+OVR(pinsrw);
+#  ifdef __AVX512VL__
+OVR(pmaddwd);
+OVR(pmovsxbw);
+OVR(pmovzxbw);
+OVR(pmulhuw);
+OVR(pmulhw);
+OVR(pmullw);
+OVR(psadbw);
+OVR(pshufb);
+OVR(pshufhw);
+OVR(pshuflw);
+OVR(punpckhbw);
+OVR(punpckhwd);
+OVR(punpcklbw);
+OVR(punpcklwd);
+OVR(slldq);
+OVR(srldq);
+#  endif
+# endif
+
 # undef OVR_VFP
 # undef OVR_SFP
 # undef OVR_INT
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -22,6 +22,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
+#include "avx512bw.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -105,6 +106,11 @@ static bool simd_check_avx512bw(void)
 }
 #define simd_check_avx512bw_opmask simd_check_avx512bw
 
+static bool simd_check_avx512bw_vl(void)
+{   /* True only when AVX512BW and AVX512VL are both reported available. */
+    return cpu_has_avx512bw ? cpu_has_avx512vl : false;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -284,6 +290,18 @@ static const struct {
     AVX512VL(VL u64x2,        avx512f,      16u8),
     AVX512VL(VL s64x4,        avx512f,      32i8),
     AVX512VL(VL u64x4,        avx512f,      32u8),
+    SIMD(AVX512BW s8x64,     avx512bw,      64i1),
+    SIMD(AVX512BW u8x64,     avx512bw,      64u1),
+    SIMD(AVX512BW s16x32,    avx512bw,      64i2),
+    SIMD(AVX512BW u16x32,    avx512bw,      64u2),
+    AVX512VL(BW+VL s8x16,    avx512bw,      16i1),
+    AVX512VL(BW+VL u8x16,    avx512bw,      16u1),
+    AVX512VL(BW+VL s8x32,    avx512bw,      32i1),
+    AVX512VL(BW+VL u8x32,    avx512bw,      32u1),
+    AVX512VL(BW+VL s16x8,    avx512bw,      16i2),
+    AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
+    AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
+    AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.