[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen staging] x86emul: support AVX512{F, BW} down conversion moves



commit 30e0bdf79828db975f0248cba933d563038f1e37
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Fri Apr 5 10:42:39 2019 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri Apr 5 10:42:39 2019 +0200

    x86emul: support AVX512{F,BW} down conversion moves
    
    Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
    slightly different from what one would expect, due to them requiring
    EVEX.W to be zero.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Acked-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 tools/tests/x86_emulator/evex-disp8.c  | 18 ++++++++++++
 tools/tests/x86_emulator/simd.c        | 48 ++++++++++++++++++++++++++++++
 tools/tests/x86_emulator/simd.h        | 17 +++++++++++
 xen/arch/x86/x86_emulate/x86_emulate.c | 53 ++++++++++++++++++++++++++++++++--
 4 files changed, 133 insertions(+), 3 deletions(-)

diff --git a/tools/tests/x86_emulator/evex-disp8.c b/tools/tests/x86_emulator/evex-disp8.c
index f3435fa1e4..c41621c587 100644
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -177,11 +177,26 @@ static const struct test avx512f_all[] = {
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovdb,       f3, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovdw,       f3, 0f38, 33,    vl_2,    b, vl),
+    INSN(pmovqb,       f3, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovqd,       f3, 0f38, 35,    vl_2, d_nb, vl),
+    INSN(pmovqw,       f3, 0f38, 34,    vl_4,    b, vl),
+    INSN(pmovsdb,      f3, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsdw,      f3, 0f38, 23,    vl_2,    b, vl),
+    INSN(pmovsqb,      f3, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsqd,      f3, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovsqw,      f3, 0f38, 24,    vl_4,    b, vl),
     INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
     INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
     INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
     INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
     INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovusdb,     f3, 0f38, 11,    vl_4,    b, vl),
+    INSN(pmovusdw,     f3, 0f38, 13,    vl_2,    b, vl),
+    INSN(pmovusqb,     f3, 0f38, 12,    vl_8,    b, vl),
+    INSN(pmovusqd,     f3, 0f38, 15,    vl_2, d_nb, vl),
+    INSN(pmovusqw,     f3, 0f38, 14,    vl_4,    b, vl),
     INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
     INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
     INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
@@ -284,7 +299,10 @@ static const struct test avx512bw_all[] = {
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmovswb,     f3, 0f38, 20,    vl_2,  b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
+    INSN(pmovuswb,    f3, 0f38, 10,    vl_2,  b, vl),
+    INSN(pmovwb,      f3, 0f38, 30,    vl_2,  b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
diff --git a/tools/tests/x86_emulator/simd.c b/tools/tests/x86_emulator/simd.c
index 10235e8e0a..9c2a0a5a5a 100644
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -277,6 +277,17 @@ static inline bool _to_bool(byte_vec_t bv)
 #endif
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
+     (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
+     (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -291,6 +302,7 @@ static inline bool _to_bool(byte_vec_t bv)
 })
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+#  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -720,6 +732,27 @@ static inline bool _to_bool(byte_vec_t bv)
 # endif
 #endif
 
+#if VEC_SIZE >= 16
+
+# if !defined(low_half) && defined(HALF_SIZE)
+static inline half_t low_half(vec_t x)
+{
+#  if HALF_SIZE < VEC_SIZE
+    half_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 2; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1087,6 +1120,21 @@ int simd_test(void)
 
 #endif
 
+#if defined(widen1) && defined(shrink1)
+    {
+        half_t aux1 = low_half(src), aux2;
+
+        touch(aux1);
+        x = widen1(aux1);
+        touch(x);
+        aux2 = shrink1(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 2; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
 #ifdef dup_lo
     touch(src);
     x = dup_lo(src);
diff --git a/tools/tests/x86_emulator/simd.h b/tools/tests/x86_emulator/simd.h
index fb986be66a..d8f62ba472 100644
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,23 @@ typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE >= 16
+
+# if ELEM_COUNT >= 2
+#  if VEC_SIZE > 32
+#   define HALF_SIZE (VEC_SIZE / 2)
+#  else
+#   define HALF_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
+typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
+typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
+typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
+typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+# endif
+
+#endif
+
 #if VEC_SIZE == 16
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
 # define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index d0d1c04b04..ec77d125e7 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3068,7 +3068,22 @@ x86_decode(
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
             if ( evex_encoded() )
-                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+            {
+                /*
+                 * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but
+                 * their attributes don't match those of the vex_66 encoded
+                 * insns with the same base opcodes. Rather than adding new
+                 * columns to the table, handle this here for now.
+                 */
+                if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 )
+                    disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+                else
+                {
+                    disp8scale = decode_disp8scale(ext0f38_table[b ^ 0x30].d8s,
+                                                   state);
+                    state->simd_size = simd_other;
+                }
+            }
             break;
 
         case ext_0f3a:
@@ -8359,10 +8374,14 @@ x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */
         host_and_vcpu_must_have(avx512bw);
-        /* fall through */
+        if ( evex.pfx != vex_f3 )
+        {
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
@@ -8373,7 +8392,29 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
-        generate_exception_if(evex.brs || (evex.w && (b & 7) == 5), EXC_UD);
+            generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        }
+        else
+        {
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* vpmovsqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */
+            generate_exception_if(evex.w || (ea.type != OP_REG && evex.z), EXC_UD);
+            d = DstMem | SrcReg | TwoOp;
+        }
+        generate_exception_if(evex.brs, EXC_UD);
         op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;
@@ -10215,6 +10256,12 @@ x86_insn_is_mem_write(const struct x86_emulate_state *state,
     case X86EMUL_OPC(0x0f, 0xab):        /* BTS */
     case X86EMUL_OPC(0x0f, 0xb3):        /* BTR */
     case X86EMUL_OPC(0x0f, 0xbb):        /* BTC */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* VPMOVUS* */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* VPMOVS* */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30) ...
+         X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* VPMOV{D,Q,W}* */
         return true;
 
     case 0xd9:
--
generated by git-patchbot for /home/xen/git/xen.git#staging

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.