[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v5 05/47] x86emul: use AVX512 logic for emulating V{, P}MASKMOV*



The more generic AVX512 implementation allows quite a bit of insn-
specific code to be dropped/shared.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Acked-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -439,8 +439,8 @@ static const struct ext0f38_table {
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
-    [0x2c ... 0x2d] = { .simd_size = simd_other },
-    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
+    [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -449,8 +449,8 @@ static const struct ext0f38_table {
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
-    [0x8c] = { .simd_size = simd_other },
-    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x8c] = { .simd_size = simd_packed_int },
+    [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
     [0x99] = { .simd_size = simd_scalar_vexw },
@@ -7989,6 +7989,8 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
+        elem_bytes = 4 << (b & 1);
+    vmaskmov:
         get_fpu(X86EMUL_FPU_ymm);
 
         /*
@@ -8003,7 +8005,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         pvex = copy_VEX(opc, vex);
         pvex->opcx = vex_0f;
-        if ( !(b & 1) )
+        if ( elem_bytes == 4 )
             pvex->pfx = vex_none;
         opc[0] = 0x50; /* vmovmskp{s,d} */
         /* Use %rax as GPR destination and VEX.vvvv as source. */
@@ -8016,21 +8018,9 @@ x86_emulate(
         invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
         put_stub(stub);
 
-        if ( !ea.val )
-            goto complete_insn;
-
-        op_bytes = 4 << (b & 1);
-        first_byte = __builtin_ctz(ea.val);
-        ea.val >>= first_byte;
-        first_byte *= op_bytes;
-        op_bytes *= 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + 1) )
-            d = (d & ~SrcMask) | SrcMem;
+        evex.opmsk = 1; /* fake */
+        op_mask = ea.val;
+        fault_suppression = true;
 
         opc = init_prefixes(stub);
         opc[0] = b;
@@ -8081,63 +8071,10 @@ x86_emulate(
 
    case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
    case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
-    {
-        typeof(vex) *pvex;
-        unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
-
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm);
-
-        /*
-         * While we can't reasonably provide fully correct behavior here
-         * (in particular, for writes, avoiding the memory read in anticipation
-         * of all elements in the range eventually being written), we can (and
-         * should) still limit the memory access to the smallest possible range
-         * (suppressing it altogether if all mask bits are clear), to provide
-         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
-         * for that purpose.
-         */
-        opc = init_prefixes(stub);
-        pvex = copy_VEX(opc, vex);
-        pvex->opcx = vex_0f;
-        opc[0] = 0xd7; /* vpmovmskb */
-        /* Use %rax as GPR destination and VEX.vvvv as source. */
-        pvex->r = 1;
-        pvex->b = !mode_64bit() || (vex.reg >> 3);
-        opc[1] = 0xc0 | (~vex.reg & 7);
-        pvex->reg = 0xf;
-        opc[2] = 0xc3;
-
-        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
-        put_stub(stub);
-
-        /* Convert byte granular result to dword/qword granularity. */
-        ea.val &= mask;
-        if ( !ea.val )
-            goto complete_insn;
-
-        first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
-        ea.val >>= first_byte;
-        op_bytes = 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + ~mask + 1) )
-            d = (d & ~SrcMask) | SrcMem;
-
-        opc = init_prefixes(stub);
-        opc[0] = b;
-        /* Convert memory operand to (%rAX). */
-        rex_prefix &= ~REX_B;
-        vex.b = 1;
-        opc[1] = modrm & 0x38;
-        insn_bytes = PFX_BYTES + 2;
-
-        break;
-    }
+        elem_bytes = 4 << vex.w;
+        goto vmaskmov;
 
    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.