[Xen-devel] [PATCH RFC] x86/emulate: implement hvmemul_cmpxchg() with an actual CMPXCHG



hvmemul_cmpxchg() is not presently SMP-safe, as it ends up simply doing
a memcpy() via hvmemul_write(). Use an actual CMPXCHG instruction in
the implementation.

Signed-off-by: Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>

---
Questions:
 - I've used __cmpxchg(), as it's already there, but it always locks.
   In my tests and in x86_emulate()'s context, the lock is already
   taken anyway, so it doesn't appear to make a difference at present.
   Have I missed a case where it does, or should we make the LOCK
   prefix conditional for the future? If so, should I use another
   version of cmpxchg() that we already have in the source code but
   I've missed, modify this one, or bring in another implementation?
   (A rough sketch of a conditionally-locking variant follows these
   questions.)
 - Is there anything I've missed in the implementation that could crash
   the host / guest / cause trouble?
 - In the introspection context, before this change a RETRY return in
   hvm_emulate_one_vm_event() would do nothing - when emulating because
   of a mem_access vm_event, this would mean that the VCPU would then
   simply try to execute the instruction again, which would trigger
   another mem_access event (which would hopefully this time trigger
   a successful insn emulation). However, with these changes, this
   would mean that, in an SMP scenario where a VCPU failed its
   CMPXCHG, the instruction would be re-executed in the guest, which
   I've found leads to bad things happening (mostly BSODs). I've
   mitigated this in this patch by trying to emulate the failed
   instruction in a loop until it succeeds, but I'm not convinced it
   is a good idea. What are the alternatives?
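
For reference, here is a rough, purely illustrative sketch of what a
conditionally-locking cmpxchg() variant could look like (the name
__cmpxchg_maybe_lock is made up and not part of this patch; only the
4-byte case is shown, a real helper would switch on size the way
__cmpxchg() does):

    /*
     * Illustrative sketch only, not part of the patch below.  The
     * caller decides whether to emit the LOCK prefix (e.g. based on
     * whether the emulated instruction itself carried one).
     */
    static inline unsigned long __cmpxchg_maybe_lock(
        volatile void *ptr, unsigned long old, unsigned long new,
        bool lock)
    {
        unsigned long prev;

        if ( lock )
            asm volatile ( "lock; cmpxchgl %k1,%2"
                           : "=a" (prev)
                           : "r" (new), "m" (*(volatile uint32_t *)ptr),
                             "0" (old)
                           : "memory" );
        else
            asm volatile ( "cmpxchgl %k1,%2"
                           : "=a" (prev)
                           : "r" (new), "m" (*(volatile uint32_t *)ptr),
                             "0" (old)
                           : "memory" );

        return prev;
    }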
---
 xen/arch/x86/hvm/emulate.c | 230 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 181 insertions(+), 49 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 2d92957..b5754a1 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -20,6 +20,7 @@
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/ioreq.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/trace.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/svm/svm.h>
@@ -1029,6 +1030,77 @@ static int hvmemul_wbinvd_discard(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_vaddr_to_mfn(
+    unsigned long addr,
+    mfn_t *mfn,
+    uint32_t pfec,
+    struct x86_emulate_ctxt *ctxt)
+{
+    paddr_t gpa = addr & ~PAGE_MASK;
+    struct page_info *page;
+    p2m_type_t p2mt;
+    unsigned long gfn;
+    struct vcpu *curr = current;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+    gfn = paging_gva_to_gfn(curr, addr, &pfec);
+
+    if ( gfn == gfn_x(INVALID_GFN) )
+    {
+        pagefault_info_t pfinfo = {};
+
+        if ( ( pfec & PFEC_page_paged ) || ( pfec & PFEC_page_shared ) )
+            return X86EMUL_RETRY;
+
+        pfinfo.linear = addr;
+        pfinfo.ec = pfec;
+
+        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &hvmemul_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    gpa |= (paddr_t)gfn << PAGE_SHIFT;
+
+    /*
+     * No need to do the P2M lookup for internally handled MMIO, benefiting
+     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
+     * - newer Windows (like Server 2012) for HPET accesses.
+     */
+    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
+        return X86EMUL_UNHANDLEABLE;
+
+    page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE);
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( p2m_is_paging(p2mt) )
+    {
+        put_page(page);
+        p2m_mem_paging_populate(curr->domain, gfn);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_shared(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_grant(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    *mfn = _mfn(page_to_mfn(page));
+
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1037,8 +1109,70 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
+    unsigned long addr, reps = 1;
+    int rc = X86EMUL_OKAY;
+    unsigned long old = 0, new = 0;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    mfn_t mfn[2];
+    void *map = NULL;
+    struct domain *currd = current->domain;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+
+    if ( rc != X86EMUL_OKAY || !bytes )
+        return rc;
+
+    rc = hvmemul_vaddr_to_mfn(addr, &mfn[0], pfec, ctxt);
+
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( likely(((addr + bytes - 1) & PAGE_MASK) == (addr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page. */
+        mfn[1] = INVALID_MFN;
+        map = map_domain_page(mfn[0]);
+    }
+    else
+    {
+        rc = hvmemul_vaddr_to_mfn((addr + bytes - 1) & PAGE_MASK, &mfn[1],
+                                  pfec, ctxt);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        map = vmap(mfn, 2);
+    }
+
+    if ( !map )
+        return X86EMUL_UNHANDLEABLE;
+
+    map += (addr & ~PAGE_MASK);
+
+    memcpy(&old, p_old, bytes);
+    memcpy(&new, p_new, bytes);
+
+    if ( __cmpxchg(map, old, new, bytes) != old )
+        rc = X86EMUL_RETRY;
+
+    paging_mark_dirty(currd, mfn[0]);
+
+    if ( unlikely(mfn_valid(mfn[1])) )
+    {
+        paging_mark_dirty(currd, mfn[1]);
+        vunmap((void *)((unsigned long)map & PAGE_MASK));
+    }
+    else
+        unmap_domain_page(map);
+
+    return rc;
 }
 
 static int hvmemul_validate(
@@ -1961,59 +2095,57 @@ int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla)
 void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr,
     unsigned int errcode)
 {
-    struct hvm_emulate_ctxt ctx = {{ 0 }};
-    int rc;
+    int rc = X86EMUL_OKAY;
 
-    hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
+    do {
+        struct hvm_emulate_ctxt ctx = {{ 0 }};
 
-    switch ( kind )
-    {
-    case EMUL_KIND_NOWRITE:
-        rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
-        break;
-    case EMUL_KIND_SET_CONTEXT_INSN: {
-        struct vcpu *curr = current;
-        struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+        hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
 
-        BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
-                     sizeof(curr->arch.vm_event->emul.insn.data));
-        ASSERT(!vio->mmio_insn_bytes);
+        switch ( kind )
+        {
+        case EMUL_KIND_NOWRITE:
+            rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
+            break;
+        case EMUL_KIND_SET_CONTEXT_INSN: {
+            struct vcpu *curr = current;
+            struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+
+            BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
+                         sizeof(curr->arch.vm_event->emul.insn.data));
+            ASSERT(!vio->mmio_insn_bytes);
+
+            /*
+             * Stash insn buffer into mmio buffer here instead of ctx
+             * to avoid having to add more logic to hvm_emulate_one.
+             */
+            vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
+            memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
+                   vio->mmio_insn_bytes);
+        }
+        /* Fall-through */
+        default:
+            ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
+            rc = hvm_emulate_one(&ctx);
+        }
 
-        /*
-         * Stash insn buffer into mmio buffer here instead of ctx
-         * to avoid having to add more logic to hvm_emulate_one.
-         */
-        vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
-        memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
-               vio->mmio_insn_bytes);
-    }
-    /* Fall-through */
-    default:
-        ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
-        rc = hvm_emulate_one(&ctx);
-    }
+        switch ( rc )
+        {
+        case X86EMUL_RETRY:
+            break;
+        case X86EMUL_UNHANDLEABLE:
+            hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
+            hvm_inject_hw_exception(trapnr, errcode);
+            break;
+        case X86EMUL_EXCEPTION:
+            if ( ctx.ctxt.event_pending )
+                hvm_inject_event(&ctx.ctxt.event);
+            break;
+        }
 
-    switch ( rc )
-    {
-    case X86EMUL_RETRY:
-        /*
-         * This function is called when handling an EPT-related vm_event
-         * reply. As such, nothing else needs to be done here, since simply
-         * returning makes the current instruction cause a page fault again,
-         * consistent with X86EMUL_RETRY.
-         */
-        return;
-    case X86EMUL_UNHANDLEABLE:
-        hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
-        hvm_inject_hw_exception(trapnr, errcode);
-        break;
-    case X86EMUL_EXCEPTION:
-        if ( ctx.ctxt.event_pending )
-            hvm_inject_event(&ctx.ctxt.event);
-        break;
-    }
+        hvm_emulate_writeback(&ctx);
 
-    hvm_emulate_writeback(&ctx);
+    } while( rc == X86EMUL_RETRY );
 }
 
 void hvm_emulate_init_once(
-- 
1.9.1

