
[Xen-devel] [PATCH 15/17] vmx: nest: virtual ept for nested



This patch adds a virtual EPT capability for L1.
It is implemented as a simple per-vCPU, vTLB-like component
that is independent of the domain-wide p2m.
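
(Not part of the patch -- a rough sketch for reviewers of what the
per-vCPU shadow table caches; all names below are illustrative only.
The shadow is the composition of L1's EPT, which maps L2 guest-physical
frames to L1 guest-physical frames, with L0's p2m, which maps L1
guest-physical frames to machine frames. It is filled lazily on EPT
violations and dropped again on INVEPT.)

/* Toy model, not Xen code.  Frame 0 doubles as "not present". */
#include <inttypes.h>
#include <stdio.h>

#define FRAMES 16

static int64_t l1_ept[FRAMES] = { [3] = 7, [5] = 9 };    /* L2 gpa -> L1 gpa */
static int64_t l0_p2m[FRAMES] = { [7] = 42, [9] = 100 }; /* L1 gpa -> mfn    */
static int64_t shadow[FRAMES];                           /* composed mapping */

/* Returns the machine frame, or -1 if one of the two stages is not
 * present (the violation is then reflected to L1 or handled by L0). */
static int64_t fixup(unsigned int gfn)
{
    int64_t l1_gfn, mfn;

    if ( gfn >= FRAMES || (l1_gfn = l1_ept[gfn]) == 0 )
        return -1;                /* not present in L1's EPT: inject to L1 */
    if ( (mfn = l0_p2m[l1_gfn]) == 0 )
        return -1;                /* not present in L0's p2m: L0 handles it */

    shadow[gfn] = mfn;            /* cache the composed entry */
    return mfn;
}

int main(void)
{
    printf("gfn 3 -> %" PRId64 "\n", fixup(3));   /* 42 */
    printf("gfn 5 -> %" PRId64 "\n", fixup(5));   /* 100 */
    printf("gfn 6 -> %" PRId64 "\n", fixup(6));   /* -1 */
    return 0;
}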

Signed-off-by: Qing He <qing.he@xxxxxxxxx>

---
 b/xen/arch/x86/hvm/vmx/vept.c        |  574 +++++++++++++++++++++++++++++++++++
 b/xen/include/asm-x86/hvm/vmx/vept.h |   10 
 xen/arch/x86/hvm/vmx/Makefile        |    1 
 xen/arch/x86/hvm/vmx/nest.c          |  136 +++++++-
 xen/arch/x86/hvm/vmx/vmx.c           |   13 
 xen/include/asm-x86/hvm/vmx/nest.h   |    7 
 6 files changed, 734 insertions(+), 7 deletions(-)
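
(Another reviewer note, not part of the patch: vept.c below keeps one
shadow root per guest EPTP in a small set of `cache slots' recycled in
LRU order, see vept_load_eptp() and get_free_slot(). A toy model of
that policy, with illustrative names only:)

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SLOTS 4

/* slots[0] is least recently used, slots[SLOTS - 1] most recently used */
static uint64_t slots[SLOTS];                /* guest EPTP per slot, 0 = free */

/* Find the slot for this EPTP, evicting the LRU one on a miss, and move
 * it to the MRU end -- the moral equivalent of vept_load_eptp(). */
static int load_eptp(uint64_t eptp)
{
    int i, hit = 0;

    for ( i = 0; i < SLOTS; i++ )
        if ( slots[i] == eptp )
        {
            hit = 1;
            break;
        }

    if ( !hit )
        i = 0;                               /* miss: reuse the LRU slot */

    memmove(&slots[i], &slots[i + 1], (SLOTS - i - 1) * sizeof(slots[0]));
    slots[SLOTS - 1] = eptp;
    return hit;
}

int main(void)
{
    printf("%d ", load_eptp(0x1000));   /* miss -> 0 */
    printf("%d ", load_eptp(0x2000));   /* miss -> 0 */
    printf("%d\n", load_eptp(0x1000));  /* hit  -> 1 */
    return 0;
}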

diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/Makefile
--- a/xen/arch/x86/hvm/vmx/Makefile     Thu Apr 22 22:30:09 2010 +0800
+++ b/xen/arch/x86/hvm/vmx/Makefile     Thu Apr 22 22:30:10 2010 +0800
@@ -6,3 +6,4 @@
 obj-y += vpmu.o
 obj-y += vpmu_core2.o
 obj-y += nest.o
+obj-y += vept.o
diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
--- a/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:09 2010 +0800
+++ b/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:10 2010 +0800
@@ -26,6 +26,7 @@
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/hvm/vmx/vvmcs.h>
 #include <asm/hvm/vmx/nest.h>
+#include <asm/hvm/vmx/vept.h>
 
 /*
  * VMX instructions support functions
@@ -295,6 +296,9 @@
     __vmptrld(virt_to_maddr(nest->hvmcs));
     v->arch.hvm_vmx.launched = 0;
 
+    nest->geptp = 0;
+    nest->vept = vept_init(v);
+
     vmreturn(regs, VMSUCCEED);
 
 out:
@@ -313,6 +317,9 @@
     if ( unlikely(!nest->guest_vmxon_pa) )
         goto invalid_op;
 
+    vept_teardown(nest->vept);
+    nest->vept = 0;
+
     nest->guest_vmxon_pa = 0;
     __vmpclear(virt_to_maddr(nest->svmcs));
 
@@ -529,6 +536,67 @@
     return vmx_nest_handle_vmresume(regs);
 }
 
+int vmx_nest_handle_invept(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct vmx_inst_decoded decode;
+    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
+    mfn_t mfn;
+    u64 eptp;
+    int type;
+
+    if ( unlikely(!nest->guest_vmxon_pa) )
+        goto invalid_op;
+
+    decode_vmx_inst(regs, &decode);
+
+    hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
+    type = reg_read(regs, decode.reg2);
+
+    /* TODO: physical invept on other cpus */
+    switch ( type )
+    {
+    case 1:    /* single-context invalidation */
+        mfn = vept_invalidate(nest->vept, eptp);
+        if ( eptp == nest->geptp )
+            nest->geptp = 0;
+
+        if ( __mfn_valid(mfn_x(mfn)) )
+            __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
+        break;
+    case 2:    /* all-context invalidation */
+        vept_invalidate_all(nest->vept);
+        nest->geptp = 0;
+        break;
+    default:
+        gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
+        break;
+    }
+
+    vmreturn(regs, VMSUCCEED);
+
+    return X86EMUL_OKAY;
+
+invalid_op:
+    hvm_inject_exception(TRAP_invalid_op, 0, 0);
+    return X86EMUL_EXCEPTION;
+}
+
+int vmx_nest_vept(struct vcpu *v)
+{
+    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
+    int r = 0;
+
+    if ( paging_mode_hap(v->domain) &&
+         (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
+          CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+         (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
+          SECONDARY_EXEC_ENABLE_EPT) )
+        r = 1;
+
+    return r;
+}
+
 /*
  * Nested VMX context switch
  */
@@ -739,7 +807,14 @@
     vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
     vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
 
-    /* TODO: PDPTRs for nested ept */
+    if ( vmx_nest_vept(v) )
+    {
+        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
+        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
+        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
+        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
+    }
+
     /* TODO: CR3 target control */
 }
 
@@ -787,14 +862,32 @@
     }
 #endif
 
+
+    /* loading EPT_POINTER for L2 */
+    if ( vmx_nest_vept(v) )
+    {
+        u64 geptp;
+        mfn_t mfn;
+
+        geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
+        if ( geptp != nest->geptp )
+        {
+            mfn = vept_load_eptp(nest->vept, geptp);
+            nest->geptp = geptp;
+
+            __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
+#ifdef __i386__
+            __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
+#endif
+        }
+    }
+
     regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
     regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
     regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
 
     /* updating host cr0 to sync TS bit */
     __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
-
-    /* TODO: EPT_POINTER */
 }
 
 static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
@@ -1064,8 +1157,26 @@
         break;
     }
 
+    case EXIT_REASON_EPT_VIOLATION:
+    {
+        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+#ifdef __i386__
+        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
+#endif
+        if ( vmx_nest_vept(v) )
+        {
+            if ( !vept_ept_violation(nest->vept, nest->geptp,
+                     exit_qualification, gpa) )
+                bypass_l0 = 1;
+            else
+                nest->vmexit_pending = 1;
+        }
+
+        break;
+    }
+
     case EXIT_REASON_WBINVD:
-    case EXIT_REASON_EPT_VIOLATION:
     case EXIT_REASON_EPT_MISCONFIG:
     case EXIT_REASON_EXTERNAL_INTERRUPT:
         /* pass to L0 handler */
@@ -1229,11 +1340,14 @@
         data = (data << 32) | eax;
         break;
     case MSR_IA32_VMX_PROCBASED_CTLS:
+        mask = paging_mode_hap(current->domain) ?
+                   0 : CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
         rdmsr(regs->ecx, eax, edx);
 #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
-            | CPU_BASED_ACTIVATE_MSR_BITMAP            \
-            | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+            | CPU_BASED_ACTIVATE_MSR_BITMAP)
         data = edx & ~REMOVED_EXEC_CONTROL_CAP;
+        data &= ~mask;
         data = (data << 32) | eax;
         break;
     case MSR_IA32_VMX_EXIT_CTLS:
@@ -1254,12 +1368,20 @@
         data = (data << 32) | eax;
         break;
     case MSR_IA32_VMX_PROCBASED_CTLS2:
-        mask = 0;
+        mask = paging_mode_hap(current->domain) ?
+                   SECONDARY_EXEC_ENABLE_EPT : 0;
 
         rdmsr(regs->ecx, eax, edx);
         data = edx & mask;
         data = (data << 32) | eax;
         break;
+    case MSR_IA32_VMX_EPT_VPID_CAP:
+        rdmsr(regs->ecx, eax, edx);
+#define REMOVED_EPT_VPID_CAP_HIGH   ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
+#define REMOVED_EPT_VPID_CAP_LOW    ( 1<<16 | 1<<17 | 1<<26 )
+        data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
+        data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
+        break;
 
     /* pass through MSRs */
     case IA32_FEATURE_CONTROL_MSR:
diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vept.c       Thu Apr 22 22:30:10 2010 +0800
@@ -0,0 +1,574 @@
+/*
+ * vept.c: virtual EPT for nested virtualization
+ *
+ * Copyright (c) 2010, Intel Corporation.
+ * Author: Qing He <qing.he@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/list.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <xen/numa.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vept.h>
+
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+/*
+ * This virtual EPT implementation is independent of the p2m facility
+ * and has somewhat different characteristics.  It works much like a
+ * shadow page table (composing the guest table with the host table),
+ * but is per-vCPU and vTLB-style:
+ *   - per-vCPU, so no locking is required
+ *   - vTLB-style means it honors all guest invalidations instead of
+ * relying on write protection.  Unlike an ordinary page table, EPT
+ * updates and invalidations are rare in a well written VMM, so the
+ * overhead is also small.
+ *
+ * The physical root is loaded directly into the L2 sVMCS, without
+ * going through any other host control.  Multiple `cache slots' are
+ * maintained for multiple guest EPTPs, with simple LRU replacement.
+ *
+ * One limitation so far is that it doesn't work with the L0
+ * emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
+ * is not supported for now.
+ */
+
+#define VEPT_MAX_SLOTS 8
+#define VEPT_ALLOCATION_SIZE 512
+
+struct vept_slot {
+    u64               eptp;   /* guest eptp */
+    mfn_t             root;   /* root of phys table */
+    struct list_head  list;
+
+    struct page_list_head page_list;
+};
+
+struct vept {
+    struct list_head   used_slots; /* lru: new->tail, old->head */
+    struct list_head   free_slots;
+
+    int                total_pages;
+    int                free_pages;
+    struct page_list_head freelist;
+
+    struct vcpu       *vcpu;
+};
+
+
+static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
+{
+    struct vept_slot *slot, *tmp;
+
+    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
+        if ( slot->eptp == geptp )
+            return slot;
+
+    return NULL;
+}
+
+static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
+{
+    struct vept_slot *slot;
+
+    slot = __get_eptp_slot(vept, geptp);
+    if ( slot != NULL )
+        list_del(&slot->list);
+
+    return slot;
+}
+
+static void __clear_slot(struct vept *vept, struct vept_slot *slot)
+{
+    struct page_info *pg;
+
+    slot->eptp = 0;
+
+    while ( !page_list_empty(&slot->page_list) )
+    {
+        pg = page_list_remove_head(&slot->page_list);
+        page_list_add_tail(pg, &vept->freelist);
+
+        vept->free_pages++;
+    }
+}
+
+static struct vept_slot *get_free_slot(struct vept *vept)
+{
+    struct vept_slot *slot = NULL;
+
+    if ( !list_empty(&vept->free_slots) )
+    {
+        slot = list_entry(vept->free_slots.next, struct vept_slot, list);
+        list_del(&slot->list);
+    }
+    else if ( !list_empty(&vept->used_slots) )
+    {
+        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
+        list_del(&slot->list);
+        __clear_slot(vept, slot);
+    }
+
+    return slot;
+}
+
+static void clear_all_slots(struct vept *vept)
+{
+    struct vept_slot *slot, *tmp;
+
+    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
+    {
+        list_del(&slot->list);
+        __clear_slot(vept, slot);
+        list_add_tail(&slot->list, &vept->free_slots);
+    }
+}
+
+static int free_some_pages(struct vept *vept, struct vept_slot *curr)
+{
+    struct vept_slot *slot;
+    int r = 0;
+
+    if ( !list_empty(&vept->used_slots) )
+    {
+        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
+        if ( slot != curr )
+        {
+            list_del(&slot->list);
+            __clear_slot(vept, slot);
+            list_add_tail(&slot->list, &vept->free_slots);
+
+            r = 1;
+        }
+    }
+
+    return r;
+}
+
+struct vept *vept_init(struct vcpu *v)
+{
+    struct vept *vept;
+    struct vept_slot *slot;
+    struct page_info *pg;
+    int i;
+
+    vept = xmalloc(struct vept);
+    if ( vept == NULL )
+        goto out;
+
+    memset(vept, 0, sizeof(*vept));
+    vept->vcpu = v;
+
+    INIT_PAGE_LIST_HEAD(&vept->freelist);
+    INIT_LIST_HEAD(&vept->used_slots);
+    INIT_LIST_HEAD(&vept->free_slots);
+
+    for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
+    {
+        slot = xmalloc(struct vept_slot);
+        if ( slot == NULL )
+            break;
+
+        memset(slot, 0, sizeof(*slot));
+
+        INIT_LIST_HEAD(&slot->list);
+        INIT_PAGE_LIST_HEAD(&slot->page_list);
+
+        list_add(&slot->list, &vept->free_slots);
+    }
+
+    for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )
+    {
+        pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));
+        if ( pg == NULL )
+            break;
+
+        page_list_add_tail(pg, &vept->freelist);
+        vept->total_pages++;
+        vept->free_pages++;
+    }
+
+ out:
+    return vept;
+}
+
+void vept_teardown(struct vept *vept)
+{
+    struct page_info *pg;
+    struct vept_slot *slot, *tmp;
+
+    clear_all_slots(vept);
+
+    while ( !page_list_empty(&vept->freelist) )
+    {
+        pg = page_list_remove_head(&vept->freelist);
+        free_domheap_page(pg);
+        vept->free_pages--;
+        vept->total_pages--;
+    }
+
+    list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
+        xfree(slot);
+
+    xfree(vept);
+}
+
+mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
+{
+    struct page_info *pg;
+    struct vept_slot *slot;
+    mfn_t mfn = _mfn(INVALID_MFN);
+    void *addr;
+
+    ASSERT(vept->vcpu == current);
+
+    slot = get_eptp_slot(vept, geptp);
+    if ( slot == NULL )
+    {
+        slot = get_free_slot(vept);
+        if ( unlikely(slot == NULL) )
+        {
+            gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
+            return mfn;
+        }
+
+        while ( !vept->free_pages )
+            if ( !free_some_pages(vept, slot) )
+            {
+                slot->eptp = 0;
+                list_add_tail(&slot->list, &vept->free_slots);
+                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
+
+                return mfn;
+            }
+
+        vept->free_pages--;
+        pg = page_list_remove_head(&vept->freelist);
+
+        mfn = page_to_mfn(pg);
+        addr = map_domain_page(mfn_x(mfn));
+        clear_page(addr);
+        unmap_domain_page(addr);
+        page_list_add_tail(pg, &slot->page_list);
+        slot->eptp = geptp;
+        slot->root = mfn;
+    }
+
+    mfn = slot->root;
+    list_add_tail(&slot->list, &vept->used_slots);
+
+    return mfn;
+}
+
+mfn_t vept_invalidate(struct vept *vept, u64 geptp)
+{
+    struct vept_slot *slot;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    ASSERT(vept->vcpu == current);
+
+    slot = get_eptp_slot(vept, geptp);
+    if ( slot != NULL )
+    {
+        mfn = slot->root;
+        __clear_slot(vept, slot);
+        list_add_tail(&slot->list, &vept->free_slots);
+    }
+
+    return mfn;
+}
+
+void vept_invalidate_all(struct vept *vept)
+{
+    ASSERT(vept->vcpu == current);
+
+    clear_all_slots(vept);
+}
+
+/*
+ * guest EPT walk and EPT violation
+ */
+struct ept_walk {
+    unsigned long gfn;
+    unsigned long gfn_remainder;
+    ept_entry_t l4e, l3e, l2e, l1e;
+    mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
+    int sp;
+};
+typedef struct ept_walk ept_walk_t;
+
+#define GEPT_NORMAL_PAGE  0
+#define GEPT_SUPER_PAGE   1
+#define GEPT_NOT_PRESENT  2
+static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
+               unsigned long *gfn_remainder, int level, u32 *ar,
+               ept_entry_t *entry, mfn_t *next_mfn)
+{
+    int index;
+    ept_entry_t *ept_entry;
+    ept_entry_t *next;
+    p2m_type_t p2mt;
+    int rc = GEPT_NORMAL_PAGE;
+    mfn_t mfn;
+
+    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
+
+    ept_entry = (*table) + index;
+    *entry = *ept_entry;
+    *ar &= entry->epte & 0x7;
+
+    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
+
+    if ( !(ept_entry->epte & 0x7) )
+        rc = GEPT_NOT_PRESENT;
+    else if ( ept_entry->sp_avail )
+        rc = GEPT_SUPER_PAGE;
+    else
+    {
+        mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
+        if ( !p2m_is_ram(p2mt) )
+            return GEPT_NOT_PRESENT;
+
+        if ( next_mfn )
+        {
+            next = map_domain_page(mfn_x(mfn));
+            unmap_domain_page(*table);
+
+            *table = next;
+            *next_mfn = mfn;
+        }
+    }
+
+    return rc;
+}
+
+static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
+                          u64 geptp, u64 ggpa)
+{
+    ept_entry_t *table;
+    p2m_type_t p2mt;
+    int rc;
+    u32 ar = 0x7;
+
+    unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
+    unsigned long gfn_remainder = gfn;
+
+    memset(gw, 0, sizeof(*gw));
+    gw->gfn = gfn;
+    gw->sp = 0;
+
+    gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
+    if ( !p2m_is_ram(p2mt) )
+        return 0;
+
+    table = map_domain_page(mfn_x(gw->l4mfn));
+
+    rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
+                              &gw->l4e, &gw->l3mfn);
+
+    if ( rc )
+        goto out;
+
+    rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
+                              &gw->l3e, &gw->l2mfn);
+
+    if ( rc == GEPT_SUPER_PAGE )
+        gw->sp = 2;
+    if ( rc )
+        goto out;
+
+    rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
+                              &gw->l2e, &gw->l1mfn);
+
+    if ( rc == GEPT_SUPER_PAGE )
+        gw->sp = 1;
+    if ( rc )
+        goto out;
+
+    rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
+                              &gw->l1e, NULL);
+
+ out:
+    gw->gfn_remainder = gfn_remainder;
+    unmap_domain_page(table);
+    return ar;
+}
+
+static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
+{
+    entry->epte &= ~0x7f;
+    entry->epte |= ar & 0x7f;
+}
+
+static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
+                       ept_entry_t **table, unsigned long *gfn_remainder,
+                       int level, u32 *ar, ept_entry_t gentry)
+{
+    int index;
+    ept_entry_t *sentry;
+    ept_entry_t *next;
+    mfn_t mfn;
+    struct page_info *pg;
+
+    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
+
+    sentry = (*table) + index;
+    *ar = sentry->epte & 0x7;
+
+    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
+
+    if ( !(sentry->epte & 0x7) )
+    {
+        while ( !vept->free_pages )
+            if ( !free_some_pages(vept, slot) )
+            {
+                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
+                return 0;
+            }
+
+        vept->free_pages--;
+        pg = page_list_remove_head(&vept->freelist);
+        page_list_add_tail(pg, &slot->page_list);
+        mfn = page_to_mfn(pg);
+        next = map_domain_page(mfn_x(mfn));
+        clear_page(next);
+
+        sentry->mfn = mfn_x(mfn);
+    }
+    else
+    {
+        next = map_domain_page(sentry->mfn);
+    }
+
+    epte_set_ar_bits(sentry, gentry.epte);
+
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+int vept_ept_violation(struct vept *vept, u64 geptp,
+                       unsigned long qualification, paddr_t addr)
+{
+    ept_walk_t gw;
+    struct vept_slot *slot;
+    ept_entry_t *table, *gept;
+    ept_entry_t *sentry, *gentry;
+    u32 old_entry, sp_ar = 0;
+    p2m_type_t p2mt;
+    unsigned long mfn_start = 0;
+    unsigned long gfn_remainder;
+    int rc, i;
+
+    ASSERT(vept->vcpu == current);
+
+    slot = __get_eptp_slot(vept, geptp);
+    if ( unlikely(slot == NULL) )
+        return 0;
+
+    rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
+
+    if ( !(rc & (qualification & 0x7)) )    /* L1 forbids it: inject to L1 */
+        return 1;
+
+    if ( gw.sp == 2 )  /* 1G */
+    {
+        sp_ar = gw.l3e.epte & 0x7;
+        mfn_start = gw.l3e.mfn +
+                    (gw.gfn_remainder & ~((1UL << EPT_TABLE_ORDER) - 1));
+    }
+    else if ( gw.sp == 1 )  /* 2M */
+    {
+        sp_ar = gw.l2e.epte & 0x7;
+        mfn_start = gw.l2e.mfn;
+    }
+    else
+        mfn_start = 0;
+
+    table = map_domain_page(mfn_x(slot->root));
+    gfn_remainder = gw.gfn;
+
+    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
+                          &old_entry, gw.l4e);
+
+    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
+                          &old_entry, gw.l3e);
+
+    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
+                          &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);
+
+    /* if l1p is just allocated, do a full prefetch */
+    if ( !old_entry && !gw.sp )
+    {
+        gept = map_domain_page(mfn_x(gw.l1mfn));
+        for ( i = 0; i < 512; i++ )
+        {
+            gentry = gept + i;
+            sentry = table + i;
+            if ( gentry->epte & 0x7 )
+            {
+                sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
+                                        gentry->mfn, &p2mt));
+                epte_set_ar_bits(sentry, gentry->epte);
+            }
+            else
+                sentry->epte = 0;
+        }
+        unmap_domain_page(gept);
+    }
+    else if ( !old_entry && gw.sp )
+    {
+        for ( i = 0; i < 512; i++ )
+        {
+            sentry = table + i;
+            sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
+                                    mfn_start + i, &p2mt));
+            epte_set_ar_bits(sentry, sp_ar);
+        }
+    }
+    else if ( old_entry && !gw.sp )
+    {
+        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
+        sentry = table + i;
+        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
+                                gw.l1e.mfn, &p2mt));
+        epte_set_ar_bits(sentry, gw.l1e.epte);
+    }
+    else    /* old_entry && gw.sp */
+    {
+        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
+        sentry = table + i;
+        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
+                                mfn_start + i, &p2mt));
+        epte_set_ar_bits(sentry, sp_ar);
+    }
+
+    unmap_domain_page(table);
+    return 0;
+}
diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:09 2010 +0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:10 2010 +0800
@@ -1032,6 +1032,14 @@
     p2m_type_t p2mt;
     char *p;
 
+    /*
+     * With nested EPT in use, L0 has no knowledge of how to interpret
+     * L2's CR3; it is L1's responsibility to provide GUEST_PDPTRn, and
+     * we rely solely on those.
+     */
+    if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
+        return;
+
     /* EPT needs to load PDPTRS into VMCS for PAE. */
     if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
         return;
@@ -2705,6 +2713,11 @@
         if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
             __update_guest_eip(inst_len);
         break;
+    case EXIT_REASON_INVEPT:
+        inst_len = __get_instruction_length();
+        if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
+            __update_guest_eip(inst_len);
+        break;
 
     case EXIT_REASON_MWAIT_INSTRUCTION:
     case EXIT_REASON_MONITOR_INSTRUCTION:
diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
--- a/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:09 2010 +0800
+++ b/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:10 2010 +0800
@@ -47,6 +47,9 @@
 
     unsigned long        intr_info;
     unsigned long        error_code;
+
+    u64                  geptp;
+    struct vept         *vept;
 };
 
 asmlinkage void vmx_nest_switch_mode(void);
@@ -64,6 +67,8 @@
 int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
 int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
 
+int vmx_nest_handle_invept(struct cpu_user_regs *regs);
+
 void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
 void vmx_nest_update_secondary_exec_control(struct vcpu *v,
                                             unsigned long value);
@@ -81,4 +86,6 @@
 int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
                                  u64 msr_content);
 
+int vmx_nest_vept(struct vcpu *v);
+
 #endif /* __ASM_X86_HVM_NEST_H__ */
diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vept.h        Thu Apr 22 22:30:10 2010 +0800
@@ -0,0 +1,10 @@
+#include <asm/hvm/vmx/vmx.h>
+
+
+struct vept *vept_init(struct vcpu *v);
+void vept_teardown(struct vept *vept);
+mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
+mfn_t vept_invalidate(struct vept *vept, u64 eptp);
+void vept_invalidate_all(struct vept *vept);
+int vept_ept_violation(struct vept *vept, u64 eptp,
+                       unsigned long qualification, paddr_t addr);
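
(A last reviewer note, not part of the patch; names below are
illustrative. The EXIT_REASON_EPT_VIOLATION handler above reflects the
violation to L1 only when the walk of L1's EPT in vept_ept_violation()
grants none of the access bits requested in the exit qualification;
otherwise the shadow entry is rebuilt and L2 is resumed via the
bypass_l0 path. The decision boils down to:)

#include <stdio.h>

/* Bits 0-2 of the exit qualification encode the attempted access
 * (read/write/fetch); guest_ar are the rights L1's EPT grants there. */
static int reflect_to_l1(unsigned int guest_ar, unsigned long qualification)
{
    /* mirrors the check at the top of vept_ept_violation() */
    return (guest_ar & (qualification & 0x7)) == 0;
}

int main(void)
{
    printf("%d\n", reflect_to_l1(0x1, 0x2)); /* write, L1 allows read only: inject to L1 */
    printf("%d\n", reflect_to_l1(0x7, 0x1)); /* read, L1 allows rwx: L0 rebuilds the shadow */
    return 0;
}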

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
