[Xen-changelog] [xen-unstable] Implement SVM specific part for Nested Virtualization



# HG changeset patch
# User cegger
# Date 1299670565 -3600
# Node ID a5e69b6fdd16a2c16d14afaad7025dfd794a44e1
# Parent  a21d019bb8fe8535a0bbbf4d2ecf1adab4783dc8
Implement SVM specific part for Nested Virtualization

Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Committed-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---


diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/Makefile
--- a/xen/arch/x86/hvm/svm/Makefile     Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/Makefile     Wed Mar 09 12:36:05 2011 +0100
@@ -2,6 +2,8 @@
 obj-y += emulate.o
 obj-bin-y += entry.o
 obj-y += intr.o
+obj-y += nestedsvm.o
 obj-y += svm.o
+obj-y += svmdebug.o
 obj-y += vmcb.o
 obj-y += vpmu.o
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c    Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/emulate.c    Wed Mar 09 12:36:05 2011 +0100
@@ -102,6 +102,11 @@
 MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
 MAKE_INSTR(PAUSE,  1, 0x90);
 MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1);
+MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
+MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
+MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
+MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
+MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
 
 static const u8 *opc_bytes[INSTR_MAX_COUNT] = 
 {
@@ -116,6 +121,11 @@
     [INSTR_RDTSC]  = OPCODE_RDTSC,
     [INSTR_PAUSE]  = OPCODE_PAUSE,
     [INSTR_XSETBV] = OPCODE_XSETBV,
+    [INSTR_VMRUN]  = OPCODE_VMRUN,
+    [INSTR_VMLOAD] = OPCODE_VMLOAD,
+    [INSTR_VMSAVE] = OPCODE_VMSAVE,
+    [INSTR_STGI]   = OPCODE_STGI,
+    [INSTR_CLGI]   = OPCODE_CLGI,
 };
 
 static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len)
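
The entries added above extend the instruction decoder with the SVM opcodes.
All of them share the two-byte opcode 0f 01; the third byte selects the
operation. A minimal sketch of that mapping, for reference only (the helper
below is hypothetical and not part of the patch):

    #include <stdint.h>

    /* 0f 01 /modrm encodings of the SVM instructions, as listed in the
     * MAKE_INSTR() entries above. */
    enum svm_insn { SVM_VMRUN, SVM_VMLOAD, SVM_VMSAVE,
                    SVM_STGI, SVM_CLGI, SVM_OTHER };

    static enum svm_insn classify_0f01(uint8_t third_byte)
    {
        switch ( third_byte )
        {
        case 0xd8: return SVM_VMRUN;
        case 0xda: return SVM_VMLOAD;
        case 0xdb: return SVM_VMSAVE;
        case 0xdc: return SVM_STGI;
        case 0xdd: return SVM_CLGI;
        default:   return SVM_OTHER; /* e.g. 0xd9 VMMCALL, 0xde SKINIT */
        }
    }
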
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S      Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/entry.S      Wed Mar 09 12:36:05 2011 +0100
@@ -54,6 +54,7 @@
 
 ENTRY(svm_asm_do_resume)
         call svm_intr_assist
+        call_with_regs(nsvm_vcpu_switch)
 
         get_current(bx)
         CLGI
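
The single line added above hooks the nested-SVM state machine into the VM
entry path: on every resume, after interrupt injection and before the
physical VMRUN, a pending emulated VMRUN or #VMEXIT is processed and the
active VMCB may be switched. A simplified C-level view of the resume path
after this change (a sketch only; the real sequence lives in entry.S):

    /* Sketch of svm_asm_do_resume after this patch; not the actual code. */
    void svm_asm_do_resume_sketch(struct vcpu *v, struct cpu_user_regs *regs)
    {
        svm_intr_assist();       /* queue pending interrupt/event injection */
        nsvm_vcpu_switch(regs);  /* emulate a queued VMRUN or #VMEXIT; this
                                  * may switch v->arch.hvm_svm.vmcb between
                                  * the host (n1) and shadow (n2) VMCB      */
        /* ... CLGI, restore guest register state, VMRUN ... */
    }
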
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/nestedsvm.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c  Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,1279 @@
+/*
+ * nestedsvm.c: Nested Virtualization
+ * Copyright (c) 2011, Advanced Micro Devices, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/hvm/support.h>
+#include <asm/hvm/svm/emulate.h>
+#include <asm/hvm/svm/svm.h>
+#include <asm/hvm/svm/vmcb.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/nestedsvm.h>
+#include <asm/hvm/svm/svmdebug.h>
+#include <asm/paging.h> /* paging_mode_hap */
+
+static int
+nestedsvm_vmcb_isvalid(struct vcpu *v, uint64_t vmcxaddr)
+{
+    if ( !hvm_svm_enabled(v) || hvm_guest_x86_mode(v) < 2 )
+        return 0;
+
+    /* Maximum valid physical address.
+     * See AMD BKDG for HSAVE_PA MSR.
+     */
+    if ( vmcxaddr > 0xfd00000000ULL )
+        return 0;
+
+    return 1;
+}
+
+int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if (nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr) {
+        ASSERT(nv->nv_vvmcx != NULL);
+        ASSERT(nv->nv_vvmcxaddr != VMCX_EADDR);
+        hvm_unmap_guest_frame(nv->nv_vvmcx);
+        nv->nv_vvmcx = NULL;
+        nv->nv_vvmcxaddr = VMCX_EADDR;
+    }
+
+    if (nv->nv_vvmcx == NULL) {
+        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT);
+        if (nv->nv_vvmcx == NULL)
+            return 0;
+        nv->nv_vvmcxaddr = vmcbaddr;
+    }
+
+    return 1;
+}
+
+/* Interface methods */
+int nsvm_vcpu_initialise(struct vcpu *v)
+{
+    void *msrpm;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    svm->ns_cached_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    svm->ns_merged_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+
+    nv->nv_n2vmcx = alloc_vmcb();
+    if (nv->nv_n2vmcx == NULL)
+        goto err;
+    nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx);
+
+    return 0;
+
+err:
+    nsvm_vcpu_destroy(v);
+    return -ENOMEM;
+}
+
+int nsvm_vcpu_destroy(struct vcpu *v)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    if (svm->ns_cached_msrpm) {
+        free_xenheap_pages(svm->ns_cached_msrpm,
+                           get_order_from_bytes(MSRPM_SIZE));
+        svm->ns_cached_msrpm = NULL;
+    }
+    if (svm->ns_merged_msrpm) {
+        free_xenheap_pages(svm->ns_merged_msrpm,
+                           get_order_from_bytes(MSRPM_SIZE));
+        svm->ns_merged_msrpm = NULL;
+    }
+    if (nv->nv_n2vmcx) {
+        free_vmcb(nv->nv_n2vmcx);
+        nv->nv_n2vmcx = NULL;
+        nv->nv_n2vmcx_pa = VMCX_EADDR;
+    }
+    if (svm->ns_iomap)
+        svm->ns_iomap = NULL;
+
+    return 0;
+}
+
+int nsvm_vcpu_reset(struct vcpu *v)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    svm->ns_msr_hsavepa = VMCX_EADDR;
+    svm->ns_ovvmcb_pa = VMCX_EADDR;
+
+    svm->ns_cr_intercepts = 0;
+    svm->ns_dr_intercepts = 0;
+    svm->ns_exception_intercepts = 0;
+    svm->ns_general1_intercepts = 0;
+    svm->ns_general2_intercepts = 0;
+    svm->ns_lbr_control.bytes = 0;
+
+    svm->ns_hap_enabled = 0;
+    svm->ns_vmcb_guestcr3 = 0;
+    svm->ns_vmcb_hostcr3 = 0;
+    svm->ns_guest_asid = 0;
+    svm->ns_hostflags.bytes = 0;
+    svm->ns_vmexit.exitinfo1 = 0;
+    svm->ns_vmexit.exitinfo2 = 0;
+
+    if (svm->ns_iomap)
+        svm->ns_iomap = NULL;
+
+    return 0;
+}
+
+static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *n1vmcb;
+
+    n1vmcb = nv->nv_n1vmcx;
+    ASSERT(n1vmcb != NULL);
+
+    n1vmcb->rip += inst_len;
+
+    /* Remember the host interrupt flag */
+    svm->ns_hostflags.fields.rflagsif =
+        (n1vmcb->rflags & X86_EFLAGS_IF) ? 1 : 0;
+
+    return 0;
+}
+
+int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *n1vmcb, *n2vmcb;
+    int rc;
+
+    n1vmcb = nv->nv_n1vmcx;
+    n2vmcb = nv->nv_n2vmcx;
+    ASSERT(n1vmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+
+    /* nsvm_vmcb_prepare4vmexit() already saved register values
+     * handled by VMSAVE/VMLOAD into n1vmcb directly.
+     */
+
+    /* switch vmcb to l1 guest's vmcb */
+    v->arch.hvm_svm.vmcb = n1vmcb;
+    v->arch.hvm_svm.vmcb_pa = nv->nv_n1vmcx_pa;
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = n1vmcb->_efer;
+    rc = hvm_set_efer(n1vmcb->_efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4;
+    rc = hvm_set_cr4(n1vmcb->_cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE;
+    n1vmcb->rflags &= ~X86_EFLAGS_VM;
+    rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = n1vmcb->_cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* CR3 */
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else {
+        /* host shadow paging + guest shadow paging. */
+
+        /* Reset MMU context -- XXX (hostrestore) not yet working */
+        if (!pagetable_is_null(v->arch.guest_table))
+            put_page(pagetable_get_page(v->arch.guest_table));
+        v->arch.guest_table = pagetable_null();
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    }
+    rc = hvm_set_cr3(n1vmcb->_cr3);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+
+    regs->eax = n1vmcb->rax;
+    regs->esp = n1vmcb->rsp;
+    regs->eip = n1vmcb->rip;
+    regs->eflags = n1vmcb->rflags;
+    n1vmcb->_dr7 = 0; /* disable all breakpoints */
+    n1vmcb->_cpl = 0;
+
+    /* Clear exitintinfo to prevent a fault loop of re-injecting
+     * exceptions forever.
+     */
+    n1vmcb->exitintinfo.bytes = 0;
+
+    /* Cleanbits */
+    n1vmcb->cleanbits.bytes = 0;
+
+    hvm_asid_flush_vcpu(v);
+
+    return 0;
+}
+
+static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm)
+{
+    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    struct vmcb_struct *host_vmcb = arch_svm->vmcb;
+    unsigned long *ns_msrpm_ptr;
+    unsigned int i;
+    enum hvm_copy_result ret;
+    unsigned long *ns_viomap;
+    bool_t ioport_80, ioport_ed;
+
+    ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm;
+
+    ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm,
+                                   ns_vmcb->_msrpm_base_pa, MSRPM_SIZE);
+    if (ret != HVMCOPY_okay) {
+        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret);
+        return 1;
+    }
+
+    /* Check l1 guest io permission map and get a shadow one based on
+     * whether the l1 guest intercepts io ports 0x80 and/or 0xED.
+     */
+    svm->ns_oiomap_pa = svm->ns_iomap_pa;
+    svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa;
+
+    ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT);
+    ASSERT(ns_viomap != NULL);
+    ioport_80 = test_bit(0x80, ns_viomap);
+    ioport_ed = test_bit(0xed, ns_viomap);
+    hvm_unmap_guest_frame(ns_viomap);
+
+    svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed);
+
+    nv->nv_ioport80 = ioport_80;
+    nv->nv_ioportED = ioport_ed;
+
+    /* v->arch.hvm_svm.msrpm has type unsigned long *, hence the
+     * division by BYTES_PER_LONG below.
+     */
+    for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++)
+        svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i];
+
+    host_vmcb->_iopm_base_pa =
+        (uint64_t)virt_to_maddr(svm->ns_iomap);
+    host_vmcb->_msrpm_base_pa =
+        (uint64_t)virt_to_maddr(svm->ns_merged_msrpm);
+
+    return 0;
+}
+
+static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb;
+    bool_t vcleanbits_valid;
+    int rc;
+
+    ns_vmcb = nv->nv_vvmcx;
+    n1vmcb = nv->nv_n1vmcx;
+    n2vmcb = nv->nv_n2vmcx;
+    ASSERT(ns_vmcb != NULL);
+    ASSERT(n1vmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+
+    /* Check if virtual VMCB cleanbits are valid */
+    vcleanbits_valid = 1;
+    if (svm->ns_ovvmcb_pa == VMCX_EADDR)
+        vcleanbits_valid = 0;
+    if (svm->ns_ovvmcb_pa != nv->nv_vvmcxaddr)
+        vcleanbits_valid = 0;
+
+#define vcleanbit_set(_name)   \
+    (vcleanbits_valid && ns_vmcb->cleanbits.fields._name)
+
+    /* Enable l2 guest intercepts */
+    if (!vcleanbit_set(intercepts)) {
+        svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts;
+        svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts;
+        svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts;
+        svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts;
+        svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts;
+    }
+
+    /* We could track the cleanbits of the n1vmcb from
+     * last emulated #VMEXIT to this emulated VMRUN to save the merges
+     * below. Those cleanbits would be tracked in an integer field
+     * in struct nestedsvm.
+     * But this effort is not worth doing because:
+     * - Only the intercepts bit of the n1vmcb can effectively be used here 
+     * - The CPU runs more instructions for the tracking than can be
+     *   saved here.
+     * The overhead comes from (ordered from highest to lowest):
+     * - svm_ctxt_switch_to (CPU context switching)
+     * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching)
+     * - emulated CLGI (clears VINTR intercept)
+     * - host clears VINTR intercept
+     * Test results show that the overhead is high enough that the
+     * tracked intercepts bit of the n1vmcb is practically *always* cleared.
+     */
+
+    n2vmcb->_cr_intercepts =
+        n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts;
+    n2vmcb->_dr_intercepts =
+        n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts;
+    n2vmcb->_exception_intercepts =
+        n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts;
+    n2vmcb->_general1_intercepts =
+        n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts;
+    n2vmcb->_general2_intercepts =
+        n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts;
+
+    /* Nested Pause Filter */
+    if (ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE)
+        n2vmcb->_pause_filter_count =
+            min(n1vmcb->_pause_filter_count, ns_vmcb->_pause_filter_count);
+    else
+        n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count;
+
+    /* TSC offset */
+    n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset;
+
+    /* Nested IO permission bitmaps */
+    rc = nsvm_vmrun_permissionmap(v, vcleanbit_set(iopm));
+    if (rc)
+        return rc;
+
+    /* ASID */
+    hvm_asid_flush_vcpu(v);
+    /* n2vmcb->_guest_asid = ns_vmcb->_guest_asid; */
+
+    /* TLB control */
+    n2vmcb->tlb_control = n1vmcb->tlb_control | ns_vmcb->tlb_control;
+
+    /* Virtual Interrupts */
+    if (!vcleanbit_set(tpr)) {
+        n2vmcb->_vintr = ns_vmcb->_vintr;
+        n2vmcb->_vintr.fields.intr_masking = 1;
+    }
+
+    /* Shadow Mode */
+    n2vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    n2vmcb->exitcode = ns_vmcb->exitcode;
+    n2vmcb->exitinfo1 = ns_vmcb->exitinfo1;
+    n2vmcb->exitinfo2 = ns_vmcb->exitinfo2;
+    n2vmcb->exitintinfo = ns_vmcb->exitintinfo;
+
+    /* Pending Interrupts */
+    n2vmcb->eventinj = ns_vmcb->eventinj;
+
+    /* LBR virtualization */
+    if (!vcleanbit_set(lbr)) {
+        svm->ns_lbr_control = ns_vmcb->lbr_control;
+    }
+    n2vmcb->lbr_control.bytes =
+        n1vmcb->lbr_control.bytes | ns_vmcb->lbr_control.bytes;
+
+    /* NextRIP */
+    n2vmcb->nextrip = ns_vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    if (!vcleanbit_set(seg)) {
+        n2vmcb->es = ns_vmcb->es;
+        n2vmcb->cs = ns_vmcb->cs;
+        n2vmcb->ss = ns_vmcb->ss;
+        n2vmcb->ds = ns_vmcb->ds;
+        /* CPL */
+        n2vmcb->_cpl = ns_vmcb->_cpl;
+    }
+    if (!vcleanbit_set(dt)) {
+        n2vmcb->gdtr = ns_vmcb->gdtr;
+        n2vmcb->idtr = ns_vmcb->idtr;
+    }
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = ns_vmcb->_efer;
+    rc = hvm_set_efer(ns_vmcb->_efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4;
+    rc = hvm_set_cr4(ns_vmcb->_cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0;
+    rc = hvm_set_cr0(ns_vmcb->_cr0);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->_cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->_cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        n2vmcb->_np_enable = 1;
+        /* Keep h_cr3 as it is. */
+        /* When l1 guest does shadow paging
+         * we assume it intercepts page faults.
+         */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->_cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        n2vmcb->_np_enable = 0;
+        n2vmcb->_h_cr3 = 0x0;
+
+        /* TODO: Once shadow-shadow paging is in place come back to here
+         * and set host_vmcb->_cr3 to the shadowed shadow table.
+         */
+    }
+
+    /* DRn */
+    if (!vcleanbit_set(dr)) {
+        n2vmcb->_dr7 = ns_vmcb->_dr7;
+        n2vmcb->_dr6 = ns_vmcb->_dr6;
+    }
+
+    /* RFLAGS */
+    n2vmcb->rflags = ns_vmcb->rflags;
+
+    /* RIP */
+    n2vmcb->rip = ns_vmcb->rip;
+
+    /* RSP */
+    n2vmcb->rsp = ns_vmcb->rsp;
+
+    /* RAX */
+    n2vmcb->rax = ns_vmcb->rax;
+
+    /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* Page tables */
+    n2vmcb->pdpe0 = ns_vmcb->pdpe0;
+    n2vmcb->pdpe1 = ns_vmcb->pdpe1;
+    n2vmcb->pdpe2 = ns_vmcb->pdpe2;
+    n2vmcb->pdpe3 = ns_vmcb->pdpe3;
+
+    /* PAT */
+    if (!vcleanbit_set(np)) {
+        n2vmcb->_g_pat = ns_vmcb->_g_pat;
+    }
+
+    if (!vcleanbit_set(lbr)) {
+        /* Debug Control MSR */
+        n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr;
+
+        /* LBR MSRs */
+        n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip;
+        n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip;
+        n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip;
+        n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip;
+    }
+
+    /* Cleanbits */
+    n2vmcb->cleanbits.bytes = 0;
+
+    rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "virtual vmcb invalid\n");
+        return rc;
+    }
+
+    rc = svm_vmcb_isvalid(__func__, n2vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "n2vmcb invalid\n");
+        return rc;
+    }
+
+    /* Switch guest registers to l2 guest */
+    regs->eax = ns_vmcb->rax;
+    regs->eip = ns_vmcb->rip;
+    regs->esp = ns_vmcb->rsp;
+    regs->eflags = ns_vmcb->rflags;
+
+#undef vcleanbit_set
+    return 0;
+}
+
+static int
+nsvm_vcpu_vmentry(struct vcpu *v, struct cpu_user_regs *regs,
+    unsigned int inst_len)
+{
+    int ret;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb;
+
+    ns_vmcb = nv->nv_vvmcx;
+    ASSERT(ns_vmcb != NULL);
+    ASSERT(nv->nv_n2vmcx != NULL);
+    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);
+
+    /* Save values for later use. Needed for Nested-on-Nested and
+     * Shadow-on-Shadow paging.
+     */
+    svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3;
+    svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3;
+
+    nv->nv_flushp2m = (ns_vmcb->tlb_control
+        || (svm->ns_guest_asid != ns_vmcb->_guest_asid));
+    svm->ns_guest_asid = ns_vmcb->_guest_asid;
+
+    /* nested paging for the guest */
+    svm->ns_hap_enabled = (ns_vmcb->_np_enable) ? 1 : 0;
+
+    /* Remember the V_INTR_MASK in hostflags */
+    svm->ns_hostflags.fields.vintrmask =
+        (ns_vmcb->_vintr.fields.intr_masking) ? 1 : 0;
+
+    /* Save l1 guest state (= host state) */
+    ret = nsvm_vcpu_hostsave(v, inst_len);
+    if (ret) {
+        gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret);
+        return ret;
+    }
+
+    /* switch vmcb to shadow vmcb */
+    v->arch.hvm_svm.vmcb = nv->nv_n2vmcx;
+    v->arch.hvm_svm.vmcb_pa = nv->nv_n2vmcx_pa;
+
+    ret = nsvm_vmcb_prepare4vmrun(v, regs);
+    if (ret) {
+        gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+int
+nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    inst_len = __get_instruction_length(v, INSTR_VMRUN);
+    if (inst_len == 0) {
+        svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN;
+        return -1;
+    }
+
+    nv->nv_vmswitch_in_progress = 1;
+    ASSERT(nv->nv_vvmcx != NULL);
+
+    /* save host state */
+    ret = nsvm_vcpu_vmentry(v, regs, inst_len);
+    if (ret) {
+        gdprintk(XENLOG_ERR,
+            "nsvm_vcpu_vmentry failed, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        nv->nv_vmswitch_in_progress = 0;
+        return 1;
+    }
+
+    /* Switch vcpu to guest mode
+     */
+    nestedhvm_vcpu_enter_guestmode(v);
+    nv->nv_vmswitch_in_progress = 0;
+    return 0;
+}
+
+int
+nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb;
+
+    ns_vmcb = nv->nv_vvmcx;
+
+    if (nv->nv_vmexit_pending) {
+
+        switch (exitcode) {
+        case VMEXIT_INTR:
+            if ( unlikely(ns_vmcb->eventinj.fields.v)
+                && nv->nv_vmentry_pending
+                && hvm_event_needs_reinjection(ns_vmcb->eventinj.fields.type,
+                    ns_vmcb->eventinj.fields.vector) )
+            {
+                ns_vmcb->exitintinfo.bytes = ns_vmcb->eventinj.bytes;
+            }
+            break;
+        case VMEXIT_EXCEPTION_PF:
+            ns_vmcb->_cr2 = ns_vmcb->exitinfo2;
+            /* fall through */
+        case VMEXIT_NPF:
+            /* PF error code */
+            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
+            /* fault address */
+            ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2;
+            break;
+        case VMEXIT_EXCEPTION_NP:
+        case VMEXIT_EXCEPTION_SS:
+        case VMEXIT_EXCEPTION_GP:
+        case VMEXIT_EXCEPTION_15:
+        case VMEXIT_EXCEPTION_MF:
+        case VMEXIT_EXCEPTION_AC:
+            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
+            break;
+        default:
+            break;
+        }
+    }
+
+    ns_vmcb->exitcode = exitcode;
+    ns_vmcb->eventinj.bytes = 0;
+    return 0;
+}
+
+int
+nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr,
+                      int errcode, unsigned long cr2)
+{
+    ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL);
+
+    nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trapnr, errcode, cr2);
+    return NESTEDHVM_VMEXIT_DONE;
+}
+
+uint64_t nsvm_vcpu_guestcr3(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_vmcb_guestcr3;
+}
+
+uint64_t nsvm_vcpu_hostcr3(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_vmcb_hostcr3;
+}
+
+uint32_t nsvm_vcpu_asid(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_guest_asid;
+}
+
+static int
+nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap,
+    uint32_t msr, bool_t write)
+{
+    bool_t enabled;
+    unsigned long *msr_bit;
+
+    msr_bit = svm_msrbit(msr_bitmap, msr);
+
+    if (msr_bit == NULL)
+        /* MSR not in the permission map: Let the guest handle it. */
+        return NESTEDHVM_VMEXIT_INJECT;
+
+    BUG_ON(msr_bit == NULL);
+    msr &= 0x1fff;
+
+    if (write)
+        /* write access */
+        enabled = test_bit(msr * 2 + 1, msr_bit);
+    else
+        /* read access */
+        enabled = test_bit(msr * 2, msr_bit);
+
+    if (!enabled)
+        return NESTEDHVM_VMEXIT_HOST;
+
+    return NESTEDHVM_VMEXIT_INJECT;
+}
+
+static int
+nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1)
+{
+    unsigned long iopm_gfn = iopm_pa >> PAGE_SHIFT;
+    unsigned long *io_bitmap = NULL;
+    ioio_info_t ioinfo;
+    uint16_t port;
+    bool_t enabled;
+
+    ioinfo.bytes = exitinfo1;
+    port = ioinfo.fields.port;
+
+    switch (port) {
+    case 0 ... 32767: /* first 4KB page */
+        io_bitmap = hvm_map_guest_frame_ro(iopm_gfn);
+        break;
+    case 32768 ... 65535: /* second 4KB page */
+        port -= 32768;
+        io_bitmap = hvm_map_guest_frame_ro(iopm_gfn+1);
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    if (io_bitmap == NULL) {
+        gdprintk(XENLOG_ERR,
+            "IOIO intercept: mapping of permission map failed\n");
+        return NESTEDHVM_VMEXIT_ERROR;
+    }
+
+    enabled = test_bit(port, io_bitmap);
+    hvm_unmap_guest_frame(io_bitmap);
+    if (!enabled)
+        return NESTEDHVM_VMEXIT_HOST;
+
+    return NESTEDHVM_VMEXIT_INJECT;
+}
+
+int
+nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
+    struct cpu_user_regs *regs, uint64_t exitcode)
+{
+    uint64_t exit_bits;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    enum nestedhvm_vmexits vmexits;
+
+    switch (exitcode) {
+    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
+    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ);
+        if (svm->ns_cr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
+    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ);
+        if (svm->ns_dr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF:
+        exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE);
+        if (svm->ns_exception_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_INTR ... VMEXIT_SHUTDOWN:
+        exit_bits = 1ULL << (exitcode - VMEXIT_INTR);
+        if (svm->ns_general1_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_VMRUN ... VMEXIT_XSETBV:
+        exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN);
+        if (svm->ns_general2_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_NPF:
+    case VMEXIT_INVALID:
+        /* Always intercepted */
+        break;
+
+    default:
+        gdprintk(XENLOG_ERR, "Illegal exitcode 0x%"PRIx64"\n", exitcode);
+        BUG();
+        break;
+    }
+
+    /* Special cases: Do more detailed checks */
+    switch (exitcode) {
+    case VMEXIT_MSR:
+        ASSERT(regs != NULL);
+        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
+        ASSERT(nv->nv_vvmcx != NULL);
+        ns_vmcb = nv->nv_vvmcx;
+        vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm,
+            regs->ecx, ns_vmcb->exitinfo1 != 0);
+        if (vmexits == NESTEDHVM_VMEXIT_HOST)
+            return 0;
+        break;
+    case VMEXIT_IOIO:
+        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
+        ASSERT(nv->nv_vvmcx != NULL);
+        ns_vmcb = nv->nv_vvmcx;
+        vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa,
+            ns_vmcb->exitinfo1);
+        if (vmexits == NESTEDHVM_VMEXIT_HOST)
+            return 0;
+        break;
+    }
+
+    return 1;
+}
+
+int
+nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr)
+{
+    return nsvm_vmcb_guest_intercepts_exitcode(v,
+        guest_cpu_user_regs(), VMEXIT_EXCEPTION_DE + trapnr);
+}
+
+static int
+nsvm_vmcb_prepare4vmexit(struct vcpu *v)
+{
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+    struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;
+
+    svm_vmsave(nv->nv_n1vmcx);
+
+    /* Cache guest physical address of virtual vmcb
+     * for VMCB Cleanbit emulation.
+     */
+    svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr;
+
+    /* Intercepts - keep them as they are */
+
+    /* Pausefilter - keep it as is */
+
+    /* Nested IO permission bitmap */
+    /* Just keep the iopm_base_pa and msrpm_base_pa values.
+     * The guest must not see the virtualized values.
+     */
+
+    /* TSC offset */
+    /* Keep it. It's maintained by the l1 guest. */
+
+    /* ASID */
+    /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */
+
+    /* TLB control */
+    ns_vmcb->tlb_control = 0;
+
+    /* Virtual Interrupts */
+    ns_vmcb->_vintr = n2vmcb->_vintr;
+    if (!(svm->ns_hostflags.fields.vintrmask))
+        ns_vmcb->_vintr.fields.intr_masking = 0;
+
+    /* Shadow mode */
+    ns_vmcb->interrupt_shadow = n2vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    ns_vmcb->exitcode = n2vmcb->exitcode;
+    ns_vmcb->exitinfo1 = n2vmcb->exitinfo1;
+    ns_vmcb->exitinfo2 = n2vmcb->exitinfo2;
+    ns_vmcb->exitintinfo = n2vmcb->exitintinfo;
+
+    /* Interrupts */
+    /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have
+     * to make sure that we do not lose injected events. So check eventinj
+     * here and copy it to exitintinfo if it is valid.
+     * exitintinfo and eventinj can't both be valid because the case below
+     * only happens on a VMRUN instruction intercept which has no valid
+     * exitintinfo set.
+     */
+    if ( unlikely(n2vmcb->eventinj.fields.v) &&
+         hvm_event_needs_reinjection(n2vmcb->eventinj.fields.type,
+                                     n2vmcb->eventinj.fields.vector) )
+    {
+        ns_vmcb->exitintinfo = n2vmcb->eventinj;
+    }
+
+    ns_vmcb->eventinj.bytes = 0;
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        ns_vmcb->_np_enable = n2vmcb->_np_enable;
+        ns_vmcb->_cr3 = n2vmcb->_cr3;
+        /* The vmcb->h_cr3 is the shadowed h_cr3. The original
+         * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3,
+         * hence we keep the ns_vmcb->h_cr3 value. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        ns_vmcb->_np_enable = 0;
+        /* Throw h_cr3 away; the guest must not be allowed to set it,
+         * otherwise it could break out (security hole!). */
+        ns_vmcb->_h_cr3 = 0x0;
+        /* Stop intercepting #PF (already done above
+         * by restoring cached intercepts). */
+        ns_vmcb->_cr3 = n2vmcb->_cr3;
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        ns_vmcb->_np_enable = 0;
+        ns_vmcb->_h_cr3 = 0x0;
+        /* The vmcb->_cr3 is the shadowed cr3. The original
+         * unshadowed guest cr3 is kept in ns_vmcb->_cr3,
+         * hence we keep the ns_vmcb->_cr3 value. */
+    }
+
+    /* LBR virtualization - keep lbr control as is */
+
+    /* NextRIP */
+    ns_vmcb->nextrip = n2vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    ns_vmcb->es = n2vmcb->es;
+    ns_vmcb->cs = n2vmcb->cs;
+    ns_vmcb->ss = n2vmcb->ss;
+    ns_vmcb->ds = n2vmcb->ds;
+    ns_vmcb->gdtr = n2vmcb->gdtr;
+    ns_vmcb->idtr = n2vmcb->idtr;
+
+    /* CPL */
+    ns_vmcb->_cpl = n2vmcb->_cpl;
+
+    /* EFER */
+    ns_vmcb->_efer = n2vmcb->_efer;
+
+    /* CRn */
+    ns_vmcb->_cr4 = n2vmcb->_cr4;
+    ns_vmcb->_cr0 = n2vmcb->_cr0;
+
+    /* DRn */
+    ns_vmcb->_dr7 = n2vmcb->_dr7;
+    ns_vmcb->_dr6 = n2vmcb->_dr6;
+
+    /* RFLAGS */
+    ns_vmcb->rflags = n2vmcb->rflags;
+
+    /* RIP */
+    ns_vmcb->rip = n2vmcb->rip;
+
+    /* RSP */
+    ns_vmcb->rsp = n2vmcb->rsp;
+
+    /* RAX */
+    ns_vmcb->rax = n2vmcb->rax;
+
+    /* Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* CR2 */
+    ns_vmcb->_cr2 = n2vmcb->_cr2;
+
+    /* Page tables */
+    ns_vmcb->pdpe0 = n2vmcb->pdpe0;
+    ns_vmcb->pdpe1 = n2vmcb->pdpe1;
+    ns_vmcb->pdpe2 = n2vmcb->pdpe2;
+    ns_vmcb->pdpe3 = n2vmcb->pdpe3;
+
+    /* PAT */
+    ns_vmcb->_g_pat = n2vmcb->_g_pat;
+
+    /* Debug Control MSR */
+    ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr;
+
+    /* LBR MSRs */
+    ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip;
+    ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip;
+    ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip;
+    ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip;
+
+    return 0;
+}
+
+bool_t
+nsvm_vmcb_hap_enabled(struct vcpu *v)
+{
+    return vcpu_nestedsvm(v).ns_hap_enabled;
+}
+
+/* MSR handling */
+int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+    int ret = 1;
+
+    *msr_content = 0;
+
+    switch (msr) {
+    case MSR_K8_VM_CR:
+        break;
+    case MSR_K8_VM_HSAVE_PA:
+        *msr_content = svm->ns_msr_hsavepa;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+
+    return ret;
+}
+
+int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content)
+{
+    int ret = 1;
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    switch (msr) {
+    case MSR_K8_VM_CR:
+        /* ignore write. handle all bits as read-only. */
+        break;
+    case MSR_K8_VM_HSAVE_PA:
+        if (!nestedsvm_vmcb_isvalid(v, msr_content)) {
+            gdprintk(XENLOG_ERR,
+                "MSR_K8_VM_HSAVE_PA value invalid 0x%"PRIx64"\n", msr_content);
+            ret = -1; /* inject #GP */
+            break;
+        }
+        svm->ns_msr_hsavepa = msr_content;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+
+    return ret;
+}
+
+/* VMEXIT emulation */
+void
+nestedsvm_vmexit_defer(struct vcpu *v,
+    uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2)
+{
+    struct nestedsvm *svm = &vcpu_nestedsvm(v);
+
+    svm->ns_vmexit.exitcode = exitcode;
+    svm->ns_vmexit.exitinfo1 = exitinfo1;
+    svm->ns_vmexit.exitinfo2 = exitinfo2;
+    vcpu_nestedhvm(v).nv_vmexit_pending = 1;
+}
+
+enum nestedhvm_vmexits
+nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    bool_t is_intercepted;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    ASSERT(nv->nv_vmexit_pending == 0);
+    is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode);
+
+    switch (exitcode) {
+    case VMEXIT_INVALID:
+        if (is_intercepted)
+            return NESTEDHVM_VMEXIT_INJECT;
+        return NESTEDHVM_VMEXIT_HOST;
+
+    case VMEXIT_INTR:
+    case VMEXIT_NMI:
+        return NESTEDHVM_VMEXIT_HOST;
+    case VMEXIT_EXCEPTION_NM:
+        /* Host must handle lazy fpu context switching first.
+         * Then inject the VMEXIT if L1 guest intercepts this.
+         */
+        return NESTEDHVM_VMEXIT_HOST;
+
+    case VMEXIT_NPF:
+        if (nestedhvm_paging_mode_hap(v)) {
+            if (!is_intercepted)
+                return NESTEDHVM_VMEXIT_FATALERROR;
+            /* host nested paging + guest nested paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        if (paging_mode_hap(v->domain)) {
+            if (is_intercepted)
+                return NESTEDHVM_VMEXIT_FATALERROR;
+            /* host nested paging + guest shadow paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        /* host shadow paging + guest shadow paging */
+        /* Can this happen? */
+        BUG();
+        return NESTEDHVM_VMEXIT_FATALERROR;
+    case VMEXIT_EXCEPTION_PF:
+        if (nestedhvm_paging_mode_hap(v)) {
+            /* host nested paging + guest nested paging */
+            if (!is_intercepted)
+                /* l1 guest intercepts #PF unnecessarily */
+                return NESTEDHVM_VMEXIT_HOST;
+            /* l2 guest intercepts #PF unnecessarily */
+            return NESTEDHVM_VMEXIT_INJECT;
+        }
+        if (!paging_mode_hap(v->domain)) {
+            /* host shadow paging + guest shadow paging */
+            return NESTEDHVM_VMEXIT_HOST;
+        }
+        /* host nested paging + guest shadow paging */
+        return NESTEDHVM_VMEXIT_INJECT;
+    case VMEXIT_VMMCALL:
+        /* Always let the guest handle VMMCALL/VMCALL */
+        return NESTEDHVM_VMEXIT_INJECT;
+    default:
+        break;
+    }
+
+    if (is_intercepted)
+        return NESTEDHVM_VMEXIT_INJECT;
+    return NESTEDHVM_VMEXIT_HOST;
+}
+
+enum nestedhvm_vmexits
+nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    int rc;
+    enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE;
+
+    ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress);
+    ASSERT(nestedhvm_vcpu_in_guestmode(v));
+
+    rc = nsvm_vmcb_prepare4vmexit(v);
+    if (rc)
+        ret = NESTEDHVM_VMEXIT_ERROR;
+
+    rc = nhvm_vcpu_hostrestore(v, regs);
+    if (rc)
+        ret = NESTEDHVM_VMEXIT_FATALERROR;
+
+    nestedhvm_vcpu_exit_guestmode(v);
+    return ret;
+}
+
+/* The exitcode is in native SVM/VMX format. The forced exitcode
+ * is in generic format.
+ */
+static enum nestedhvm_vmexits
+nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode)
+{
+    int rc;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    nv->nv_vmswitch_in_progress = 1;
+
+    ASSERT(nv->nv_vvmcx != NULL);
+
+    /* On special intercepts that the host has to handle,
+     * the vcpu is still in guest mode here.
+     */
+    if (nestedhvm_vcpu_in_guestmode(v)) {
+        enum nestedhvm_vmexits ret;
+
+        ret = nestedsvm_vmexit_n2n1(v, regs);
+        switch (ret) {
+        case NESTEDHVM_VMEXIT_FATALERROR:
+            gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n");
+            return ret;
+        case NESTEDHVM_VMEXIT_HOST:
+            BUG();
+            return ret;
+        case NESTEDHVM_VMEXIT_ERROR:
+            exitcode = VMEXIT_INVALID;
+            break;
+        default:
+            ASSERT(!nestedhvm_vcpu_in_guestmode(v));
+            break;
+        }
+
+        /* host state has been restored */
+    }
+
+    ASSERT(!nestedhvm_vcpu_in_guestmode(v));
+
+    /* Prepare for running the l1 guest. Make the actual
+     * modifications to the virtual VMCB/VMCS.
+     */
+    rc = nhvm_vcpu_vmexit(v, regs, exitcode);
+
+    nv->nv_vmswitch_in_progress = 0;
+
+    if (rc)
+        return NESTEDHVM_VMEXIT_FATALERROR;
+
+    return NESTEDHVM_VMEXIT_DONE;
+}
+
+/* VCPU switch */
+asmlinkage void nsvm_vcpu_switch(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct nestedvcpu *nv;
+    struct nestedsvm *svm;
+
+    if (!nestedhvm_enabled(v->domain))
+        return;
+
+    nv = &vcpu_nestedhvm(v);
+    svm = &vcpu_nestedsvm(v);
+    ASSERT(v->arch.hvm_svm.vmcb != NULL);
+    ASSERT(nv->nv_n1vmcx != NULL);
+    ASSERT(nv->nv_n2vmcx != NULL);
+    ASSERT(nv->nv_n1vmcx_pa != VMCX_EADDR);
+    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);
+
+    if (nv->nv_vmexit_pending) {
+ vmexit:
+        nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode);
+        nv->nv_vmexit_pending = 0;
+        nv->nv_vmentry_pending = 0;
+        return;
+    }
+    if (nv->nv_vmentry_pending) {
+        int ret;
+        ASSERT(!nv->nv_vmexit_pending);
+        ret = nsvm_vcpu_vmrun(v, regs);
+        if (ret < 0)
+            goto vmexit;
+        nv->nv_vmentry_pending = 0;
+        return;
+    }
+}
+
+
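
For context, the file above emulates what an l1 hypervisor does natively on
SVM hardware: write the host-save area address to MSR_K8_VM_HSAVE_PA, build
a VMCB for its l2 guest, and issue VMLOAD/VMRUN/VMSAVE with rAX holding the
VMCB's physical address. A minimal guest-side sketch of that sequence
(illustrative only, runs at CPL0 with EFER.SVME set; not taken from the
patch):

    #include <stdint.h>

    #define MSR_VM_HSAVE_PA 0xc0010117   /* MSR_K8_VM_HSAVE_PA */

    static inline void wrmsr64(uint32_t msr, uint64_t val)
    {
        asm volatile ( "wrmsr" : : "c" (msr), "a" ((uint32_t)val),
                       "d" ((uint32_t)(val >> 32)) );
    }

    /* Run an l2 guest once.  Each instruction below (encoded as raw bytes,
     * matching the MAKE_INSTR() opcodes in emulate.c) is intercepted and
     * emulated by the nsvm_* / svm_vmexit_do_* code in this patch. */
    static inline void l1_run_l2_once(uint64_t hsave_pa, uint64_t vmcb_pa)
    {
        wrmsr64(MSR_VM_HSAVE_PA, hsave_pa);
        asm volatile ( ".byte 0x0f,0x01,0xda \n\t"   /* VMLOAD */
                       ".byte 0x0f,0x01,0xd8 \n\t"   /* VMRUN  */
                       ".byte 0x0f,0x01,0xdb \n\t"   /* VMSAVE */
                       : : "a" (vmcb_pa) : "memory" );
    }
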
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Mar 09 12:36:05 2011 +0100
@@ -49,6 +49,9 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/intr.h>
+#include <asm/hvm/svm/svmdebug.h>
+#include <asm/hvm/svm/nestedsvm.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/x86_emulate.h>
 #include <public/sched.h>
 #include <asm/hvm/vpt.h>
@@ -106,6 +109,44 @@
     write_efer(read_efer() & ~EFER_SVME);
 }
 
+unsigned long *
+svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
+{
+    unsigned long *msr_bit = NULL;
+
+    /*
+     * See AMD64 Programmer's Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
+     */
+    if ( msr <= 0x1fff )
+        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
+    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
+        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
+    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
+        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
+
+    return msr_bit;
+}
+
+void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
+{
+    unsigned long *msr_bit;
+
+    msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
+    BUG_ON(msr_bit == NULL);
+    msr &= 0x1fff;
+
+    if ( enable )
+    {
+        __set_bit(msr * 2, msr_bit);
+        __set_bit(msr * 2 + 1, msr_bit);
+    }
+    else
+    {
+        __clear_bit(msr * 2, msr_bit);
+        __clear_bit(msr * 2 + 1, msr_bit);
+    }
+}
+
 static void svm_save_dr(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -296,7 +337,7 @@
 {
     svm_load_cpu_state(v, ctxt);
     if (svm_vmcb_restore(v, ctxt)) {
-        printk("svm_vmcb restore failed!\n");
+        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
         domain_crash(v->domain);
         return -EINVAL;
     }
@@ -588,7 +629,24 @@
 static void svm_set_tsc_offset(struct vcpu *v, u64 offset)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    vmcb_set_tsc_offset(vmcb, offset);
+    struct vmcb_struct *n1vmcb, *n2vmcb;
+    uint64_t n2_tsc_offset = 0;
+
+    if ( !nestedhvm_enabled(v->domain) ) {
+        vmcb_set_tsc_offset(vmcb, offset);
+        return;
+    }
+
+    n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
+    n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx;
+
+    if ( nestedhvm_vcpu_in_guestmode(v) ) {
+        n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) -
+            vmcb_get_tsc_offset(n1vmcb);
+        vmcb_set_tsc_offset(n1vmcb, offset);
+    }
+
+    vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset);
 }
 
 static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
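
With nesting, the TSC offset seen by the l2 guest stacks: it is the offset
Xen applies to the l1 guest plus the offset the l1 guest applies to its l2
guest. The hunk above preserves that l1-to-l2 delta whenever Xen updates the
l1 offset. A small sketch of the arithmetic (hypothetical helper, same
computation as svm_set_tsc_offset() above):

    /* guest_tsc(l1) = host_tsc + n1_offset
     * guest_tsc(l2) = host_tsc + n1_offset + l1_to_l2_delta */
    static uint64_t new_n2_offset(uint64_t new_n1_offset,
                                  uint64_t old_n1_offset,
                                  uint64_t old_n2_offset)
    {
        uint64_t l1_to_l2_delta = old_n2_offset - old_n1_offset;

        return new_n1_offset + l1_to_l2_delta;
    }
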
@@ -683,9 +741,13 @@
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     bool_t debug_state = v->domain->debugger_attached;
-    vintr_t intr;
+    bool_t vcpu_guestmode = 0;
 
-    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
+    if ( !vcpu_guestmode &&
+        unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
     {
         uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
         uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
@@ -703,13 +765,19 @@
         hvm_asid_flush_vcpu(v);
     }
 
-    /* Reflect the vlapic's TPR in the hardware vtpr */
-    intr = vmcb_get_vintr(vmcb);
-    intr.fields.tpr =
-        (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
-    vmcb_set_vintr(vmcb, intr);
+    if ( !vcpu_guestmode )
+    {
+        vintr_t intr;
+
+        /* Reflect the vlapic's TPR in the hardware vtpr */
+        intr = vmcb_get_vintr(vmcb);
+        intr.fields.tpr =
+            (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
+        vmcb_set_vintr(vmcb, intr);
+    }
 
     hvm_do_resume(v);
+
     reset_stack_and_jump(svm_asm_do_resume);
 }
 
@@ -961,8 +1029,8 @@
         struct {
             uint64_t gpa;
             uint64_t mfn;
-            u32 qualification;
-            u32 p2mt;
+            uint32_t qualification;
+            uint32_t p2mt;
         } _d;
 
         _d.gpa = gpa;
@@ -984,12 +1052,21 @@
 
 static void svm_fpu_dirty_intercept(void)
 {
-    struct vcpu *curr = current;
-    struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
-    svm_fpu_enter(curr);
+    svm_fpu_enter(v);
 
-    if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) {
+       /* Check if guest must make FPU ready for the nested guest */
+       if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS )
+           hvm_inject_exception(TRAP_no_device, HVM_DELIVER_NO_ERROR_CODE, 0);
+       else
+           vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
+       return;
+    }
+
+    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
         vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
 }
 
@@ -1003,11 +1080,14 @@
 
     hvm_cpuid(input, eax, ebx, ecx, edx);
 
-    if ( input == 0x80000001 )
-    {
+    switch (input) {
+    case 0x80000001:
         /* Fix up VLAPIC details. */
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
             __clear_bit(X86_FEATURE_APIC & 31, edx);
+        break;
+    default:
+        break;
     }
 
     HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
@@ -1043,6 +1123,7 @@
 
 static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -1076,9 +1157,6 @@
         *msr_content = 0;
         break;
 
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_DEBUGCTLMSR:
         *msr_content = vmcb_get_debugctlmsr(vmcb);
         break;
@@ -1111,6 +1189,11 @@
         break;
 
     default:
+        ret = nsvm_rdmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
 
         if ( rdmsr_viridian_regs(msr, msr_content) ||
              rdmsr_hypervisor_regs(msr, msr_content) )
@@ -1133,6 +1216,7 @@
 
 static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     int sync = 0;
@@ -1153,9 +1237,6 @@
 
     switch ( msr )
     {
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_SYSENTER_CS:
         vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
         break;
@@ -1215,6 +1296,12 @@
         break;
 
     default:
+        ret = nsvm_wrmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
+
         if ( wrmsr_viridian_regs(msr, msr_content) )
             break;
 
@@ -1298,6 +1385,96 @@
     do_sched_op_compat(SCHEDOP_yield, 0);
 }
 
+static void
+svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
+                    struct vcpu *v, uint64_t vmcbaddr)
+{
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #UD\n");
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    vcpu_nestedhvm(v).nv_vmentry_pending = 1;
+    return;
+}
+
+static void
+svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
+                     struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
+        return;
+
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMLOAD: mapping vmcb failed, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    svm_vmload(nv->nv_vvmcx);
+    /* State in L1 VMCB is stale now */
+    v->arch.hvm_svm.vmcb_in_sync = 0;
+
+    __update_guest_eip(regs, inst_len);
+    return;
+
+ inject:
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+    return;
+}
+
+static void
+svm_vmexit_do_vmsave(struct vmcb_struct *vmcb,
+                     struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
+        return;
+
+    if (!nestedhvm_enabled(v->domain)) {
+        gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    if (!nestedsvm_vmcb_map(v, vmcbaddr)) {
+        gdprintk(XENLOG_ERR, "VMSAVE: mapping vmcb failed, injecting #UD\n");
+        ret = TRAP_invalid_op;
+        goto inject;
+    }
+
+    svm_vmsave(nv->nv_vvmcx);
+
+    __update_guest_eip(regs, inst_len);
+    return;
+
+ inject:
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+    return;
+}
+
 static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs)
 {
     struct hvm_emulate_ctxt ctxt;
@@ -1428,22 +1605,38 @@
     .msr_read_intercept   = svm_msr_read_intercept,
     .msr_write_intercept  = svm_msr_write_intercept,
     .invlpg_intercept     = svm_invlpg_intercept,
-    .set_rdtsc_exiting    = svm_set_rdtsc_exiting
+    .set_rdtsc_exiting    = svm_set_rdtsc_exiting,
+
+    .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
+    .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
+    .nhvm_vcpu_reset = nsvm_vcpu_reset,
+    .nhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore,
+    .nhvm_vcpu_vmexit = nsvm_vcpu_vmexit_inject,
+    .nhvm_vcpu_vmexit_trap = nsvm_vcpu_vmexit_trap,
+    .nhvm_vcpu_guestcr3 = nsvm_vcpu_guestcr3,
+    .nhvm_vcpu_hostcr3 = nsvm_vcpu_hostcr3,
+    .nhvm_vcpu_asid = nsvm_vcpu_asid,
+    .nhvm_vmcx_guest_intercepts_trap = nsvm_vmcb_guest_intercepts_trap,
+    .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
 };
 
 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
 {
-    unsigned int exit_reason;
+    uint64_t exit_reason;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     eventinj_t eventinj;
     int inst_len, rc;
     vintr_t intr;
+    bool_t vcpu_guestmode = 0;
 
     if ( paging_mode_hap(v->domain) )
         v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
             vmcb_get_cr3(vmcb);
 
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
     /*
      * Before doing anything else, we need to sync up the VLAPIC's TPR with
      * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
@@ -1451,13 +1644,73 @@
      * NB. We need to preserve the low bits of the TPR to make checked builds
      * of Windows work, even though they don't actually do anything.
      */
-    intr = vmcb_get_vintr(vmcb);
-    vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
+    if ( !vcpu_guestmode ) {
+        intr = vmcb_get_vintr(vmcb);
+        vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
                    ((intr.fields.tpr & 0x0F) << 4) |
                    (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F));
+    }
 
     exit_reason = vmcb->exitcode;
 
+    if ( vcpu_guestmode ) {
+        enum nestedhvm_vmexits nsret;
+        struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+        struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
+        uint64_t exitinfo1, exitinfo2;
+
+        /* Write real exitinfo1 back into virtual vmcb.
+         * nestedsvm_check_intercepts() expects to have the correct
+         * exitinfo1 value there.
+         */
+        exitinfo1 = ns_vmcb->exitinfo1;
+        ns_vmcb->exitinfo1 = vmcb->exitinfo1;
+        nsret = nestedsvm_check_intercepts(v, regs, exit_reason);
+        switch (nsret) {
+        case NESTEDHVM_VMEXIT_CONTINUE:
+            BUG();
+            break;
+        case NESTEDHVM_VMEXIT_HOST:
+            break;
+        case NESTEDHVM_VMEXIT_INJECT:
+            /* Switch vcpu from l2 to l1 guest. We must perform
+             * the switch here to have svm_do_resume() working
+             * as intended.
+             */
+            exitinfo1 = vmcb->exitinfo1;
+            exitinfo2 = vmcb->exitinfo2;
+            nv->nv_vmswitch_in_progress = 1;
+            nsret = nestedsvm_vmexit_n2n1(v, regs);
+            nv->nv_vmswitch_in_progress = 0;
+            switch (nsret) {
+            case NESTEDHVM_VMEXIT_DONE:
+                /* defer VMEXIT injection */
+                nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2);
+                goto out;
+            case NESTEDHVM_VMEXIT_FATALERROR:
+                gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n");
+                goto exit_and_crash;
+
+            default:
+                BUG();
+            case NESTEDHVM_VMEXIT_ERROR:
+                break;
+            }
+        case NESTEDHVM_VMEXIT_ERROR:
+            gdprintk(XENLOG_ERR,
+                "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n");
+            goto out;
+        case NESTEDHVM_VMEXIT_FATALERROR:
+            gdprintk(XENLOG_ERR,
+                "unexpected nestedsvm_check_intercepts() error\n");
+            goto exit_and_crash;
+        default:
+            gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n",
+                nsret);
+            goto exit_and_crash;
+        }
+    }
+
     if ( hvm_long_mode_enabled(v) )
         HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                     (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
@@ -1469,7 +1722,7 @@
 
     if ( unlikely(exit_reason == VMEXIT_INVALID) )
     {
-        svm_dump_vmcb(__func__, vmcb);
+        svm_vmcb_dump(__func__, vmcb);
         goto exit_and_crash;
     }
 
@@ -1630,6 +1883,7 @@
     case VMEXIT_VMMCALL:
         if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
             break;
+        BUG_ON(vcpu_guestmode);
         HVMTRACE_1D(VMMCALL, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
@@ -1662,9 +1916,18 @@
 
     case VMEXIT_MONITOR:
     case VMEXIT_MWAIT:
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        break;
+
     case VMEXIT_VMRUN:
+        svm_vmexit_do_vmrun(regs, v, regs->eax);
+        break;
     case VMEXIT_VMLOAD:
+        svm_vmexit_do_vmload(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_VMSAVE:
+        svm_vmexit_do_vmsave(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_STGI:
     case VMEXIT_CLGI:
     case VMEXIT_SKINIT:
@@ -1708,7 +1971,7 @@
 
     default:
     exit_and_crash:
-        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
+        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%"PRIx64", "
                  "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
                  exit_reason, 
                  (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
@@ -1716,6 +1979,11 @@
         break;
     }
 
+  out:
+    if ( vcpu_guestmode )
+        /* Don't clobber TPR of the nested guest. */
+        return;
+
     /* The exit may have updated the TPR: reflect this in the hardware vtpr */
     intr = vmcb_get_vintr(vmcb);
     intr.fields.tpr =
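
The svm_msrbit() helper added above encodes the MSR permission map layout
from the AMD manual: three 2KB vectors covering MSRs 0-0x1fff,
0xc0000000-0xc0001fff and 0xc0010000-0xc0011fff, with two bits per MSR (the
even bit intercepts reads, the odd bit intercepts writes). A standalone
sketch of the same lookup over a plain byte array, rather than Xen's bitops
(illustrative only):

    #include <stdint.h>

    /* Returns 1 if a read of 'msr' would be intercepted according to the
     * 8KB MSR permission map 'msrpm'; unmapped MSRs always intercept. */
    static int msr_read_intercepted(const uint8_t *msrpm, uint32_t msr)
    {
        unsigned int offset;

        if ( msr <= 0x1fff )
            offset = 0x0000;
        else if ( msr >= 0xc0000000 && msr <= 0xc0001fff )
            offset = 0x0800;
        else if ( msr >= 0xc0010000 && msr <= 0xc0011fff )
            offset = 0x1000;
        else
            return 1;

        msr &= 0x1fff;
        /* Two bits per MSR: bit 2*msr = read, bit 2*msr+1 = write. */
        return (msrpm[offset + msr / 4] >> ((msr & 3) * 2)) & 1;
    }
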
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/svmdebug.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/svm/svmdebug.c   Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,191 @@
+/*
+ * svmdebug.c: debug functions
+ * Copyright (c) 2011, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/msr-index.h>
+#include <asm/hvm/svm/svmdebug.h>
+
+static void svm_dump_sel(const char *name, svm_segment_register_t *s)
+{
+    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
+           name, s->sel, s->attr.bytes, s->limit,
+           (unsigned long long)s->base);
+}
+
+/* This function can directly access fields which are covered by clean bits. */
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb)
+{
+    printk("Dumping guest's current state at %s...\n", from);
+    printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n",
+           (int) sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb);
+
+    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
+           "exception_intercepts = 0x%08x\n", 
+           vmcb->_cr_intercepts, vmcb->_dr_intercepts, 
+           vmcb->_exception_intercepts);
+    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
+           vmcb->_general1_intercepts, vmcb->_general2_intercepts);
+    printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = "
+            "0x%016llx\n", 
+           (unsigned long long)vmcb->_iopm_base_pa,
+           (unsigned long long)vmcb->_msrpm_base_pa,
+           (unsigned long long)vmcb->_tsc_offset);
+    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
+            "0x%016llx\n", vmcb->tlb_control,
+           (unsigned long long)vmcb->_vintr.bytes,
+           (unsigned long long)vmcb->interrupt_shadow);
+    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
+           (unsigned long long)vmcb->exitcode,
+           (unsigned long long)vmcb->exitintinfo.bytes);
+    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
+           (unsigned long long)vmcb->exitinfo1,
+           (unsigned long long)vmcb->exitinfo2);
+    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
+           (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid);
+    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
+           vmcb->_cpl, (unsigned long long)vmcb->_efer,
+           (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar);
+    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
+           (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2);
+    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
+           (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4);
+    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
+           (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip);
+    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
+           (unsigned long long)vmcb->rax, (unsigned long long)vmcb->rflags);
+    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
+           (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7);
+    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
+           (unsigned long long)vmcb->cstar, 
+           (unsigned long long)vmcb->sfmask);
+    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
+           (unsigned long long)vmcb->kerngsbase,
+           (unsigned long long)vmcb->_g_pat);
+    printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n",
+           (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes);
+
+    /* print out all the selectors */
+    svm_dump_sel("CS", &vmcb->cs);
+    svm_dump_sel("DS", &vmcb->ds);
+    svm_dump_sel("SS", &vmcb->ss);
+    svm_dump_sel("ES", &vmcb->es);
+    svm_dump_sel("FS", &vmcb->fs);
+    svm_dump_sel("GS", &vmcb->gs);
+    svm_dump_sel("GDTR", &vmcb->gdtr);
+    svm_dump_sel("LDTR", &vmcb->ldtr);
+    svm_dump_sel("IDTR", &vmcb->idtr);
+    svm_dump_sel("TR", &vmcb->tr);
+}
+
+bool_t
+svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                 bool_t verbose)
+{
+    bool_t ret = 0; /* ok */
+
+#define PRINTF(...) \
+    if (verbose) { ret = 1; printk("%s: ", from); printk(__VA_ARGS__); \
+    } else return 1;
+
+    if ((vmcb->_efer & EFER_SVME) == 0) {
+        PRINTF("EFER: SVME bit not set (0x%"PRIx64")\n", vmcb->_efer);
+    }
+
+    if ((vmcb->_cr0 & X86_CR0_CD) == 0 && (vmcb->_cr0 & X86_CR0_NW) != 0) {
+        PRINTF("CR0: CD bit is zero and NW bit set (0x%"PRIx64")\n",
+                vmcb->_cr0);
+    }
+
+    if ((vmcb->_cr0 >> 32U) != 0) {
+        PRINTF("CR0: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_cr0);
+    }
+
+    if ((vmcb->_cr3 & 0x7) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3);
+    }
+    if ((vmcb->_efer & EFER_LMA) && (vmcb->_cr3 & 0xfe) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3);
+    }
+
+    if ((vmcb->_cr4 >> 11U) != 0) {
+        PRINTF("CR4: bits [63:11] are not zero (0x%"PRIx64")\n",
+                vmcb->_cr4);
+    }
+
+    if ((vmcb->_dr6 >> 32U) != 0) {
+        PRINTF("DR6: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_dr6);
+    }
+
+    if ((vmcb->_dr7 >> 32U) != 0) {
+        PRINTF("DR7: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->_dr7);
+    }
+
+    if ((vmcb->_efer >> 15U) != 0) {
+        PRINTF("EFER: bits [63:15] are not zero (0x%"PRIx64")\n",
+                vmcb->_efer);
+    }
+
+    if ((vmcb->_efer & EFER_LME) != 0 && ((vmcb->_cr0 & X86_CR0_PG) != 0)) {
+        if ((vmcb->_cr4 & X86_CR4_PAE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR4.PAE is zero.\n");
+        }
+        if ((vmcb->_cr0 & X86_CR0_PE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR0.PE is zero.\n");
+        }
+    }
+
+    if ((vmcb->_efer & EFER_LME) != 0
+        && (vmcb->_cr0 & X86_CR0_PG) != 0
+        && (vmcb->_cr4 & X86_CR4_PAE) != 0
+        && (vmcb->cs.attr.fields.l != 0)
+        && (vmcb->cs.attr.fields.db != 0))
+    {
+        PRINTF("EFER_LME, CR0.PG, CR4.PAE, CS.L and CS.D are all non-zero.\n");
+    }
+
+    if ((vmcb->_general2_intercepts & GENERAL2_INTERCEPT_VMRUN) == 0) {
+        PRINTF("GENERAL2_INTERCEPT: VMRUN intercept bit is clear 
(0x%"PRIx32")\n",
+            vmcb->_general2_intercepts);
+    }
+
+    if (vmcb->eventinj.fields.resvd1 != 0) {
+        PRINTF("eventinj: MBZ bits are set (0x%"PRIx64")\n",
+                vmcb->eventinj.bytes);
+    }
+
+    if (vmcb->_np_enable && vmcb->_h_cr3 == 0) {
+        PRINTF("nested paging enabled but host cr3 is 0\n");
+    }
+
+#undef PRINTF
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
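
The PRINTF() wrapper in svm_vmcb_isvalid() above gives the function two modes:
with verbose set it logs every violated consistency check and keeps scanning,
returning 1 at the end; without verbose it returns 1 on the first violation.
In both cases a return value of 0 means the VMCB passed all checks.  A hedged
usage sketch (the caller name and error policy here are illustrative only, not
part of the patch):

    /* Illustrative caller: refuse to enter a guest with a malformed VMCB. */
    static int check_vmcb_before_entry(struct vmcb_struct *vmcb)
    {
        /* verbose = 1: report every failed check rather than stopping early */
        if ( svm_vmcb_isvalid(__func__, vmcb, 1) )
            return -EINVAL;    /* non-zero return means the VMCB is invalid */

        return 0;
    }
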
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar 09 12:36:05 2011 +0100
@@ -33,6 +33,7 @@
 #include <asm/hvm/svm/svm.h>
 #include <asm/hvm/svm/intr.h>
 #include <asm/hvm/svm/asid.h>
+#include <asm/hvm/svm/svmdebug.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
 #include <xen/domain_page.h>
@@ -40,9 +41,6 @@
 
 extern int svm_dbg_on;
 
-#define IOPM_SIZE   (12 * 1024)
-#define MSRPM_SIZE  (8  * 1024)
-
 struct vmcb_struct *alloc_vmcb(void) 
 {
     struct vmcb_struct *vmcb;
@@ -78,37 +76,6 @@
     return hsa;
 }
 
-void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
-{
-    unsigned long *msr_bitmap = v->arch.hvm_svm.msrpm;
-    unsigned long *msr_bit = NULL;
-
-    /*
-     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
-     */
-    if ( msr <= 0x1fff )
-        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
-        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
-        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
-
-    BUG_ON(msr_bit == NULL);
-
-    msr &= 0x1fff;
-
-    if ( enable )
-    {
-        __set_bit(msr * 2, msr_bit);
-        __set_bit(msr * 2 + 1, msr_bit);
-    }
-    else
-    {
-        __clear_bit(msr * 2, msr_bit);
-        __clear_bit(msr * 2 + 1, msr_bit);
-    }
-}
-
 /* This function can directly access fields which are covered by clean bits. */
 static int construct_vmcb(struct vcpu *v)
 {
@@ -257,7 +224,7 @@
 
     if ( cpu_has_pause_filter )
     {
-        vmcb->_pause_filter_count = 3000;
+        vmcb->_pause_filter_count = SVM_PAUSEFILTER_INIT;
         vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_PAUSE;
     }
 
@@ -268,34 +235,38 @@
 
 int svm_create_vmcb(struct vcpu *v)
 {
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
     int rc;
 
-    if ( (arch_svm->vmcb == NULL) &&
-         (arch_svm->vmcb = alloc_vmcb()) == NULL )
+    if ( (nv->nv_n1vmcx == NULL) &&
+         (nv->nv_n1vmcx = alloc_vmcb()) == NULL )
     {
         printk("Failed to create a new VMCB\n");
         return -ENOMEM;
     }
 
-    if ( (rc = construct_vmcb(v)) != 0 )
+    arch_svm->vmcb = nv->nv_n1vmcx;
+    rc = construct_vmcb(v);
+    if ( rc != 0 )
     {
-        free_vmcb(arch_svm->vmcb);
+        free_vmcb(nv->nv_n1vmcx);
+        nv->nv_n1vmcx = NULL;
         arch_svm->vmcb = NULL;
         return rc;
     }
 
-    arch_svm->vmcb_pa = virt_to_maddr(arch_svm->vmcb);
-
+    arch_svm->vmcb_pa = nv->nv_n1vmcx_pa = virt_to_maddr(arch_svm->vmcb);
     return 0;
 }
 
 void svm_destroy_vmcb(struct vcpu *v)
 {
+    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
 
-    if ( arch_svm->vmcb != NULL )
-        free_vmcb(arch_svm->vmcb);
+    if ( nv->nv_n1vmcx != NULL )
+        free_vmcb(nv->nv_n1vmcx);
 
     if ( arch_svm->msrpm != NULL )
     {
@@ -304,81 +275,11 @@
         arch_svm->msrpm = NULL;
     }
 
+    nv->nv_n1vmcx = NULL;
+    nv->nv_n1vmcx_pa = VMCX_EADDR;
     arch_svm->vmcb = NULL;
 }
 
-static void svm_dump_sel(char *name, svm_segment_register_t *s)
-{
-    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
-           name, s->sel, s->attr.bytes, s->limit,
-           (unsigned long long)s->base);
-}
-
-/* This function can directly access fields which are covered by clean bits. */
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb)
-{
-    printk("Dumping guest's current state at %s...\n", from);
-    printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n",
-           (int) sizeof(struct vmcb_struct),  virt_to_maddr(vmcb), vmcb);
-
-    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
-           "exception_intercepts = 0x%08x\n", 
-           vmcb->_cr_intercepts, vmcb->_dr_intercepts, 
-           vmcb->_exception_intercepts);
-    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
-           vmcb->_general1_intercepts, vmcb->_general2_intercepts);
-    printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = "
-            "0x%016llx\n", 
-           (unsigned long long)vmcb->_iopm_base_pa,
-           (unsigned long long)vmcb->_msrpm_base_pa,
-           (unsigned long long)vmcb->_tsc_offset);
-    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
-            "0x%016llx\n", vmcb->tlb_control,
-           (unsigned long long)vmcb->_vintr.bytes,
-           (unsigned long long)vmcb->interrupt_shadow);
-    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
-           (unsigned long long)vmcb->exitcode,
-           (unsigned long long)vmcb->exitintinfo.bytes);
-    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
-           (unsigned long long)vmcb->exitinfo1,
-           (unsigned long long)vmcb->exitinfo2);
-    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
-           (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid);
-    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
-           vmcb->_cpl, (unsigned long long)vmcb->_efer,
-           (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar);
-    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
-           (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2);
-    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
-           (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4);
-    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
-           (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip);
-    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
-           (unsigned long long)vmcb->rax, (unsigned long long)vmcb->rflags);
-    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
-           (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7);
-    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
-           (unsigned long long)vmcb->cstar, 
-           (unsigned long long)vmcb->sfmask);
-    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
-           (unsigned long long)vmcb->kerngsbase,
-           (unsigned long long)vmcb->_g_pat);
-    printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n", 
-           (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes);
-
-    /* print out all the selectors */
-    svm_dump_sel("CS", &vmcb->cs);
-    svm_dump_sel("DS", &vmcb->ds);
-    svm_dump_sel("SS", &vmcb->ss);
-    svm_dump_sel("ES", &vmcb->es);
-    svm_dump_sel("FS", &vmcb->fs);
-    svm_dump_sel("GS", &vmcb->gs);
-    svm_dump_sel("GDTR", &vmcb->gdtr);
-    svm_dump_sel("LDTR", &vmcb->ldtr);
-    svm_dump_sel("IDTR", &vmcb->idtr);
-    svm_dump_sel("TR", &vmcb->tr);
-}
-
 static void vmcb_dump(unsigned char ch)
 {
     struct domain *d;
@@ -396,7 +297,7 @@
         for_each_vcpu ( d, v )
         {
             printk("\tVCPU %d\n", v->vcpu_id);
-            svm_dump_vmcb("key_handler", v->arch.hvm_svm.vmcb);
+            svm_vmcb_dump("key_handler", v->arch.hvm_svm.vmcb);
         }
     }
 
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h     Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar 09 12:36:05 2011 +0100
@@ -33,6 +33,11 @@
     INSTR_RDTSC,
     INSTR_PAUSE,
     INSTR_XSETBV,
+    INSTR_VMRUN,
+    INSTR_VMLOAD,
+    INSTR_VMSAVE,
+    INSTR_STGI,
+    INSTR_CLGI,
     INSTR_MAX_COUNT /* Must be last - Number of instructions supported */
 };
 
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/nestedsvm.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/svm/nestedsvm.h   Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,129 @@
+/*
+ * nestedsvm.h: Nested Virtualization
+ * Copyright (c) 2011, Advanced Micro Devices, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_HVM_SVM_NESTEDSVM_H__
+#define __ASM_X86_HVM_SVM_NESTEDSVM_H__
+
+#include <asm/config.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/svm/vmcb.h>
+
+struct nestedsvm {
+    uint64_t ns_msr_hsavepa; /* MSR HSAVE_PA value */
+
+    /* l1 guest physical address of virtual vmcb used by prior VMRUN.
+     * Needed for VMCB Cleanbit emulation.
+     */
+    uint64_t ns_ovvmcb_pa;
+
+    /* Cached real intercepts of the l2 guest */
+    uint32_t ns_cr_intercepts;
+    uint32_t ns_dr_intercepts;
+    uint32_t ns_exception_intercepts;
+    uint32_t ns_general1_intercepts;
+    uint32_t ns_general2_intercepts;
+
+    /* Cached real lbr of the l2 guest */
+    lbrctrl_t ns_lbr_control;
+
+    /* Cached real MSR permission bitmaps of the l2 guest */
+    unsigned long *ns_cached_msrpm;
+    /* Merged MSR permission bitmap */
+    unsigned long *ns_merged_msrpm;
+
+    /* guest physical address of virtual io permission map */
+    paddr_t ns_iomap_pa, ns_oiomap_pa;
+    /* Shadow io permission map */
+    unsigned long *ns_iomap;
+
+    /* Cache guest cr3/host cr3 the guest sets up for the l2 guest.
+     * Used by Shadow-on-Shadow and Nested-on-Nested.
+     * ns_vmcb_guestcr3: in l2 guest physical address space and points to
+     *     the l2 guest page table
+     * ns_vmcb_hostcr3: in l1 guest physical address space and points to
+     *     the l1 guest nested page table
+     */
+    uint64_t ns_vmcb_guestcr3, ns_vmcb_hostcr3;
+    uint32_t ns_guest_asid;
+
+    bool_t ns_hap_enabled;
+
+    /* Only meaningful when vmexit_pending flag is set */
+    struct {
+        uint64_t exitcode;  /* native exitcode to inject into l1 guest */
+        uint64_t exitinfo1; /* additional information to the exitcode */
+        uint64_t exitinfo2; /* additional information to the exitcode */
+    } ns_vmexit;
+    union {
+        uint32_t bytes;
+        struct {
+            uint32_t rflagsif: 1;
+            uint32_t vintrmask: 1;
+            uint32_t reserved: 30;
+        } fields;
+    } ns_hostflags;
+};
+
+#define vcpu_nestedsvm(v) (vcpu_nestedhvm(v).u.nsvm)
+
+/* True when l1 guest enabled SVM in EFER */
+#define hvm_svm_enabled(v) \
+    (!!((v)->arch.hvm_vcpu.guest_efer & EFER_SVME))
+
+int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr);
+void nestedsvm_vmexit_defer(struct vcpu *v,
+    uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2);
+enum nestedhvm_vmexits
+nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs);
+enum nestedhvm_vmexits
+nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode);
+
+/* Interface methods */
+int nsvm_vcpu_destroy(struct vcpu *v);
+int nsvm_vcpu_initialise(struct vcpu *v);
+int nsvm_vcpu_reset(struct vcpu *v);
+int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs);
+int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs);
+int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode);
+int nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr,
+                      int errcode, unsigned long cr2);
+uint64_t nsvm_vcpu_guestcr3(struct vcpu *v);
+uint64_t nsvm_vcpu_hostcr3(struct vcpu *v);
+uint32_t nsvm_vcpu_asid(struct vcpu *v);
+int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
+    struct cpu_user_regs *regs, uint64_t exitcode);
+int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr);
+bool_t nsvm_vmcb_hap_enabled(struct vcpu *v);
+
+/* MSRs */
+int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content);
+int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content);
+
+#endif /* __ASM_X86_HVM_SVM_NESTEDSVM_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
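
The hvm_svm_enabled() predicate above is the gate for the newly intercepted
SVM instructions: VMRUN, VMLOAD, VMSAVE, STGI and CLGI are only legal once the
l1 guest has set EFER.SVME, and raise #UD otherwise.  A hedged sketch of how
the svm.c handlers added earlier could use it (the helper name is illustrative;
the real checks live inside svm_vmexit_do_vmrun() and friends):

    /* Illustrative gate: emulated SVM instructions are undefined until the
     * l1 guest enables SVM in EFER. */
    static bool_t nsvm_insn_allowed(struct vcpu *v)
    {
        if ( !hvm_svm_enabled(v) )
        {
            hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
            return 0;
        }
        return 1;
    }
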
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 09 12:36:05 2011 +0100
@@ -29,8 +29,6 @@
 #include <asm/i387.h>
 #include <asm/hvm/vpmu.h>
 
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb);
-
 #define SVM_REG_EAX (0) 
 #define SVM_REG_ECX (1) 
 #define SVM_REG_EDX (2) 
@@ -62,6 +60,8 @@
         : : "a" (__pa(vmcb)) : "memory" );
 }
 
+unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
+
 extern u32 svm_feature_flags;
 
 #define SVM_FEATURE_NPT            0 /* Nested page table support */
@@ -82,4 +82,6 @@
 #define cpu_has_svm_cleanbits cpu_has_svm_feature(SVM_FEATURE_VMCBCLEAN)
 #define cpu_has_pause_filter  cpu_has_svm_feature(SVM_FEATURE_PAUSEFILTER)
 
+#define SVM_PAUSEFILTER_INIT    3000
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
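
The new svm_msrbit() declaration above replaces the open-coded range checks
that this patch removes from svm_intercept_msr() in vmcb.c, presumably so the
nested code can operate on more than one MSR permission bitmap (the cached l1
and merged bitmaps declared in nestedsvm.h).  A sketch of such a helper,
reconstructed from the removed code (the actual definition sits in svm.c,
outside this excerpt, and may differ in detail):

    /* Map an MSR number to the start of its region in an AMD MSR permission
     * bitmap (AMD64 Programmers Manual, Vol 2, Section 15.10); NULL if the
     * MSR falls outside the three architecturally defined ranges. */
    unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
    {
        unsigned long *msr_bit = NULL;

        if ( msr <= 0x1fff )
            msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
        else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
            msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
        else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
            msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;

        return msr_bit;
    }
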
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/svmdebug.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/svm/svmdebug.h    Wed Mar 09 12:36:05 2011 +0100
@@ -0,0 +1,30 @@
+/*
+ * svmdebug.h: SVM related debug definitions
+ * Copyright (c) 2011, AMD Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef __ASM_X86_HVM_SVM_SVMDEBUG_H__
+#define __ASM_X86_HVM_SVM_SVMDEBUG_H__
+
+#include <asm/types.h>
+#include <asm/hvm/svm/vmcb.h>
+
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb);
+bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                        bool_t verbose);
+
+#endif /* __ASM_X86_HVM_SVM_SVMDEBUG_H__ */
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h        Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar 09 12:36:05 2011 +0100
@@ -398,6 +398,9 @@
     } fields;
 } __attribute__ ((packed)) vmcbcleanbits_t;
 
+#define IOPM_SIZE   (12 * 1024)
+#define MSRPM_SIZE  (8  * 1024)
+
 struct vmcb_struct {
     u32 _cr_intercepts;         /* offset 0x00 - cleanbit 0 */
     u32 _dr_intercepts;         /* offset 0x04 - cleanbit 0 */
diff -r a21d019bb8fe -r a5e69b6fdd16 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Mon Feb 28 12:21:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Mar 09 12:36:05 2011 +0100
@@ -25,6 +25,7 @@
 #include <asm/hvm/vlapic.h>
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/svm/vmcb.h>
+#include <asm/hvm/svm/nestedsvm.h>
 #include <asm/mtrr.h>
 
 enum hvm_io_state {
@@ -50,6 +51,7 @@
 
     /* SVM/VMX arch specific */
     union {
+        struct nestedsvm nsvm;
     } u;
 
     bool_t nv_flushp2m; /* True, when p2m table must be flushed */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
