
[Xen-changelog] [xen-unstable] Implement Nested-on-Nested.



# HG changeset patch
# User cegger
# Date 1302011049 -7200
# Node ID 7714b42e72fad771a447d66dc9e2acdd0dc98c59
# Parent  9c3fbfa7d0d5ce94c764e126f158c2b6fc78fb28
Implement Nested-on-Nested.
This allows the guest to run a nested guest with hap enabled.

Signed-off-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Committed-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---


diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Tue Apr 05 15:44:09 2011 +0200
@@ -1186,21 +1186,50 @@
     hvm_funcs.inject_exception(trapnr, errcode, cr2);
 }
 
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
-                                 bool_t gla_valid,
-                                 unsigned long gla,
-                                 bool_t access_valid,
-                                 bool_t access_r,
-                                 bool_t access_w,
-                                 bool_t access_x)
+int hvm_hap_nested_page_fault(unsigned long gpa,
+                              bool_t gla_valid,
+                              unsigned long gla,
+                              bool_t access_valid,
+                              bool_t access_r,
+                              bool_t access_w,
+                              bool_t access_x)
 {
     unsigned long gfn = gpa >> PAGE_SHIFT;
     p2m_type_t p2mt;
     p2m_access_t p2ma;
     mfn_t mfn;
     struct vcpu *v = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-
+    struct p2m_domain *p2m = NULL;
+
+    /* On Nested Virtualization, walk the guest page table.
+     * If this succeeds, all is fine.
+     * If this fails, inject a nested page fault into the guest.
+     */
+    if ( nestedhvm_enabled(v->domain)
+        && nestedhvm_vcpu_in_guestmode(v)
+        && nestedhvm_paging_mode_hap(v) )
+    {
+        int rv;
+
+        /* The vcpu is in guest mode and the l1 guest
+         * uses hap. That means 'gpa' is in l2 guest
+         * physical address space.
+         * Fix the nested p2m or inject nested page fault
+         * into l1 guest if not fixable. The algorithm is
+         * the same as for shadow paging.
+         */
+        rv = nestedhvm_hap_nested_page_fault(v, gpa);
+        switch (rv) {
+        case NESTEDHVM_PAGEFAULT_DONE:
+            return 1;
+        case NESTEDHVM_PAGEFAULT_ERROR:
+            return 0;
+        case NESTEDHVM_PAGEFAULT_INJECT:
+            return -1;
+        }
+    }
+
+    p2m = p2m_get_hostp2m(v->domain);
     mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
 
     /* Check access permissions first, then handle faults */
@@ -1344,6 +1373,15 @@
         return X86EMUL_EXCEPTION;
     }
 
+    if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+       ((value & EFER_SVME) == 0 ) &&
+       ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+    {
+        /* Cleared EFER.SVME: Flush all nestedp2m tables */
+        p2m_flush_nestedp2m(v->domain);
+        nestedhvm_vcpu_reset(v);
+    }
+
     value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
     v->arch.hvm_vcpu.guest_efer = value;
     hvm_update_guest_efer(v);
@@ -1494,8 +1532,12 @@
     v->arch.hvm_vcpu.guest_cr[0] = value;
     hvm_update_guest_cr(v, 0);
 
-    if ( (value ^ old_value) & X86_CR0_PG )
-        paging_update_paging_modes(v);
+    if ( (value ^ old_value) & X86_CR0_PG ) {
+        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -1562,8 +1604,12 @@
     hvm_update_guest_cr(v, 4);
 
     /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
-    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-        paging_update_paging_modes(v);
+    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+        if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -2076,7 +2122,7 @@
     void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
 {
     struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct p2m_domain *p2m;
     unsigned long gfn, mfn;
     p2m_type_t p2mt;
     char *p;
@@ -2098,6 +2144,8 @@
         return HVMCOPY_unhandleable;
 #endif
 
+    p2m = p2m_get_hostp2m(curr->domain);
+
     while ( todo > 0 )
     {
         count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
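The bool_t -> int change above makes hvm_hap_nested_page_fault() a
tri-state interface: 1 means the fault was handled, 0 is a fatal
error, and -1 asks the caller to reflect the fault into the l1 guest.
A minimal standalone sketch of that contract (illustrative only, not
part of this changeset; the enum models NESTEDHVM_PAGEFAULT_* and the
mapping mirrors the switch added above):

#include <stdio.h>

enum { PAGEFAULT_DONE, PAGEFAULT_INJECT, PAGEFAULT_ERROR };

/* Model: translate the nested fault result into the int return code
 * that svm_do_nested_pgfault() dispatches on later in this patch. */
static int nested_page_fault(int nested_rv)
{
    switch (nested_rv) {
    case PAGEFAULT_DONE:   return 1;   /* fixed, resume the l2 guest */
    case PAGEFAULT_ERROR:  return 0;   /* unhandled, crash the domain */
    case PAGEFAULT_INJECT: return -1;  /* reflect #VMEXIT(NPF) to l1 */
    }
    return 0;
}

int main(void)
{
    printf("done -> %d, error -> %d, inject -> %d\n",
           nested_page_fault(PAGEFAULT_DONE),
           nested_page_fault(PAGEFAULT_ERROR),
           nested_page_fault(PAGEFAULT_INJECT));
    return 0;
}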
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/nestedhvm.c      Tue Apr 05 15:44:09 2011 +0200
@@ -20,6 +20,7 @@
 #include <asm/msr.h>
 #include <asm/hvm/support.h>   /* for HVM_DELIVER_NO_ERROR_CODE */
 #include <asm/hvm/hvm.h>
+#include <asm/p2m.h>    /* for struct p2m_domain */
 #include <asm/hvm/nestedhvm.h>
 #include <asm/event.h>  /* for local_event_delivery_(en|dis)able */
 #include <asm/paging.h> /* for paging_mode_hap() */
@@ -96,6 +97,54 @@
     return nhvm_vcpu_destroy(v);
 }
 
+static void
+nestedhvm_flushtlb_ipi(void *info)
+{
+    struct vcpu *v = current;
+    struct domain *d = info;
+
+    ASSERT(d != NULL);
+    if (v->domain != d) {
+        /* This cpu doesn't belong to the domain */
+        return;
+    }
+
+    /* Just flush the ASID (or request a new one).
+     * This is cheaper than flush_tlb_local() and has
+     * the same desired effect.
+     */
+    hvm_asid_flush_core();
+    vcpu_nestedhvm(v).nv_p2m = NULL;
+}
+
+void
+nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
+{
+    on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi,
+        p2m->domain, 1);
+    cpus_clear(p2m->p2m_dirty_cpumask);
+}
+
+void
+nestedhvm_vmcx_flushtlbdomain(struct domain *d)
+{
+    on_selected_cpus(&d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, d, 1);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+    if (!nestedhvm_enabled(v->domain)
+      || nestedhvm_vmswitch_in_progress(v)
+      || !nestedhvm_paging_mode_hap(v))
+        return 0;
+
+    if (nestedhvm_vcpu_in_guestmode(v))
+        return 1;
+
+    return 0;
+}
+
 /* Common shadow IO Permission bitmap */
 
 /* There are four global patterns of io bitmap each guest can
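nestedhvm_vmcx_flushtlb() above only IPIs the physical cpus recorded
in the p2m's dirty cpumask and clears the mask afterwards. A
standalone model of that pattern (illustrative only; a plain bitmask
stands in for cpumask_t, and a direct call stands in for the
on_selected_cpus() IPI):

#include <stdint.h>
#include <stdio.h>

static void flush_ipi(int cpu)
{
    /* stands in for hvm_asid_flush_core() plus dropping the cached
     * nested p2m (nv_p2m = NULL) on that cpu */
    printf("cpu%d: flush ASID, drop cached nested p2m\n", cpu);
}

static void vmcx_flushtlb(uint32_t *dirty_mask)
{
    for (int cpu = 0; cpu < 32; cpu++)
        if (*dirty_mask & (1u << cpu))
            flush_ipi(cpu);            /* on_selected_cpus() equivalent */
    *dirty_mask = 0;                   /* cpus_clear() equivalent */
}

int main(void)
{
    uint32_t dirty = (1u << 1) | (1u << 3);  /* vcpus ran on cpu1, cpu3 */
    vmcx_flushtlb(&dirty);
    return 0;
}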
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c  Tue Apr 05 15:44:09 2011 +0200
@@ -26,6 +26,7 @@
 #include <asm/hvm/svm/svmdebug.h>
 #include <asm/paging.h> /* paging_mode_hap */
 #include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
 
 static void
 nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -320,6 +321,18 @@
     return 0;
 }
 
+static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
+    struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
+{
+    struct p2m_domain *p2m;
+
+    ASSERT(v != NULL);
+    ASSERT(vvmcb != NULL);
+    ASSERT(n2vmcb != NULL);
+    p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
+    n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
+}
+
 static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct nestedvcpu *nv = &vcpu_nestedhvm(v);
@@ -475,6 +488,9 @@
     /* Nested paging mode */
     if (nestedhvm_paging_mode_hap(v)) {
         /* host nested paging + guest nested paging. */
+        n2vmcb->_np_enable = 1;
+
+        nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
 
         /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
         rc = hvm_set_cr3(ns_vmcb->_cr3);
@@ -1318,8 +1334,20 @@
         ret = nsvm_vcpu_vmrun(v, regs);
         if (ret < 0)
             goto vmexit;
+
+        ASSERT(nestedhvm_vcpu_in_guestmode(v));
         nv->nv_vmentry_pending = 0;
-        return;
+    }
+
+    if (nestedhvm_vcpu_in_guestmode(v)
+       && nestedhvm_paging_mode_hap(v))
+    {
+        /* If we left the l2 guest due to a physical interrupt (e.g. IPI)
+         * that is not for the l1 guest, then we continue running the l2
+         * guest but check whether the nestedp2m is still valid.
+         */
+        if (nv->nv_p2m == NULL)
+            nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
     }
 }
 
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Tue Apr 05 15:44:09 2011 +0200
@@ -1014,14 +1014,16 @@
     return &svm_function_table;
 }
 
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+    struct cpu_user_regs *regs, paddr_t gpa)
 {
+    int ret;
     unsigned long gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t p2mt;
-    struct p2m_domain *p2m;
+    struct p2m_domain *p2m = NULL;
 
-    p2m = p2m_get_hostp2m(current->domain);
+    ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0);
 
     if ( tb_init_done )
     {
@@ -1032,6 +1034,7 @@
             uint32_t p2mt;
         } _d;
 
+        p2m = p2m_get_p2m(v);
         _d.gpa = gpa;
         _d.qualification = 0;
         _d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1039,14 +1042,26 @@
         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
     }
 
-    if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) )
+    switch (ret) {
+    case 0:
+        break;
+    case 1:
         return;
+    case -1:
+        ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+        /* inject #VMEXIT(NPF) into guest. */
+        nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+        return;
+    }
 
+    if ( p2m == NULL )
+        p2m = p2m_get_p2m(v);
     /* Everything else is an error. */
     mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
-    gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
-             gpa, mfn_x(mfn), p2mt);
-    domain_crash(current->domain);
+    gdprintk(XENLOG_ERR,
+         "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+         gpa, mfn_x(mfn), p2mt);
+    domain_crash(v->domain);
 }
 
 static void svm_fpu_dirty_intercept(void)
@@ -1659,6 +1674,8 @@
         struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
         uint64_t exitinfo1, exitinfo2;
 
+        paging_update_nestedmode(v);
+
         /* Write real exitinfo1 back into virtual vmcb.
          * nestedsvm_check_intercepts() expects to have the correct
          * exitinfo1 value there.
@@ -1948,7 +1965,7 @@
     case VMEXIT_NPF:
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         regs->error_code = vmcb->exitinfo1;
-        svm_do_nested_pgfault(vmcb->exitinfo2);
+        svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
         break;
 
     case VMEXIT_IRET: {
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/Makefile      Tue Apr 05 15:44:09 2011 +0200
@@ -3,6 +3,7 @@
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
 obj-y += p2m-ept.o
+obj-y += nested_hap.o
 
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/guest_walk.c  Tue Apr 05 15:44:09 2011 +0200
@@ -29,24 +29,32 @@
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
 #if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
 
 #include <asm/guest_pt.h>
 #include <asm/p2m.h>
 
 unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
-    struct vcpu *v, unsigned long gva, uint32_t *pfec)
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
-    unsigned long cr3;
+    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
     uint32_t missing;
     mfn_t top_mfn;
     void *top_map;
     p2m_type_t p2mt;
     walk_t gw;
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
 
     /* Get the top-level table's MFN */
-    cr3 = v->arch.hvm_vcpu.guest_cr[3];
     top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
     if ( p2m_is_paging(p2mt) )
     {
@@ -72,7 +80,7 @@
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+    missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
     unmap_domain_page(top_map);
 
     /* Interpret the answer */
@@ -122,6 +130,15 @@
     return INVALID_GFN;
 }
 
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    gdprintk(XENLOG_ERR,
+             "Guest paging level is greater than host paging level!\n");
+    domain_crash(v->domain);
+    return INVALID_GFN;
+}
 #endif
 
 
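The rework above parameterises the guest walk by both the p2m and the
page-table root, leaving hap_gva_to_gfn() as a thin wrapper around
hap_p2m_ga_to_gfn(). That is what allows paging_gva_to_gfn() (in the
p2m.c hunk below) to compose two walks for an l2 guest. A simplified
standalone sketch of the composition (illustrative only; the walk_*()
helpers are hypothetical stand-ins for the real page-table walkers):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define INVALID_GFN (~(uint64_t)0)

/* stand-in: walk the l2 guest's page tables, va -> l2 gfn */
static uint64_t walk_l2_tables(uint64_t va) { return va >> PAGE_SHIFT; }

/* stand-in: walk the l1 guest's nested tables rooted at ncr3,
 * guest address -> l1 gfn (fake fixed-offset mapping) */
static uint64_t walk_l1_nested(uint64_t ncr3, uint64_t ga)
{
    (void)ncr3;
    return (ga >> PAGE_SHIFT) + 0x1000;
}

static uint64_t l2va_to_l1gfn(uint64_t ncr3, uint64_t va)
{
    uint64_t l2_gfn = walk_l2_tables(va);            /* stage 1 */
    if (l2_gfn == INVALID_GFN)
        return INVALID_GFN;
    /* stage 2: treat the l2 gfn as a guest address for the l1 walk */
    return walk_l1_nested(ncr3, l2_gfn << PAGE_SHIFT);
}

int main(void)
{
    printf("l1 gfn = 0x%llx\n",
           (unsigned long long)l2va_to_l1gfn(0xdead000, 0x7fff2000));
    return 0;
}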
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/hap.c Tue Apr 05 15:44:09 2011 +0200
@@ -40,6 +40,7 @@
 #include <asm/p2m.h>
 #include <asm/domain.h>
 #include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
 
 #include "private.h"
 
@@ -582,6 +583,7 @@
 int hap_enable(struct domain *d, u32 mode)
 {
     unsigned int old_pages;
+    uint8_t i;
     int rv = 0;
 
     domain_pause(d);
@@ -620,6 +622,12 @@
             goto out;
     }
 
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+        if ( rv != 0 )
+           goto out;
+    }
+
     /* Now let other users see the new mode */
     d->arch.paging.mode = mode | PG_HAP_enable;
 
@@ -630,6 +638,13 @@
 
 void hap_final_teardown(struct domain *d)
 {
+    uint8_t i;
+
+    /* Destroy nestedp2m's first */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m_teardown(d->arch.nested_p2m[i]);
+    }
+
     if ( d->arch.paging.hap.total_pages != 0 )
         hap_teardown(d);
 
@@ -657,7 +672,7 @@
         /* release the monitor table held by each vcpu */
         for_each_vcpu ( d, v )
         {
-            if ( v->arch.paging.mode && paging_mode_external(d) )
+            if ( paging_get_hostmode(v) && paging_mode_external(d) )
             {
                 mfn = pagetable_get_mfn(v->arch.monitor_table);
                 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -725,6 +740,7 @@
 void hap_vcpu_init(struct vcpu *v)
 {
     v->arch.paging.mode = &hap_paging_real_mode;
+    v->arch.paging.nestedmode = &hap_paging_real_mode;
 }
 
 /************************************************/
@@ -751,6 +767,15 @@
  */
 static int hap_invlpg(struct vcpu *v, unsigned long va)
 {
+    if (nestedhvm_enabled(v->domain)) {
+        /* Emulate INVLPGA:
+         * Must perform the flush right now, otherwise another vcpu
+         * may use the stale nested p2m before the next VMRUN emulation.
+         */
+        p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+        return 1;
+    }
+
     HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
               v->domain->domain_id, v->vcpu_id);
     domain_crash(v->domain);
@@ -763,17 +788,22 @@
     hvm_update_guest_cr(v, 3);
 }
 
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
+                                   &hap_paging_protected_mode;
+}
+
 static void hap_update_paging_modes(struct vcpu *v)
 {
     struct domain *d = v->domain;
 
     hap_lock(d);
 
-    v->arch.paging.mode =
-        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
-        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
-        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
-                                   &hap_paging_protected_mode;
+    v->arch.paging.mode = hap_paging_get_mode(v);
 
     if ( pagetable_is_null(v->arch.monitor_table) )
     {
@@ -834,38 +864,65 @@
 hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
                     mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
 {
+    struct domain *d = v->domain;
     uint32_t old_flags;
+    bool_t flush_nestedp2m = 0;
 
-    hap_lock(v->domain);
+    /* We always use the host p2m here, regardless of whether the
+     * vcpu is in host or guest mode. The vcpu can be in guest mode
+     * when a hypercall passes a domain and mostly picks the first
+     * vcpu. */
 
+    hap_lock(d);
     old_flags = l1e_get_flags(*p);
+
+    if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT) ) {
+        /* We are replacing a valid entry so we need to flush nested p2ms,
+         * unless the only change is an increase in access rights. */
+        mfn_t omfn = _mfn(l1e_get_pfn(*p));
+        mfn_t nmfn = _mfn(l1e_get_pfn(new));
+        flush_nestedp2m = !( mfn_x(omfn) == mfn_x(nmfn)
+            && perms_strictly_increased(old_flags, l1e_get_flags(new)) );
+    }
+
     safe_write_pte(p, new);
     if ( (old_flags & _PAGE_PRESENT)
          && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
-             flush_tlb_mask(v->domain->domain_dirty_cpumask);
+             flush_tlb_mask(d->domain_dirty_cpumask);
 
 #if CONFIG_PAGING_LEVELS == 3
     /* install P2M in monitor table for PAE Xen */
     if ( level == 3 )
         /* We have written to the p2m l3: need to sync the per-vcpu
          * copies of it in the monitor tables */
-        p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+        p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
 #endif
 
-    hap_unlock(v->domain);
+    hap_unlock(d);
+
+    if ( flush_nestedp2m )
+        p2m_flush_nestedp2m(d);
 }
 
 static unsigned long hap_gva_to_gfn_real_mode(
-    struct vcpu *v, unsigned long gva, uint32_t *pfec)
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
 {
     return ((paddr_t)gva >> PAGE_SHIFT);
 }
 
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    return (ga >> PAGE_SHIFT);
+}
+
 /* Entry points into this mode of the hap code. */
 static const struct paging_mode hap_paging_real_mode = {
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_real_mode,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_real_mode,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -876,6 +933,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_2_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_2_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -886,6 +944,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_3_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_3_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -896,6 +955,7 @@
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_4_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_4_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/hap/nested_hap.c  Tue Apr 05 15:44:09 2011 +0200
@@ -0,0 +1,236 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2011 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* ALGORITHM for NESTED PAGE FAULT
+ * 
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ *    1. If #NPF is from L1 guest, then we crash the guest VM (same as old 
+ *       code)
+ *    2. If #NPF is from L2 guest, then we continue from (3)
+ *    3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ *    4. Walk the h_cr3 page table
+ *    5.    - if not present, then we inject #NPF back to L1 guest and 
+ *            re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ *            or fix its p2m table for L2 guest)
+ * 6.    - if present, then we will get a new translated value L1-GPA
+ *            (points to L1 machine memory)
+ *    7.        * Use L1-GPA to walk L0 P2M table
+ *    8.            - if not present, then crash the guest (should not happen)
+ *    9.            - if present, then we get a new translated value MPA 
+ *                    (points to real machine memory)
+ *   10.                * Finally, use GPA and MPA to walk nested_p2m 
+ *                        and fix the bits.
+ * }
+ * 
+ */
+
+
+/********************************************/
+/*        NESTED VIRT P2M FUNCTIONS         */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+    struct domain *d = p2m->domain;
+    uint32_t old_flags;
+
+    hap_lock(d);
+
+    old_flags = l1e_get_flags(*p);
+    safe_write_pte(p, new);
+    if (old_flags & _PAGE_PRESENT)
+        nestedhvm_vmcx_flushtlb(p2m);
+    
+    hap_unlock(d);
+}
+
+/********************************************/
+/*          NESTED VIRT FUNCTIONS           */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa,
+    p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    int rv;
+    ASSERT(p2m);
+    ASSERT(p2m->set_entry);
+
+    rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+                         page_to_mfn(maddr_to_page(L0_gpa)),
+                         0 /*4K*/, p2mt, p2ma);
+    if (rv == 0) {
+        gdprintk(XENLOG_ERR,
+               "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+               L2_gpa, L0_gpa);
+        BUG();
+    }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return 
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+    mfn_t mfn;
+    p2m_type_t p2mt;
+
+    /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+    mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+    if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    if ( !mfn_valid(mfn) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The return value tells the caller what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+    paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+    uint32_t pfec;
+    unsigned long nested_cr3, gfn;
+    const struct paging_mode *mode = paging_get_hostmode(v);
+    
+    nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+    /* walk the guest table */
+    gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+    if ( gfn == INVALID_GFN ) 
+        return NESTEDHVM_PAGEFAULT_INJECT;
+
+    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * nestedhvm_hap_nested_page_fault() below implements steps (3)--(10)
+ * of the algorithm above.
+ *
+ * Returns NESTEDHVM_PAGEFAULT_DONE on success, or
+ * NESTEDHVM_PAGEFAULT_INJECT/NESTEDHVM_PAGEFAULT_ERROR for the
+ * caller to handle.
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+    int rv;
+    paddr_t L1_gpa, L0_gpa;
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m, *nested_p2m;
+
+    p2m = p2m_get_hostp2m(d); /* L0 p2m */
+    nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+    /* walk the L1 P2M table; note we have to pass p2m
+     * and not nested_p2m here, or we fail the walk forever. */
+    rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+    /* let the caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* ==> we have to walk L0 P2M */
+    rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+    /* let the upper level caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    nestedp2m_lock(d);
+    /* fix p2m_get_pagetable(nested_p2m) */
+    nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa,
+        p2m_ram_rw,
+        p2m_access_rwx /* FIXME: Should use same permission as l1 guest */);
+    nestedp2m_unlock(d);
+
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/*     NESTED VIRT INITIALIZATION FUNCS     */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
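A standalone model of the two-stage walk that
nestedhvm_hap_nested_page_fault() performs (illustrative only, not
from the patch): both p2ms are fake single-level arrays, and the
return convention mirrors NESTEDHVM_PAGEFAULT_INJECT/ERROR/DONE as
-1/0/1, matching how hvm_hap_nested_page_fault() reports them:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define INVALID (~(uint64_t)0)

/* fake L1 "p2m": L2 gfn -> L1 gfn (the guest's nested table) */
static uint64_t l1_p2m[4] = { 0x10, 0x11, INVALID, 0x13 };
/* fake L0 p2m: L1 gfn -> machine frame */
static uint64_t l0_p2m[0x20];

static int nested_walk(uint64_t l2_gpa, uint64_t *mpa)
{
    uint64_t gfn = l2_gpa >> PAGE_SHIFT;

    if (gfn >= 4 || l1_p2m[gfn] == INVALID)
        return -1;                 /* inject #NPF into the l1 guest */
    if (l0_p2m[l1_p2m[gfn]] == INVALID)
        return 0;                  /* error: caller crashes the domain */
    *mpa = (l0_p2m[l1_p2m[gfn]] << PAGE_SHIFT) | (l2_gpa & 0xfff);
    return 1;                      /* done: fix nested p2m and resume */
}

int main(void)
{
    uint64_t mpa;

    for (unsigned int i = 0; i < 0x20; i++)
        l0_p2m[i] = 0x100 + i;     /* simple host mapping */
    if (nested_walk(0x1abc, &mpa) == 1)
        printf("L2-GPA 0x1abc -> MPA 0x%llx\n", (unsigned long long)mpa);
    if (nested_walk(0x2abc, &mpa) == -1)
        printf("L2-GPA 0x2abc -> inject #NPF into l1\n");
    return 0;
}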
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/hap/private.h     Tue Apr 05 15:44:09 2011 +0200
@@ -23,11 +23,27 @@
 /********************************************/
 /*          GUEST TRANSLATION FUNCS         */
 /********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva, 
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+                                     struct p2m_domain *p2m,
+                                     unsigned long gva, 
                                      uint32_t *pfec);
 
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+
 #endif /* __HAP_PRIVATE_H__ */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/p2m.c     Tue Apr 05 15:44:09 2011 +0200
@@ -34,6 +34,7 @@
 #include <public/mem_event.h>
 #include <asm/mem_sharing.h>
 #include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
 
 /* Debugging and auditing of the P2M code? */
 #define P2M_AUDIT     0
@@ -75,7 +76,7 @@
 #define SUPERPAGE_PAGES (1UL << 9)
 #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
 #ifdef __x86_64__
@@ -121,9 +122,9 @@
 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
 //
-static l1_pgentry_t *
+l1_pgentry_t *
 p2m_find_entry(void *table, unsigned long *gfn_remainder,
-                   unsigned long gfn, u32 shift, u32 max)
+                   unsigned long gfn, uint32_t shift, uint32_t max)
 {
     u32 index;
 
@@ -224,20 +225,17 @@
 
         switch ( type ) {
         case PGT_l3_page_table:
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 4);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
             break;
         case PGT_l2_page_table:
 #if CONFIG_PAGING_LEVELS == 3
             /* for PAE mode, PDPE only has PCD/PWT/P bits available */
             new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
 #endif
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 3);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
             break;
         case PGT_l1_page_table:
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   p2m_entry, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
             break;
         default:
             BUG();
@@ -264,14 +262,13 @@
         for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
         {
             new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   l1_entry+i, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 2);
         }
         unmap_domain_page(l1_entry);
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
-        paging_write_p2m_entry(p2m->domain, gfn,
-                               p2m_entry, *table_mfn, new_entry, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
     }
 
 
@@ -298,15 +295,15 @@
         for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
         {
             new_entry = l1e_from_pfn(pfn + i, flags);
-            paging_write_p2m_entry(p2m->domain, gfn,
-                                   l1_entry+i, *table_mfn, new_entry, 1);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 1);
         }
         unmap_domain_page(l1_entry);
         
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER);
-        paging_write_p2m_entry(p2m->domain, gfn,
-                               p2m_entry, *table_mfn, new_entry, 2);
+        p2m->write_p2m_entry(p2m, gfn,
+            p2m_entry, *table_mfn, new_entry, 2);
     }
 
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1369,8 +1366,7 @@
                            p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -1410,8 +1406,7 @@
             entry_content = l1e_empty();
         
         /* level 1 entry */
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 1);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
     }
     else if ( page_order == 9 )
@@ -1440,8 +1435,7 @@
             l2e_content = l2e_empty();
         
         entry_content.l1 = l2e_content.l2;
-        paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
-                               table_mfn, entry_content, 2);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -1806,10 +1800,13 @@
     p2m->domain = d;
     p2m->default_access = p2m_access_rwx;
 
+    p2m->cr3 = CR3_EADDR;
     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
     p2m->get_entry_current = p2m_gfn_to_mfn_current;
     p2m->change_entry_type_global = p2m_change_type_global;
+    p2m->write_p2m_entry = paging_write_p2m_entry;
+    cpus_clear(p2m->p2m_dirty_cpumask);
 
     if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
         ept_p2m_init(d);
@@ -1817,6 +1814,25 @@
     return;
 }
 
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+    struct p2m_domain *p2m;
+
+    nestedp2m_lock_init(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+        if (p2m == NULL)
+            return -ENOMEM;
+        p2m_initialise(d, p2m);
+        p2m->get_entry_current = p2m->get_entry;
+        p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+    }
+
+    return 0;
+}
+
 int p2m_init(struct domain *d)
 {
     struct p2m_domain *p2m;
@@ -1825,8 +1841,12 @@
     if ( p2m == NULL )
         return -ENOMEM;
     p2m_initialise(d, p2m);
-    
-    return 0;
+
+    /* Must initialise nestedp2m unconditionally
+     * since nestedhvm_enabled(d) returns false here.
+     * (p2m_init runs too early for HVM_PARAM_* options)
+     */
+    return p2m_init_nestedp2m(d);
 }
 
 void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1919,6 +1939,9 @@
                         p2m_invalid, p2m->default_access) )
         goto error;
 
+    if (p2m_is_nestedp2m(p2m))
+        goto nesteddone;
+
     /* Copy all existing mappings from the page list and m2p */
     spin_lock(&p2m->domain->page_alloc_lock);
     page_list_for_each(page, &p2m->domain->page_list)
@@ -1940,6 +1963,7 @@
     }
     spin_unlock(&p2m->domain->page_alloc_lock);
 
+ nesteddone:
     P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
     p2m_unlock(p2m);
     return 0;
@@ -1966,6 +1990,9 @@
     mfn_t mfn;
 #endif
 
+    if (p2m == NULL)
+        return;
+
     p2m_lock(p2m);
 
 #ifdef __x86_64__
@@ -1984,11 +2011,26 @@
     p2m_unlock(p2m);
 }
 
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        xfree(d->arch.nested_p2m[i]);
+        d->arch.nested_p2m[i] = NULL;
+    }
+}
+
 void p2m_final_teardown(struct domain *d)
 {
     /* Iterate over all p2m tables per domain */
     xfree(d->arch.p2m);
     d->arch.p2m = NULL;
+
+    /* We must tear these down unconditionally because
+     * we initialise them unconditionally.
+     */
+    p2m_teardown_nestedp2m(d);
 }
 
 #if P2M_AUDIT
@@ -2573,9 +2615,9 @@
                 gfn = get_gpfn_from_mfn(mfn);
                 flags = p2m_type_to_flags(nt, _mfn(mfn));
                 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                paging_write_p2m_entry(p2m->domain, gfn,
-                                       (l1_pgentry_t *)&l3e[i3],
-                                       l3mfn, l1e_content, 3);
+                p2m->write_p2m_entry(p2m, gfn,
+                                     (l1_pgentry_t *)&l3e[i3],
+                                     l3mfn, l1e_content, 3);
                 continue;
             }
 
@@ -2604,9 +2646,9 @@
                            * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                    paging_write_p2m_entry(p2m->domain, gfn,
-                                           (l1_pgentry_t *)&l2e[i2],
-                                           l2mfn, l1e_content, 2);
+                    p2m->write_p2m_entry(p2m, gfn,
+                                         (l1_pgentry_t *)&l2e[i2],
+                                         l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -2628,8 +2670,8 @@
                     /* create a new 1le entry with the new type */
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = l1e_from_pfn(mfn, flags);
-                    paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
-                                           l1mfn, l1e_content, 1);
+                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+                                         l1mfn, l1e_content, 1);
                 }
                 unmap_domain_page(l1e);
             }
@@ -3048,6 +3090,182 @@
 }
 #endif /* __x86_64__ */
 
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+    int i, lru_index = -1;
+    struct p2m_domain *lrup2m, *tmp;
+
+    if (p2m == NULL) {
+        lru_index = MAX_NESTEDP2M - 1;
+        lrup2m = d->arch.nested_p2m[lru_index];
+    } else {
+        lrup2m = p2m;
+        for (i = 0; i < MAX_NESTEDP2M; i++) {
+            if (d->arch.nested_p2m[i] == p2m) {
+                lru_index = i;
+                break;
+            }
+        }
+    }
+
+    ASSERT(lru_index >= 0);
+    if (lru_index == 0) {
+        return lrup2m;
+    }
+
+    /* move the others down the array "list" */
+    for (i = lru_index - 1; i >= 0; i--) {
+        tmp = d->arch.nested_p2m[i];
+        d->arch.nested_p2m[i+1] = tmp;        
+    }
+
+    /* make the entry the first one */
+    d->arch.nested_p2m[0] = lrup2m;
+
+    return lrup2m;
+}
+
+static int 
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+    ASSERT(p2m);
+    if (p2m->cr3 == CR3_EADDR)
+        /* Microoptimisation: p2m is already empty.
+         * => about 0.3% speedup of overall system performance.
+         */
+        return 0;
+
+    p2m_teardown(p2m);
+    p2m_initialise(p2m->domain, p2m);
+    p2m->get_entry_current = p2m->get_entry;
+    p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+    return p2m_alloc_table(p2m);
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+    struct domain *d = p2m->domain;
+
+    ASSERT(v->domain == d);
+    vcpu_nestedhvm(v).nv_p2m = NULL;
+    nestedp2m_lock(d);
+    BUG_ON(p2m_flush_locked(p2m) != 0);
+    hvm_asid_flush_vcpu(v);
+    nestedhvm_vmcx_flushtlb(p2m);
+    nestedp2m_unlock(d);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+    int i;
+
+    nestedp2m_lock(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        struct p2m_domain *p2m = d->arch.nested_p2m[i];
+        BUG_ON(p2m_flush_locked(p2m) != 0);
+        cpus_clear(p2m->p2m_dirty_cpumask);
+    }
+    nestedhvm_vmcx_flushtlbdomain(d);
+    nestedp2m_unlock(d);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+    /* Use volatile to prevent gcc from caching nv->nv_p2m in a cpu
+     * register, as it may be changed within the loop by another (v)cpu.
+     */
+    volatile struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+    struct domain *d;
+    struct p2m_domain *p2m;
+    int i, rv;
+
+    if (cr3 == 0 || cr3 == CR3_EADDR)
+        cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+    if (nv->nv_flushp2m && nv->nv_p2m) {
+        nv->nv_p2m = NULL;
+    }
+
+    d = v->domain;
+    nestedp2m_lock(d);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = d->arch.nested_p2m[i];
+        if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m))
+            continue;
+
+        nv->nv_flushp2m = 0;
+        p2m_getlru_nestedp2m(d, p2m);
+        nv->nv_p2m = p2m;
+        if (p2m->cr3 == CR3_EADDR)
+            hvm_asid_flush_vcpu(v);
+        p2m->cr3 = cr3;
+        cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+        nestedp2m_unlock(d);
+        return p2m;
+    }
+
+    /* All p2m's are or were in use. Take the least recently used one,
+     * flush it and reuse.
+     */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = p2m_getlru_nestedp2m(d, NULL);
+        rv = p2m_flush_locked(p2m);
+        if (rv == 0)
+            break;
+    }
+    nv->nv_p2m = p2m;
+    p2m->cr3 = cr3;
+    nv->nv_flushp2m = 0;
+    hvm_asid_flush_vcpu(v);
+    nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+    cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+    nestedp2m_unlock(d);
+
+    return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+    if (!nestedhvm_is_n2(v))
+        return p2m_get_hostp2m(v->domain);
+
+    return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+                                unsigned long va,
+                                uint32_t *pfec)
+{
+    struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+    const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+    if ( is_hvm_domain(v->domain)
+        && paging_mode_hap(v->domain) 
+        && nestedhvm_is_n2(v) )
+    {
+        unsigned long gfn;
+        struct p2m_domain *p2m;
+        const struct paging_mode *mode;
+        uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+        /* translate l2 guest va into l2 guest gfn */
+        p2m = p2m_get_nestedp2m(v, ncr3);
+        mode = paging_get_nestedmode(v);
+        gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+        /* translate l2 guest gfn into l1 guest gfn */
+        return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+            gfn << PAGE_SHIFT, pfec);
+    }
+
+    return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
 /*
  * Local variables:
  * mode: C
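p2m_get_nestedp2m() above keeps the MAX_NESTEDP2M tables as an
LRU-ordered array keyed by the l1 guest's cr3 and recycles the least
recently used slot on a miss. A simplified standalone model of just
that cache discipline (illustrative only; it omits the locking, the
nv_p2m back-pointer and the flush-on-recycle step):

#include <stdint.h>
#include <stdio.h>

#define MAX_NESTEDP2M 10
#define CR3_EADDR (~(uint64_t)0)       /* slot not in use */

static uint64_t slots[MAX_NESTEDP2M];  /* cr3 of each cached p2m */

static void move_to_front(int i)
{
    uint64_t hit = slots[i];

    for ( ; i > 0; i-- )
        slots[i] = slots[i - 1];       /* shift the others down */
    slots[0] = hit;
}

static void get_nestedp2m(uint64_t cr3)
{
    for (int i = 0; i < MAX_NESTEDP2M; i++) {
        if (slots[i] == cr3) {         /* hit: refresh the LRU order */
            move_to_front(i);
            return;
        }
    }
    slots[MAX_NESTEDP2M - 1] = cr3;    /* miss: recycle the LRU slot */
    move_to_front(MAX_NESTEDP2M - 1);
}

int main(void)
{
    for (int i = 0; i < MAX_NESTEDP2M; i++)
        slots[i] = CR3_EADDR;
    get_nestedp2m(0x1000);   /* miss: takes a slot */
    get_nestedp2m(0x2000);   /* miss */
    get_nestedp2m(0x1000);   /* hit: moved back to the front */
    printf("front slot cr3 = 0x%llx\n", (unsigned long long)slots[0]);
    return 0;
}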
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/paging.c  Tue Apr 05 15:44:09 2011 +0200
@@ -26,6 +26,7 @@
 #include <asm/p2m.h>
 #include <asm/hap.h>
 #include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
 #include <xen/numa.h>
 #include <xsm/xsm.h>
 
@@ -851,21 +852,58 @@
         printk("    paging assistance: ");
         if ( paging_mode_shadow(v->domain) )
         {
-            if ( v->arch.paging.mode )
+            if ( paging_get_hostmode(v) )
                 printk("shadowed %u-on-%u\n",
-                       v->arch.paging.mode->guest_levels,
-                       v->arch.paging.mode->shadow.shadow_levels);
+                       paging_get_hostmode(v)->guest_levels,
+                       paging_get_hostmode(v)->shadow.shadow_levels);
             else
                 printk("not shadowed\n");
         }
-        else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+        else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
             printk("hap, %u levels\n",
-                   v->arch.paging.mode->guest_levels);
+                   paging_get_hostmode(v)->guest_levels);
         else
             printk("none\n");
     }
 }
 
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+    if (!nestedhvm_is_n2(v))
+        return paging_get_hostmode(v);
+
+    return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+    ASSERT(nestedhvm_enabled(v->domain));
+    if (nestedhvm_paging_mode_hap(v))
+        /* nested-on-nested */
+        v->arch.paging.nestedmode = hap_paging_get_mode(v);
+    else
+        /* TODO: shadow-on-shadow */
+        v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+                            l1_pgentry_t *p, mfn_t table_mfn,
+                            l1_pgentry_t new, unsigned int level)
+{
+    struct domain *d = p2m->domain;
+    struct vcpu *v = current;
+    if ( v->domain != d )
+        v = d->vcpu ? d->vcpu[0] : NULL;
+    if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+    {
+        return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+                                                       new, level);
+    }
+    else
+        safe_write_pte(p, new);
+}
 
 /*
  * Local variables:
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c    Tue Apr 05 15:44:09 2011 +0200
@@ -837,22 +837,6 @@
     if ( map != NULL ) sh_unmap_domain_page(map);
 }
 
-static inline int
-perms_strictly_increased(u32 old_flags, u32 new_flags) 
-/* Given the flags of two entries, are the new flags a strict
- * increase in rights over the old ones? */
-{
-    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
-    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
-    /* Flip the NX bit, since it's the only one that decreases rights;
-     * we calculate as if it were an "X" bit. */
-    of ^= _PAGE_NX_BIT;
-    nf ^= _PAGE_NX_BIT;
-    /* If the changed bits are all set in the new flags, then rights strictly 
-     * increased between old and new. */
-    return ((of | (of ^ nf)) == nf);
-}
-
 /* type is only used to distinguish grant map pages from ordinary RAM
  * i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw.  */
 static int inline
@@ -3768,7 +3752,8 @@
 
 
 static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+    unsigned long va, uint32_t *pfec)
 /* Called to translate a guest virtual address to what the *guest*
  * pagetables would map it to. */
 {
@@ -4820,7 +4805,7 @@
     struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
 
     /* Translate the VA to a GFN */
-    gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+    gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
     if ( gfn == INVALID_GFN ) 
     {
         if ( is_hvm_vcpu(v) )
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/domain.h      Tue Apr 05 15:44:09 2011 +0200
@@ -210,6 +210,8 @@
 struct paging_vcpu {
     /* Pointers to mode-specific entry points. */
     const struct paging_mode *mode;
+    /* Nested Virtualization: paging mode of nested guest */
+    const struct paging_mode *nestedmode;
     /* HVM guest: last emulate was to a pagetable */
     unsigned int last_write_was_pt:1;
     /* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@
 #define MAX_CPUID_INPUT 40
 typedef xen_domctl_cpuid_t cpuid_input_t;
 
+#define MAX_NESTEDP2M 10
 struct p2m_domain;
 struct time_scale {
     int shift;
@@ -273,6 +276,12 @@
     struct paging_domain paging;
     struct p2m_domain *p2m;
 
+    /* nestedhvm: translate l2 guest physical to host physical */
+    struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+    spinlock_t nested_p2m_lock;
+    int nested_p2m_locker;
+    const char *nested_p2m_function;
+
     /* NB. protected by d->event_lock and by irq_desc[irq].lock */
     int *irq_pirq;
     int *pirq_irq;
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h     Tue Apr 05 15:44:09 2011 +0200
@@ -374,12 +374,12 @@
 
 int hvm_debug_op(struct vcpu *v, int32_t op);
 
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
-                                 bool_t gla_valid, unsigned long gla,
-                                 bool_t access_valid, 
-                                 bool_t access_r,
-                                 bool_t access_w,
-                                 bool_t access_x);
+int hvm_hap_nested_page_fault(unsigned long gpa,
+                              bool_t gla_valid, unsigned long gla,
+                              bool_t access_valid, 
+                              bool_t access_r,
+                              bool_t access_w,
+                              bool_t access_x);
 
 #define hvm_msr_tsc_aux(v) ({                                               \
     struct domain *__d = (v)->domain;                                       \
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h       Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/hvm/nestedhvm.h       Tue Apr 05 15:44:09 2011 +0200
@@ -60,4 +60,9 @@
 #define nestedhvm_vmswitch_in_progress(v)   \
     (!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
 
+void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
+void nestedhvm_vmcx_flushtlbdomain(struct domain *d);
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
+
 #endif /* _HVM_NESTEDHVM_H */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/p2m.h Tue Apr 05 15:44:09 2011 +0200
@@ -199,7 +199,15 @@
     /* Shadow translated domain: p2m mapping */
     pagetable_t        phys_table;
 
+    /* Same as domain_dirty_cpumask but limited to
+     * this p2m and those physical cpus whose vcpus are in
+     * guest mode.
+     */
+    cpumask_t          p2m_dirty_cpumask;
+
     struct domain     *domain;   /* back pointer to domain */
+#define CR3_EADDR     (~0ULL)
+    uint64_t           cr3;      /* to identify this p2m for re-use */
 
     /* Pages used to construct the p2m */
     struct page_list_head pages;
@@ -223,6 +231,11 @@
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
     
+    void               (*write_p2m_entry)(struct p2m_domain *p2m,
+                                          unsigned long gfn, l1_pgentry_t *p,
+                                          mfn_t table_mfn, l1_pgentry_t new,
+                                          unsigned int level);
+
     /* Default P2M access type for each page in the domain: new pages,
      * swapped in pages, cleared pages, and pages that are ambiguously
      * retyped get this access type.  See definition of p2m_access_t. */
@@ -264,8 +277,26 @@
 /* get host p2m table */
 #define p2m_get_hostp2m(d)      ((d)->arch.p2m)
 
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m)   ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
 #define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
 
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
 /*
  * The P2M lock.  This protects all updates to the p2m table.
  * Updates are expected to be safe against concurrent reads,
@@ -307,6 +338,38 @@
     (current->processor == (_p2m)->locker)
 
 
+#define nestedp2m_lock_init(_domain)                                  \
+    do {                                                              \
+        spin_lock_init(&(_domain)->arch.nested_p2m_lock);             \
+        (_domain)->arch.nested_p2m_locker = -1;                       \
+        (_domain)->arch.nested_p2m_function = "nobody";               \
+    } while (0)
+
+#define nestedp2m_locked_by_me(_domain)                \
+    (current->processor == (_domain)->arch.nested_p2m_locker)
+
+#define nestedp2m_lock(_domain)                                       \
+    do {                                                              \
+        if ( nestedp2m_locked_by_me(_domain) )                        \
+        {                                                             \
+            printk("Error: p2m lock held by %s\n",                    \
+                   (_domain)->arch.nested_p2m_function);              \
+            BUG();                                                    \
+        }                                                             \
+        spin_lock(&(_domain)->arch.nested_p2m_lock);                  \
+        ASSERT((_domain)->arch.nested_p2m_locker == -1);              \
+        (_domain)->arch.nested_p2m_locker = current->processor;       \
+        (_domain)->arch.nested_p2m_function = __func__;               \
+    } while (0)
+
+#define nestedp2m_unlock(_domain)                                      \
+    do {                                                               \
+        ASSERT(nestedp2m_locked_by_me(_domain));                       \
+        (_domain)->arch.nested_p2m_locker = -1;                        \
+        (_domain)->arch.nested_p2m_function = "nobody";                \
+        spin_unlock(&(_domain)->arch.nested_p2m_lock);                 \
+    } while (0)
+
 /* Extract the type from the PTE flags that store it */
 static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
 {
@@ -424,11 +487,21 @@
 /* Init the datastructures for later use by the p2m code */
 int p2m_init(struct domain *d);
 
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
 /* Allocate a new p2m table for a domain. 
  *
  * Returns 0 for success or -errno. */
 int p2m_alloc_table(struct p2m_domain *p2m);
 
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+               unsigned long gfn, uint32_t shift, uint32_t max);
+
 /* Return all the p2m resources to Xen. */
 void p2m_teardown(struct p2m_domain *p2m);
 void p2m_final_teardown(struct domain *d);
@@ -502,6 +575,8 @@
 int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
 
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
 
 #ifdef __x86_64__
 /* Modify p2m table for shared gfn */
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/page.h        Tue Apr 05 15:44:09 2011 +0200
@@ -391,6 +391,23 @@
     return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3);
 }
 
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones?  Returns true if so. */
+static inline bool_t
+perms_strictly_increased(uint32_t old_flags, uint32_t new_flags)
+{
+    uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+    uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+    /* Flip the NX bit, since it's the only one that decreases rights;
+     * we calculate as if it were an "X" bit. */
+    of ^= _PAGE_NX_BIT;
+    nf ^= _PAGE_NX_BIT;
+    /* If the changed bits are all set in the new flags, then rights strictly
+     * increased between old and new. */
+    return ((of | (of ^ nf)) == nf);
+}
+
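Once NX has been flipped into an "X" bit, the final test is exactly "every
right present in the old flags is also present in the new ones", i.e. of is
a bit-subset of nf.  A self-contained demo (flag values defined locally so
it compiles outside Xen; placing NX at bit 23 mirrors Xen's folded flag
representation, but treat that position as an assumption):

    #include <assert.h>
    #include <stdint.h>

    #define _PAGE_PRESENT  0x001U
    #define _PAGE_RW       0x002U
    #define _PAGE_USER     0x004U
    #define _PAGE_NX_BIT   (1U << 23)   /* assumed position for this demo */

    static int perms_strictly_increased(uint32_t old_flags, uint32_t new_flags)
    {
        uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
        uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
        /* Flip NX so that, like the other bits, "set" means "more rights". */
        of ^= _PAGE_NX_BIT;
        nf ^= _PAGE_NX_BIT;
        return (of | (of ^ nf)) == nf;
    }

    int main(void)
    {
        /* Adding RW to a present user page: rights increased. */
        assert(perms_strictly_increased(_PAGE_PRESENT|_PAGE_USER,
                                        _PAGE_PRESENT|_PAGE_USER|_PAGE_RW));
        /* Dropping RW: not an increase. */
        assert(!perms_strictly_increased(_PAGE_PRESENT|_PAGE_RW, _PAGE_PRESENT));
        /* Setting NX removes execute rights: not an increase. */
        assert(!perms_strictly_increased(_PAGE_PRESENT,
                                         _PAGE_PRESENT|_PAGE_NX_BIT));
        return 0;
    }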
 #endif /* !__ASSEMBLY__ */
 
 #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)
diff -r 9c3fbfa7d0d5 -r 7714b42e72fa xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Wed Mar 09 12:36:23 2011 +0100
+++ b/xen/include/asm-x86/paging.h      Tue Apr 05 15:44:09 2011 +0200
@@ -108,8 +108,14 @@
     int           (*page_fault            )(struct vcpu *v, unsigned long va,
                                             struct cpu_user_regs *regs);
     int           (*invlpg                )(struct vcpu *v, unsigned long va);
-    unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va,
+    unsigned long (*gva_to_gfn            )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long va,
                                             uint32_t *pfec);
+    unsigned long (*p2m_ga_to_gfn         )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long cr3,
+                                            paddr_t ga, uint32_t *pfec);
     void          (*update_cr3            )(struct vcpu *v, int do_locking);
     void          (*update_paging_modes   )(struct vcpu *v);
     void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@
  * creation. */
 int paging_enable(struct domain *d, u32 mode);
 
+#define paging_get_hostmode(v)         ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v)       ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
 
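Presumably paging_get_mode() picks between the two accessors according to
whether the vcpu is currently executing the l2 guest; a hedged sketch of
that dispatch (the real body is on the .c side of the patch):

    const struct paging_mode *paging_get_mode(struct vcpu *v)
    {
        if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
            return paging_get_nestedmode(v);
        return paging_get_hostmode(v);
    }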
 /* Page fault handler
  * Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@
 paging_fault(unsigned long va, struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    return v->arch.paging.mode->page_fault(v, va, regs);
+    return paging_get_hostmode(v)->page_fault(v, va, regs);
 }
 
 /* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@
  * or 0 if it's safe not to do so. */
 static inline int paging_invlpg(struct vcpu *v, unsigned long va)
 {
-    return v->arch.paging.mode->invlpg(v, va);
+    return paging_get_hostmode(v)->invlpg(v, va);
 }
 
 /* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@
  * walking the tables.  The caller should set the PFEC_page_present bit
  * in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
 #define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v, 
-                                              unsigned long va,
-                                              uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+                                unsigned long va,
+                                uint32_t *pfec);
+
+/* Translate a guest virtual address to a guest physical address, where
+ * the specified cr3 is itself translated to a host physical address
+ * using the specified p2m table.
+ * This allows page walks in the guest, or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so that it won't overflow when
+ * the guest or nested guest is in 32-bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+                                                 struct p2m_domain *p2m,
+                                                 const struct paging_mode *mode,
+                                                 unsigned long cr3,
+                                                 paddr_t ga,
+                                                 uint32_t *pfec)
 {
-    return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+    if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+        return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+    /* shadow paging */
+    return paging_gva_to_gfn(v, ga, pfec);
 }
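A hedged caller sketch: translating a nested-guest physical address 'l2_gpa'
through the l1 guest's nested page tables.  'nested_p2m' and 'nested_cr3'
are illustrative names for values the caller obtains elsewhere (e.g. via
p2m_get_nestedp2m()), and which paging mode is the right one depends on the
translation level being walked:

    uint32_t pfec = PFEC_page_present;
    const struct paging_mode *mode = paging_get_hostmode(v); /* l1's mode */
    unsigned long gfn = paging_p2m_ga_to_gfn(v, nested_p2m, mode,
                                             nested_cr3, l2_gpa, &pfec);
    if ( gfn == INVALID_GFN )
        /* Walk failed; 'pfec' now describes the fault to inject into l1. */
        return NESTEDHVM_PAGEFAULT_INJECT;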
 
 /* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@
  * as the value to load into the host CR3 to schedule this vcpu */
 static inline void paging_update_cr3(struct vcpu *v)
 {
-    v->arch.paging.mode->update_cr3(v, 1);
+    paging_get_hostmode(v)->update_cr3(v, 1);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@
  * has changed, and when bringing up a VCPU for the first time. */
 static inline void paging_update_paging_modes(struct vcpu *v)
 {
-    v->arch.paging.mode->update_paging_modes(v);
+    paging_get_hostmode(v)->update_paging_modes(v);
 }
 
 
@@ -283,7 +312,7 @@
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+        return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
     else 
         return (!__copy_to_user(p, &new, sizeof(new)));
 }
@@ -299,7 +328,7 @@
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+        return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
     else 
         return (!cmpxchg_user(p, *old, new));
 }
@@ -327,21 +356,11 @@
  * a pointer to the entry to be written, the MFN in which the entry resides, 
  * the new contents of the entry, and the level in the p2m tree at which 
  * we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn, 
-                                          l1_pgentry_t *p, mfn_t table_mfn,
-                                          l1_pgentry_t new, unsigned int level)
-{
-    struct vcpu *v = current;
-    if ( v->domain != d )
-        v = d->vcpu ? d->vcpu[0] : NULL;
-    if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
-    {
-        return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
-                                                    new, level);
-    }
-    else 
-        safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, 
+                            l1_pgentry_t *p, mfn_t table_mfn,
+                            l1_pgentry_t new, unsigned int level);
 
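The previously inline body moves out of line so that it can take a
p2m_domain pointer and route nested tables separately.  A hypothetical
sketch of the new body, reusing the old vcpu-selection logic together with
the p2m_is_nestedp2m()/nestedp2m_write_p2m_entry() additions from this patch
(illustrative only; the committed body may differ):

    void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
                                l1_pgentry_t *p, mfn_t table_mfn,
                                l1_pgentry_t new, unsigned int level)
    {
        struct domain *d = p2m->domain;
        struct vcpu *v = current;
        if ( v->domain != d )
            v = d->vcpu ? d->vcpu[0] : NULL;
        if ( unlikely(!v || !paging_mode_enabled(d)
                      || v->arch.paging.mode == NULL) )
            safe_write_pte(p, new);
        else if ( p2m_is_nestedp2m(p2m) )
            nestedp2m_write_p2m_entry(p2m, gfn, p, table_mfn, new, level);
        else
            paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
                                                    new, level);
    }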
 /* Called from the guest to indicate that a process is being
  * torn down and its pagetables will soon be discarded */
@@ -362,7 +381,7 @@
     l2_pgentry_t l2e;
 
     if ( unlikely(paging_mode_translate(v->domain)) )
-        return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+        return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
 
     /* Find this l1e and its enclosing l1mfn in the linear map */
     if ( __copy_from_user(&l2e, 
@@ -398,7 +417,7 @@
         return;
     }
         
-    v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+    paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
 }
 
 /* Read the guest's l1e that maps this address, from the kernel-mode

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog